From 91cf42ddf9c05c1b92a590e8a0d0214969df6347 Mon Sep 17 00:00:00 2001 From: kronwerk Date: Tue, 10 Dec 2024 10:57:35 +0300 Subject: [PATCH 001/101] added aof-max-size parameter with tests; fixes #540 Signed-off-by: kronwerk improved aof-max-size tests Signed-off-by: kronwerk --- src/aof.c | 16 +++++++--- src/config.c | 1 + src/server.c | 15 +++++++++ src/server.h | 1 + tests/unit/aof-max-size.tcl | 61 +++++++++++++++++++++++++++++++++++++ valkey.conf | 3 ++ 6 files changed, 92 insertions(+), 5 deletions(-) create mode 100644 tests/unit/aof-max-size.tcl diff --git a/src/aof.c b/src/aof.c index 0fd3cf5c26..8af3a9928f 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1010,16 +1010,22 @@ int startAppendOnly(void) { * the first call is short, there is a end-of-space condition, so the next * is likely to fail. However apparently in modern systems this is no longer * true, and in general it looks just more resilient to retry the write. If - * there is an actual error condition we'll get it at the next try. */ -ssize_t aofWrite(int fd, const char *buf, size_t len) { - ssize_t nwritten = 0, totwritten = 0; + * there is an actual error condition we'll get it at the next try. + * We also check for aof-max-size limit here returning "no space" on exceed. */ +ssize_t aofWrite(int fd, const char *buf, size_t len, off_t aof_current_size, unsigned long long aof_max_size) { + ssize_t nwritten = 0, totwritten = 0, nonewritten = -1; + + if (aof_max_size && (unsigned long long)aof_current_size >= aof_max_size) { + errno = ENOSPC; + return nonewritten; + } while (len) { nwritten = write(fd, buf, len); if (nwritten < 0) { if (errno == EINTR) continue; - return totwritten ? totwritten : -1; + return totwritten ? totwritten : nonewritten; } len -= nwritten; @@ -1119,7 +1125,7 @@ void flushAppendOnlyFile(int force) { } latencyStartMonitor(latency); - nwritten = aofWrite(server.aof_fd, server.aof_buf, sdslen(server.aof_buf)); + nwritten = aofWrite(server.aof_fd, server.aof_buf, sdslen(server.aof_buf), server.aof_current_size, server.aof_max_size); latencyEndMonitor(latency); /* We want to capture different events for delayed writes: * when the delay happens with a pending fsync, or with a saving child diff --git a/src/config.c b/src/config.c index cc0f8d2dd8..bcfa465e1f 100644 --- a/src/config.c +++ b/src/config.c @@ -3337,6 +3337,7 @@ standardConfig static_configs[] = { /* Unsigned Long Long configs */ createULongLongConfig("maxmemory", NULL, MODIFIABLE_CONFIG, 0, ULLONG_MAX, server.maxmemory, 0, MEMORY_CONFIG, NULL, updateMaxmemory), createULongLongConfig("cluster-link-sendbuf-limit", NULL, MODIFIABLE_CONFIG, 0, ULLONG_MAX, server.cluster_link_msg_queue_limit_bytes, 0, MEMORY_CONFIG, NULL, NULL), + createULongLongConfig("aof-max-size", NULL, MODIFIABLE_CONFIG, 0, ULLONG_MAX, server.aof_max_size, 0, INTEGER_CONFIG, NULL, NULL), /* Size_t configs */ createSizeTConfig("hash-max-listpack-entries", "hash-max-ziplist-entries", MODIFIABLE_CONFIG, 0, LONG_MAX, server.hash_max_listpack_entries, 512, INTEGER_CONFIG, NULL, NULL), diff --git a/src/server.c b/src/server.c index 1e38b5ac69..518ecad603 100644 --- a/src/server.c +++ b/src/server.c @@ -5800,10 +5800,17 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "module_fork_last_cow_size:%zu\r\n", server.stat_module_cow_bytes)); if (server.aof_enabled) { + char aof_current_size_hdsk[64]; + char aof_max_size_hdsk[64]; + bytesToHuman(aof_current_size_hdsk, sizeof(aof_current_size_hdsk), (unsigned long long)server.aof_current_size); + 
bytesToHuman(aof_max_size_hdsk, sizeof(aof_max_size_hdsk), server.aof_max_size); info = sdscatprintf( info, FMTARGS( "aof_current_size:%lld\r\n", (long long)server.aof_current_size, + "aof_current_size_human:%s\r\n", aof_current_size_hdsk, + "aof_max_size:%llu\r\n", server.aof_max_size, + "aof_max_size_human:%s\r\n", aof_max_size_hdsk, "aof_base_size:%lld\r\n", (long long)server.aof_rewrite_base_size, "aof_pending_rewrite:%d\r\n", server.aof_rewrite_scheduled, "aof_buffer_length:%zu\r\n", sdslen(server.aof_buf), @@ -7130,6 +7137,14 @@ __attribute__((weak)) int main(int argc, char **argv) { server.maxmemory); } + /* Warn the user about a suspicious aof-max-size setting. */ + if (server.aof_max_size > 0 && server.aof_max_size < 1024 * 1024) { + serverLog(LL_WARNING, + "WARNING: You specified an aof-max-size value that is less than 1MB (current value is %llu bytes). Are " + "you sure this is what you really want?", + server.aof_max_size); + } + serverSetCpuAffinity(server.server_cpulist); setOOMScoreAdj(-1); diff --git a/src/server.h b/src/server.h index 14a16593b0..3ba7a61b7d 100644 --- a/src/server.h +++ b/src/server.h @@ -1939,6 +1939,7 @@ struct valkeyServer { off_t aof_rewrite_min_size; /* the AOF file is at least N bytes. */ off_t aof_rewrite_base_size; /* AOF size on latest startup or rewrite. */ off_t aof_current_size; /* AOF current size (Including BASE + INCRs). */ + unsigned long long aof_max_size; /* Max number of disk bytes to use for AOF */ off_t aof_last_incr_size; /* The size of the latest incr AOF. */ off_t aof_last_incr_fsync_offset; /* AOF offset which is already requested to be synced to disk. * Compare with the aof_last_incr_size. */ diff --git a/tests/unit/aof-max-size.tcl b/tests/unit/aof-max-size.tcl new file mode 100644 index 0000000000..4c35220a77 --- /dev/null +++ b/tests/unit/aof-max-size.tcl @@ -0,0 +1,61 @@ +start_server {tags {"aof-max-size" "external:skip"}} { + r config set auto-aof-rewrite-percentage 0 ; # disable auto-rewrite + r config set appendonly yes ; # enable AOF + + set master [srv 0 client] + set master_host [srv 0 host] + set master_port [srv 0 port] + + test "Low aof-max-size stops writing AOF with ENOSPC" { + r set k v + r config set aof-max-size 1 + + r set k2 v2 + wait_for_log_messages 0 {"*Error writing to the AOF file: No space left on device*"} 0 100 10 + } + + test "New write attempts fail and don't increase the AOF buffer anymore" { + set info1 [r info] + set buf1 [getInfoProperty $info1 mem_aof_buffer] + set len1 [getInfoProperty $info1 aof_buffer_length] + + catch {r set somelongerkey somelongervalue} err + assert {$err eq "MISCONF Errors writing to the AOF file: No space left on device"} + assert_equal [r get somelongerkey] "" + + set info2 [r info] + set buf2 [getInfoProperty $info2 mem_aof_buffer] + set len2 [getInfoProperty $info2 aof_buffer_length] + assert_equal $buf1 $buf2 + assert_equal $len1 $len2 + } + + test "Increasing aof-max-size fixes AOF write error" { + r config set aof-max-size 1000 + wait_for_log_messages 0 {"*AOF write error looks solved. The server can write again.*"} 0 100 10 + + assert_equal [r set k3 v3] "OK" + assert_equal [r get k3] "v3" + } + + test "Meeting aof-max-size does not prevent AOF rewrite" { + set loglines [count_log_lines 0] ; # want to check new line, not from previous test + + # start write load + set load_handle0 [start_write_load $master_host $master_port 10] + wait_for_condition 50 100 { + [r dbsize] > 0 + } else { + fail "No write load detected."
+ } + + waitForBgrewriteaof r + r bgrewriteaof + wait_for_log_messages 0 {"*Background AOF rewrite finished successfully*"} $loglines 100 10 + wait_for_log_messages 0 {"*AOF write error looks solved. The server can write again.*"} $loglines 100 10 + + # stop write load + stop_write_load $load_handle0 + wait_load_handlers_disconnected + } +} \ No newline at end of file diff --git a/valkey.conf b/valkey.conf index e23aea39de..8ea5273045 100644 --- a/valkey.conf +++ b/valkey.conf @@ -1653,6 +1653,9 @@ aof-use-rdb-preamble yes # the AOF format in a way that may not be compatible with existing AOF parsers. aof-timestamp-enabled no +# Maximum size for AOF files on disk in bytes. Ignored if set to 0. +aof-max-size 0 + ################################ SHUTDOWN ##################################### # Maximum time to wait for replicas when shutting down, in seconds. From b57409c7b44f31ce230af4f1020aaa46f84f4909 Mon Sep 17 00:00:00 2001 From: kronwerk Date: Wed, 11 Dec 2024 17:46:46 +0300 Subject: [PATCH 002/101] tuned tests Signed-off-by: kronwerk --- tests/unit/aof-max-size.tcl | 45 ++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/tests/unit/aof-max-size.tcl b/tests/unit/aof-max-size.tcl index 4c35220a77..e5526d819f 100644 --- a/tests/unit/aof-max-size.tcl +++ b/tests/unit/aof-max-size.tcl @@ -1,25 +1,34 @@ -start_server {tags {"aof-max-size" "external:skip"}} { +proc setup {{size 1}} { + r set k v + r config set aof-max-size $size + r set k2 v2 +} + +proc cleanup {} { + r config set aof-max-size 0 + r flushall +} + +start_server {tags {"external:skip"}} { r config set auto-aof-rewrite-percentage 0 ; # disable auto-rewrite r config set appendonly yes ; # enable AOF - set master [srv 0 client] set master_host [srv 0 host] set master_port [srv 0 port] test "Low aof-max-size stops writing AOF with ENOSPC" { - r set k v - r config set aof-max-size 1 - - r set k2 v2 + setup wait_for_log_messages 0 {"*Error writing to the AOF file: No space left on device*"} 0 100 10 + cleanup } - test "New write attempts fail and don't increase the AOF buffer anymore" { + test "New write attempts when limited by aof-max-size fail and don't increase the AOF buffer anymore" { + setup set info1 [r info] set buf1 [getInfoProperty $info1 mem_aof_buffer] set len1 [getInfoProperty $info1 aof_buffer_length] - catch {r set somelongerkey somelongervalue} err + catch {r set somelongerkey somelongrvalue} err assert {$err eq "MISCONF Errors writing to the AOF file: No space left on device"} assert_equal [r get somelongerkey] "" @@ -28,34 +37,28 @@ start_server {tags {"aof-max-size" "external:skip"}} { set info2 [r info] set buf2 [getInfoProperty $info2 mem_aof_buffer] set len2 [getInfoProperty $info2 aof_buffer_length] assert_equal $buf1 $buf2 assert_equal $len1 $len2 + cleanup } test "Increasing aof-max-size fixes AOF write error" { + setup + set loglines [count_log_lines 0] ; # want to check new line, not from previous test r config set aof-max-size 1000 - wait_for_log_messages 0 {"*AOF write error looks solved. The server can write again.*"} 0 100 10 + wait_for_log_messages 0 {"*AOF write error looks solved. 
The server can write again.*"} $loglines 100 10 assert_equal [r set k3 v3] "OK" assert_equal [r get k3] "v3" + cleanup } test "Meeting aof-max-size does not prevent AOF rewrite" { + setup 200 set loglines [count_log_lines 0] ; # want to check new line, not from previous test - - # start write load - set load_handle0 [start_write_load $master_host $master_port 10] - wait_for_condition 50 100 { - [r dbsize] > 0 - } else { - fail "No write load detected." - } waitForBgrewriteaof r r bgrewriteaof wait_for_log_messages 0 {"*Background AOF rewrite finished successfully*"} $loglines 100 10 wait_for_log_messages 0 {"*AOF write error looks solved. The server can write again.*"} $loglines 100 10 - - # stop write load - stop_write_load $load_handle0 - wait_load_handlers_disconnected + cleanup } } \ No newline at end of file From 089015b50a2f6a5578f59905560f22b0dcd09eb7 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Wed, 11 Dec 2024 09:47:06 -0800 Subject: [PATCH 003/101] defrag: allow defrag to start during AOF loading (#1420) Addresses https://github.com/valkey-io/valkey/issues/1393 Changes: * During AOF loading or long running script, this allows defrag to be initiated. * The AOF defrag test was corrected to eliminate the wait period and rely on non-timer invocations. * Logic for "overage" time in defrag was changed. It previously accumulated underage leading to large latencies in extreme tests having very high CPU percentage. After several simple stages were completed during infrequent blocked processing, a large cycle time would be experienced. Signed-off-by: Jim Brunner --- src/defrag.c | 14 ++++++++++---- tests/unit/memefficiency.tcl | 6 +++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index 057fdd50de..2fa067f0dc 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -84,7 +84,7 @@ struct DefragContext { long long timeproc_id; // Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID) monotime timeproc_end_time; // Ending time of previous timerproc execution - long timeproc_overage_us; // A correction value if over/under target CPU percent + long timeproc_overage_us; // A correction value if over target CPU percent }; static struct DefragContext defrag; @@ -1157,7 +1157,7 @@ static int computeDefragCycleUs(void) { * the starvation of the timer. */ dutyCycleUs = targetCpuPercent * waitedUs / (100 - targetCpuPercent); - // Also adjust for any accumulated overage(underage). + // Also adjust for any accumulated overage. dutyCycleUs -= defrag.timeproc_overage_us; defrag.timeproc_overage_us = 0; @@ -1176,8 +1176,11 @@ static int computeDefragCycleUs(void) { * computeDefragCycleUs computation. */ static int computeDelayMs(monotime intendedEndtime) { defrag.timeproc_end_time = getMonotonicUs(); - int overage = defrag.timeproc_end_time - intendedEndtime; + long overage = defrag.timeproc_end_time - intendedEndtime; defrag.timeproc_overage_us += overage; // track over/under desired CPU + /* Allow negative overage (underage) to count against existing overage, but don't allow + * underage (from short stages) to be accumulated. 
*/ + if (defrag.timeproc_overage_us < 0) defrag.timeproc_overage_us = 0; int targetCpuPercent = server.active_defrag_cpu_percent; serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100); @@ -1189,7 +1192,7 @@ static int computeDelayMs(monotime intendedEndtime) { long totalCycleTimeUs = server.active_defrag_cycle_us * 100 / targetCpuPercent; long delayUs = totalCycleTimeUs - server.active_defrag_cycle_us; // Only increase delay by the fraction of the overage that would be non-duty-cycle - delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100; // "overage" might be negative + delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100; if (delayUs < 0) delayUs = 0; long delayMs = delayUs / 1000; // round down return delayMs; @@ -1254,6 +1257,9 @@ static long long activeDefragTimeProc(struct aeEventLoop *eventLoop, long long i * actions. This interface allows defrag to continue running, avoiding a single long defrag step * after the long operation completes. */ void defragWhileBlocked(void) { + // This is called infrequently, while timers are not active. We might need to start defrag. + if (!defragIsRunning()) monitorActiveDefrag(); + if (!defragIsRunning()) return; // Save off the timeproc_id. If we have a normal termination, it will be cleared. diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index abd23b1d83..ce74b7c618 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -138,8 +138,12 @@ run_solo {defrag} { # reset stats and load the AOF file r config resetstat r config set key-load-delay -25 ;# sleep on average 1/25 usec + # Note: This test is checking if defrag is working DURING AOF loading (while + # timers are not active). So we don't give any extra time, and we deactivate + # defrag immediately after the AOF loading is complete. During loading, + # defrag will get invoked less often, triggering starvation prevention. We + # should expect longer latency measurements. r debug loadaof - after 1000 ;# give defrag a chance to work before turning it off r config set activedefrag no # measure hits and misses right after aof loading From 2a2900fd3ff4b7ac11a70772a5dad11eedf75aa0 Mon Sep 17 00:00:00 2001 From: Pierre <105686771+pieturin@users.noreply.github.com> Date: Wed, 11 Dec 2024 17:26:06 -0800 Subject: [PATCH 004/101] Send MEET packet to node if there is no inbound link to fix inconsistency when handshake timed out (#1307) In some cases, when meeting a new node, if the handshake times out, we can end up with an inconsistent view of the cluster where the new node knows about all the nodes in the cluster, but the cluster does not know about this new node (or vice versa). To detect this inconsistency, we now check if a node has an outbound link but no inbound link, which probably means this node does not know us. In this case we (re-)send a MEET packet to this node to do a new handshake with it. If we receive a MEET packet from a known node, we disconnect the outbound link to force a reconnect and the sending of a PING packet so that the other node recognizes the link as belonging to us. This prevents cases where a node could send MEET packets in a loop because it thinks the other node does not have an inbound link. This fixes the bug described in #1251.
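For reference, the detection rule this patch adds to the cron can be summarized as a standalone predicate. The sketch below is illustrative only: `nodeView` is a hypothetical stand-in for the handful of `clusterNode` fields the real check consults, not a type in the tree.

```c
#include <stdbool.h>
#include <stddef.h>

typedef long long mstime_t;

/* Hypothetical stand-in for the clusterNode fields this check consults. */
typedef struct {
    void *link;                       /* outbound link, NULL if disconnected */
    void *inbound_link;               /* inbound link, NULL if none */
    mstime_t inbound_link_freed_time; /* last time the inbound link was freed */
    bool in_normal_state;             /* no HANDSHAKE/MEET/PFAIL/FAIL flags set */
} nodeView;

/* We can reach the node (outbound link up) and it is in a settled state, yet
 * it has had no inbound link to us for longer than the handshake timeout, so
 * it probably does not know us and deserves a fresh MEET. */
static bool should_resend_meet(const nodeView *n, mstime_t now, mstime_t handshake_timeout) {
    return n->link != NULL && n->inbound_link == NULL && n->in_normal_state &&
           now - n->inbound_link_freed_time > handshake_timeout;
}
```

Requiring the quiet period to exceed the handshake timeout avoids re-MEETing a node whose handshake with us may still legitimately be in flight.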
--------- Signed-off-by: Pierre Turin --- src/cluster_legacy.c | 99 ++++++--- src/cluster_legacy.h | 4 + tests/support/cluster_util.tcl | 9 + tests/unit/cluster/cluster-multiple-meets.tcl | 4 +- tests/unit/cluster/cluster-reliable-meet.tcl | 208 +++++++++++++++++- 5 files changed, 291 insertions(+), 33 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index a273fe0d86..d1c6dd0094 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1336,6 +1336,10 @@ clusterLink *createClusterLink(clusterNode *node) { * with this link will have the 'link' field set to NULL. */ void freeClusterLink(clusterLink *link) { serverAssert(link != NULL); + serverLog(LL_DEBUG, "Freeing cluster link for node: %.40s:%s", + link->node ? link->node->name : "", + link->inbound ? "inbound" : "outbound"); + if (link->conn) { connClose(link->conn); link->conn = NULL; @@ -1351,6 +1355,7 @@ void freeClusterLink(clusterLink *link) { } else if (link->node->inbound_link == link) { serverAssert(link->inbound); link->node->inbound_link = NULL; + link->node->inbound_link_freed_time = mstime(); } } zfree(link); @@ -1490,6 +1495,7 @@ clusterNode *createClusterNode(char *nodename, int flags) { node->fail_time = 0; node->link = NULL; node->inbound_link = NULL; + node->inbound_link_freed_time = node->ctime; memset(node->ip, 0, sizeof(node->ip)); node->announce_client_ipv4 = sdsempty(); node->announce_client_ipv6 = sdsempty(); @@ -1696,6 +1702,9 @@ void clusterAddNode(clusterNode *node) { * it is a replica node. */ void clusterDelNode(clusterNode *delnode) { + serverAssert(delnode != NULL); + serverLog(LL_DEBUG, "Deleting node %.40s from cluster view", delnode->name); + int j; dictIterator *di; dictEntry *de; @@ -2078,7 +2087,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) { /* Return 1 if we already have a node in HANDSHAKE state matching the * specified ip address and port number. This function is used in order to * avoid adding a new handshake node for the same address multiple times. */ -int clusterHandshakeInProgress(char *ip, int port, int cport) { +static int clusterHandshakeInProgress(char *ip, int port, int cport) { dictIterator *di; dictEntry *de; @@ -2100,7 +2109,7 @@ int clusterHandshakeInProgress(char *ip, int port, int cport) { * * EAGAIN - There is already a handshake in progress for this address. * EINVAL - IP or port are not valid. */ -int clusterStartHandshake(char *ip, int port, int cport) { +static int clusterStartHandshake(char *ip, int port, int cport) { clusterNode *n; char norm_ip[NET_IP_STR_LEN]; struct sockaddr_storage sa; @@ -3207,33 +3216,48 @@ int clusterProcessPacket(clusterLink *link) { } } - /* Add this node if it is new for us and the msg type is MEET. - * In this stage we don't try to add the node with the right - * flags, replicaof pointer, and so forth, as this details will be - * resolved when we'll receive PONGs from the node. The exception - * to this is the flag that indicates extensions are supported, as - * we want to send extensions right away in the return PONG in order - * to reduce the amount of time needed to stabilize the shard ID. 
*/ - if (!sender && type == CLUSTERMSG_TYPE_MEET) { - clusterNode *node; - - node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); - serverAssert(nodeIp2String(node->ip, link, hdr->myip) == C_OK); - getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); - node->cport = ntohs(hdr->cport); - if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { - node->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED; + if (type == CLUSTERMSG_TYPE_MEET) { + if (!sender) { + /* Add this node if it is new for us and the msg type is MEET. + * In this stage we don't try to add the node with the right + * flags, replicaof pointer, and so forth, as this details will be + * resolved when we'll receive PONGs from the node. The exception + * to this is the flag that indicates extensions are supported, as + * we want to send extensions right away in the return PONG in order + * to reduce the amount of time needed to stabilize the shard ID. */ + clusterNode *node; + + node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); + serverAssert(nodeIp2String(node->ip, link, hdr->myip) == C_OK); + getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); + node->cport = ntohs(hdr->cport); + if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { + node->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED; + } + setClusterNodeToInboundClusterLink(node, link); + clusterAddNode(node); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + + /* If this is a MEET packet from an unknown node, we still process + * the gossip section here since we have to trust the sender because + * of the message type. */ + clusterProcessGossipSection(hdr, link); + } else if (sender->link && now - sender->ctime > server.cluster_node_timeout) { + /* The MEET packet is from a known node, after the handshake timeout, so the sender thinks that I do not + * know it. + * Freeing my outbound link to that node, to force a reconnect and sending a PING. + * Once that node receives our PING, it should recognize the new connection as an inbound link from me. + * We should only free the outbound link if the node is known for more time than the handshake timeout, + * since during this time, the other side might still be trying to complete the handshake. */ + + /* We should always receive a MEET packet on an inbound link. */ + serverAssert(link != sender->link); + serverLog(LL_NOTICE, "Freeing outbound link to node %.40s after receiving a MEET packet from this known node", + sender->name); + freeClusterLink(sender->link); } - setClusterNodeToInboundClusterLink(node, link); - clusterAddNode(node); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); } - /* If this is a MEET packet from an unknown node, we still process - * the gossip section here since we have to trust the sender because - * of the message type. */ - if (!sender && type == CLUSTERMSG_TYPE_MEET) clusterProcessGossipSection(hdr, link); - /* Anyway reply with a PONG */ clusterSendPing(link, CLUSTERMSG_TYPE_PONG); } @@ -3243,7 +3267,7 @@ int clusterProcessPacket(clusterLink *link) { serverLog(LL_DEBUG, "%s packet received: %.40s", clusterGetMessageTypeString(type), link->node ? link->node->name : "NULL"); - if (sender && (sender->flags & CLUSTER_NODE_MEET)) { + if (sender && nodeInMeetState(sender)) { /* Once we get a response for MEET from the sender, we can stop sending more MEET. 
*/ sender->flags &= ~CLUSTER_NODE_MEET; serverLog(LL_NOTICE, "Successfully completed handshake with %.40s (%s)", sender->name, @@ -3668,7 +3692,7 @@ void clusterLinkConnectHandler(connection *conn) { * of a PING one, to force the receiver to add us in its node * table. */ mstime_t old_ping_sent = node->ping_sent; - clusterSendPing(link, node->flags & CLUSTER_NODE_MEET ? CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); + clusterSendPing(link, nodeInMeetState(node) ? CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); if (old_ping_sent) { /* If there was an active ping before the link was * disconnected, we want to restore the ping time, otherwise @@ -3747,7 +3771,9 @@ void clusterReadHandler(connection *conn) { if (nread <= 0) { /* I/O error... */ - serverLog(LL_DEBUG, "I/O error reading from node link: %s", + serverLog(LL_DEBUG, "I/O error reading from node link (%.40s:%s): %s", + link->node ? link->node->name : "", + link->inbound ? "inbound" : "outbound", (nread == 0) ? "connection closed" : connGetLastError(conn)); handleLinkIOError(link); return; @@ -3928,6 +3954,12 @@ void clusterSetGossipEntry(clusterMsg *hdr, int i, clusterNode *n) { /* Send a PING or PONG packet to the specified node, making sure to add enough * gossip information. */ void clusterSendPing(clusterLink *link, int type) { + serverLog(LL_DEBUG, "Sending %s packet to node %.40s (%s) on %s link", + clusterGetMessageTypeString(type), + link->node ? link->node->name : "", + link->node ? link->node->human_nodename : "", + link->inbound ? "inbound" : "outbound"); + static unsigned long long cluster_pings_sent = 0; cluster_pings_sent++; int gossipcount = 0; /* Number of gossip sections added so far. */ @@ -4943,6 +4975,15 @@ static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_ clusterDelNode(node); return 1; } + if (node->link != NULL && node->inbound_link == NULL && nodeInNormalState(node) && + now - node->inbound_link_freed_time > handshake_timeout) { + /* Node has an outbound link, but no inbound link for more than the handshake timeout. + * This probably means this node does not know us yet, whereas we know it. + * So we send it a MEET packet to do a handshake with it and correct the inconsistent cluster view. 
*/ + node->flags |= CLUSTER_NODE_MEET; + serverLog(LL_NOTICE, "Sending MEET packet to node %.40s because there is no inbound link for it", node->name); + clusterSendPing(node->link, CLUSTERMSG_TYPE_MEET); + } if (node->link == NULL) { clusterLink *link = createClusterLink(node); diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 5595402a4d..fb317038d6 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -61,12 +61,14 @@ typedef struct clusterLink { #define nodeIsPrimary(n) ((n)->flags & CLUSTER_NODE_PRIMARY) #define nodeIsReplica(n) ((n)->flags & CLUSTER_NODE_REPLICA) #define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) +#define nodeInMeetState(n) ((n)->flags & CLUSTER_NODE_MEET) #define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) #define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL) #define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) #define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) #define nodeSupportsExtensions(n) ((n)->flags & CLUSTER_NODE_EXTENSIONS_SUPPORTED) #define nodeSupportsLightMsgHdr(n) ((n)->flags & CLUSTER_NODE_LIGHT_HDR_SUPPORTED) +#define nodeInNormalState(n) (!((n)->flags & (CLUSTER_NODE_HANDSHAKE | CLUSTER_NODE_MEET | CLUSTER_NODE_PFAIL | CLUSTER_NODE_FAIL))) /* This structure represent elements of node->fail_reports. */ typedef struct clusterNodeFailReport { @@ -343,6 +345,8 @@ struct _clusterNode { * failover scenarios. */ mstime_t repl_offset_time; /* Unix time we received offset for this node */ mstime_t orphaned_time; /* Starting time of orphaned primary condition */ + mstime_t inbound_link_freed_time; /* Last time we freed the inbound link for this node. + If it was never freed, it is the same as ctime */ long long repl_offset; /* Last known repl offset for this node. */ char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ sds announce_client_ipv4; /* IPv4 for clients only. */ diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index 686f00071b..4f641c5e96 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -323,6 +323,15 @@ proc get_cluster_nodes {id {status "*"}} { return $nodes } +# Returns the parsed myself node entry as a dictionary. +proc get_myself id { + set nodes [get_cluster_nodes $id] + foreach n $nodes { + if {[cluster_has_flag $n myself]} {return $n} + } + return {} +} + # Returns 1 if no node knows node_id, 0 if any node knows it. 
proc node_is_forgotten {node_id} { for {set j 0} {$j < [llength $::servers]} {incr j} { diff --git a/tests/unit/cluster/cluster-multiple-meets.tcl b/tests/unit/cluster/cluster-multiple-meets.tcl index 059f03fbe4..0b5f769930 100644 --- a/tests/unit/cluster/cluster-multiple-meets.tcl +++ b/tests/unit/cluster/cluster-multiple-meets.tcl @@ -58,7 +58,7 @@ tags {tls:skip external:skip cluster} { } else { fail "Node 1 recognizes node 0 even though it drops PONGs from node 0" } - assert {[llength [get_cluster_nodes 0 connected]] == 2} + assert {[llength [get_cluster_nodes 0]] == 2} # Drop incoming and outgoing links from/to 1 R 0 DEBUG CLUSTERLINK KILL ALL [R 1 CLUSTER MYID] @@ -77,6 +77,8 @@ tags {tls:skip external:skip cluster} { # Both a and b will turn to cluster state ok wait_for_condition 1000 50 { [CI 1 cluster_state] eq {ok} && [CI 0 cluster_state] eq {ok} && + [llength [get_cluster_nodes 0 connected]] == 2 && + [llength [get_cluster_nodes 1 connected]] == 2 && [CI 1 cluster_stats_messages_meet_sent] == [CI 0 cluster_stats_messages_meet_received] } else { fail "1 cluster_state:[CI 1 cluster_state], 0 cluster_state: [CI 0 cluster_state]" diff --git a/tests/unit/cluster/cluster-reliable-meet.tcl b/tests/unit/cluster/cluster-reliable-meet.tcl index 45f5a6dc89..f189e96d5b 100644 --- a/tests/unit/cluster/cluster-reliable-meet.tcl +++ b/tests/unit/cluster/cluster-reliable-meet.tcl @@ -3,6 +3,12 @@ set old_singledb $::singledb set ::singledb 1 tags {tls:skip external:skip cluster} { + set CLUSTER_PACKET_TYPE_PING 0 + set CLUSTER_PACKET_TYPE_PONG 1 + set CLUSTER_PACKET_TYPE_MEET 2 + set CLUSTER_PACKET_TYPE_NONE -1 + set CLUSTER_PACKET_TYPE_ALL -2 + set base_conf [list cluster-enabled yes] start_multiple_servers 2 [list overrides $base_conf] { test "Cluster nodes are reachable" { @@ -22,9 +28,6 @@ tags {tls:skip external:skip cluster} { wait_for_cluster_state fail } - set CLUSTER_PACKET_TYPE_MEET 2 - set CLUSTER_PACKET_TYPE_NONE -1 - test "Cluster nodes haven't met each other" { assert {[llength [get_cluster_nodes 1]] == 1} assert {[llength [get_cluster_nodes 0]] == 1} @@ -75,3 +78,202 @@ tags {tls:skip external:skip cluster} { set ::singledb $old_singledb +proc cluster_get_first_node_in_handshake id { + set nodes [get_cluster_nodes $id] + foreach n $nodes { + if {[cluster_has_flag $n handshake]} { + return [dict get $n id] + } + } + return {} +} + +proc cluster_nodes_all_know_each_other {num_nodes} { + # Collect node IDs dynamically + set node_ids {} + for {set i 0} {$i < $num_nodes} {incr i} { + lappend node_ids [dict get [get_myself $i] id] + } + + # Check if all nodes know each other + foreach node_id $node_ids { + foreach check_node_id $node_ids { + for {set node_index 0} {$node_index < $num_nodes} {incr node_index} { + if {[cluster_get_node_by_id $node_index $check_node_id] == {}} { + return 0 + } + } + } + } + + # Verify cluster link counts for each node + set expected_links [expr {2 * ($num_nodes - 1)}] + for {set i 0} {$i < $num_nodes} {incr i} { + if {[llength [R $i CLUSTER LINKS]] != $expected_links} { + return 0 + } + } + + return 1 +} + +start_cluster 2 0 {tags {external:skip cluster} overrides {cluster-node-timeout 4000 cluster-replica-no-failover yes}} { + set CLUSTER_PACKET_TYPE_PING 0 + set CLUSTER_PACKET_TYPE_PONG 1 + set CLUSTER_PACKET_TYPE_MEET 2 + set CLUSTER_PACKET_TYPE_NONE -1 + set CLUSTER_PACKET_TYPE_ALL -2 + + test "Handshake eventually succeeds after node handshake timeout on both sides with inconsistent view of the cluster" { + set cluster_port [find_available_port 
$::baseport $::portcount] start_server [list overrides [list cluster-enabled yes cluster-node-timeout 4000 cluster-port $cluster_port]] { # In this test we will trigger a handshake timeout on both sides of the handshake. # Node 1 and 2 already know each other, then we make node 1 meet node 0: # # Node 1 -- MEET -> Node 0 [Node 0 might learn about Node 2 from the gossip section of the msg] # Node 1 <- PONG -- Node 0 [we drop this message, so Node 1 will eventually mark the handshake as timed out] # Node 1 <- PING -- Node 0 [we drop this message, so Node 1 will never send a PONG and Node 0 will eventually mark the handshake as timed out] # # After the handshake is timed out, we allow all cluster bus messages to go through. # Eventually Node 0 should send a MEET packet to the other nodes to complete the handshake. set node0_id [dict get [get_myself 0] id] set node1_id [dict get [get_myself 1] id] set node2_id [dict get [get_myself 2] id] # Drop all cluster bus messages R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_ALL # Drop MEET cluster bus messages, so that Node 0 cannot start a handshake with Node 2. R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_MEET R 1 CLUSTER MEET [srv 0 host] [srv 0 port] $cluster_port # Wait for Node 0 to be in handshake wait_for_condition 10 400 { [cluster_get_first_node_in_handshake 0] != {} } else { fail "Node 0 never entered handshake state" } # We want Node 0 to learn about Node 2 through the gossip section of the MEET message set meet_retry 0 while {[cluster_get_node_by_id 0 $node2_id] eq {}} { if {$meet_retry == 10} { error "assertion: Retried to meet Node 0 too many times" } # If Node 0 doesn't know about Node 1 & 2, it means Node 1 did not gossip about node 2 in its MEET message. # So we kill the outbound link from Node 1 to Node 0, to force a reconnect and a re-send of the MEET message. after 100 # Since we are in handshake, we use a randomly generated ID we have to find R 1 DEBUG CLUSTERLINK KILL ALL [cluster_get_first_node_in_handshake 1] incr meet_retry 1 } # Wait for Node 1's handshake to timeout wait_for_condition 50 100 { [cluster_get_first_node_in_handshake 1] eq {} } else { fail "Node 1 never exited handshake state" } # Wait for Node 0's handshake to timeout wait_for_condition 50 100 { [cluster_get_first_node_in_handshake 0] eq {} } else { fail "Node 0 never exited handshake state" } # At this point Node 0 knows Node 1 & 2 through the gossip, but they don't know Node 0. wait_for_condition 50 100 { [cluster_get_node_by_id 0 $node1_id] != {} && [cluster_get_node_by_id 0 $node2_id] != {} && [cluster_get_node_by_id 1 $node0_id] eq {} && [cluster_get_node_by_id 2 $node0_id] eq {} } else { fail "Unexpected CLUSTER NODES output, nodes 1 & 2 should not know node 0." } # Allow all messages to go through again R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE # Now Node 0 will send a MEET packet to Node 1 & 2 since it has an outbound link to these nodes but no inbound link. # Handshake should now complete successfully. wait_for_condition 50 200 { [cluster_nodes_all_know_each_other 3] } else { fail "Unexpected CLUSTER NODES output, all nodes should know each other." 
+ } + } ;# stop Node 0 + } ;# test +} ;# stop cluster + +start_cluster 2 0 {tags {external:skip cluster} overrides {cluster-node-timeout 4000 cluster-replica-no-failover yes}} { + set CLUSTER_PACKET_TYPE_PING 0 + set CLUSTER_PACKET_TYPE_PONG 1 + set CLUSTER_PACKET_TYPE_MEET 2 + set CLUSTER_PACKET_TYPE_NONE -1 + set CLUSTER_PACKET_TYPE_ALL -2 + + test "Handshake eventually succeeds after node handshake timeout on one side with inconsistent view of the cluster" { + set cluster_port [find_available_port $::baseport $::portcount] + start_server [list overrides [list cluster-enabled yes cluster-node-timeout 4000 cluster-port $cluster_port]] { + # In this test we will trigger a handshake timeout on one side of the handshake. + # Node 1 and 2 already know each other, then we make node 0 meet node 1: + # + # Node 0 -- MEET -> Node 1 + # Node 0 <- PONG -- Node 1 + # Node 0 <- PING -- Node 1 [Node 0 will mark the handshake as successful] + # Node 0 -- PONG -> Node 1 [we drop this message, so node 1 will eventually mark the handshake as timed out] + # + # After the handshake is timed out, we allow all cluster bus messages to go through. + # Eventually Node 0 should send a MEET packet to the other nodes to complete the handshake. + + set node0_id [dict get [get_myself 0] id] + set node1_id [dict get [get_myself 1] id] + set node2_id [dict get [get_myself 2] id] + + # Drop PONG messages + R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_PONG + # Drop MEET cluster bus messages, so that Node 0 cannot start a handshake with Node 2. + R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_MEET + + # Node 0 meets node 1 + R 0 CLUSTER MEET [srv -1 host] [srv -1 port] + + # Wait for node 0 to know about the other nodes in the cluster + wait_for_condition 50 100 { + [cluster_get_node_by_id 0 $node1_id] != {} + } else { + fail "Node 0 never learned about node 1" + } + # At this point, node 0 knows about node 1 and might know node 2 if node 1 gossiped about it. + wait_for_condition 50 100 { + [cluster_get_first_node_in_handshake 0] eq {} + } else { + fail "Node 0 never exited handshake state" + } + # At this point, from node 0's point of view, the handshake with node 1 succeeded. + + wait_for_condition 50 100 { + [cluster_get_first_node_in_handshake 1] eq {} + } else { + fail "Node 1 never exited handshake state" + } + assert {[cluster_get_node_by_id 1 $node0_id] eq {}} + # At this point, from node 1's point of view, the handshake with node 0 timed out. + + # Allow all messages + R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE + R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE + + # Now Node 0 will send a MEET packet to Node 1 & 2 since it has an outbound link to these nodes but no inbound link. + # Handshake should now complete successfully. + wait_for_condition 50 200 { + [cluster_nodes_all_know_each_other 3] + } else { + fail "Unexpected CLUSTER NODES output, all nodes should know each other." + } + } ;# stop Node 0 + } ;# test +} ;# stop cluster From f5793d8bebbb8ccd34ad767b357168b083ff0481 Mon Sep 17 00:00:00 2001 From: ranshid <88133677+ranshid@users.noreply.github.com> Date: Thu, 12 Dec 2024 23:52:58 +0200 Subject: [PATCH 005/101] Avoid defragging scripts during EVAL command execution (#1414) This can happen when a script runs for a long period of time and the server attempts to defrag it in the whileBlockedCron. 
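A minimal standalone sketch of the bail-out pattern, with stand-in declarations for the server hooks (`scriptIsRunning()` mirrors the real helper from script.h; `defragScriptsDict()` is a hypothetical wrapper for the dictionary walk):

```c
#include <stdbool.h>

typedef enum { DEFRAG_NOT_DONE, DEFRAG_DONE } doneStatus;

/* Stand-in declarations for the real server hooks (illustration only). */
extern bool scriptIsRunning(void);
extern void defragScriptsDict(void);

/* While a script is executing, defrag could relocate the body of the very
 * script being run, so the stage bails out and reports itself done; a later
 * defrag cycle will revisit the scripts dictionary when it is safe. */
doneStatus defragLuaScriptsStage(void) {
    if (scriptIsRunning()) return DEFRAG_DONE;
    defragScriptsDict();
    return DEFRAG_DONE;
}
```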
Signed-off-by: Ran Shidlansik --- src/defrag.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/defrag.c b/src/defrag.c index 2fa067f0dc..be7ff07510 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -34,6 +34,7 @@ */ #include "server.h" +#include "script.h" #include #ifdef HAVE_DEFRAG @@ -1050,6 +1051,9 @@ static doneStatus defragLuaScripts(monotime endtime, void *target, void *privdat UNUSED(target); UNUSED(privdata); if (endtime == 0) return DEFRAG_NOT_DONE; // required initialization + /* In case we are in the process of eval some script we do not want to replace the script being run + * so we just bail out without really defragging here. */ + if (scriptIsRunning()) return DEFRAG_DONE; activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT); return DEFRAG_DONE; } From e407ced83434651ee4ea3e2f8885bc457269f1af Mon Sep 17 00:00:00 2001 From: Vu Diep <54611122+vudiep411@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:42:52 -0800 Subject: [PATCH 006/101] Use `configure-aws-credentials` workflow instead of passing `secret_access_key` (#1363) ## Summary This PR fixes #1346 where we can get rid of the long term credentials by using OpenID Connect. OpenID Connect (OIDC) allows your GitHub Actions workflows to access resources in Amazon Web Services (AWS), without needing to store the AWS credentials as long-lived GitHub secrets. --------- Signed-off-by: vudiep411 --- .github/workflows/build-release-packages.yml | 43 +++++++++++++------ .../call-build-linux-arm-packages.yml | 39 ++++++++--------- .../call-build-linux-x86-packages.yml | 39 ++++++++--------- 3 files changed, 65 insertions(+), 56 deletions(-) diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml index 094d82de08..44e012d658 100644 --- a/.github/workflows/build-release-packages.yml +++ b/.github/workflows/build-release-packages.yml @@ -3,7 +3,12 @@ name: Build Release Packages on: release: types: [published] - + push: + paths: + - '.github/workflows/build-release-packages.yml' + - '.github/workflows/call-build-linux-arm-packages.yml' + - '.github/workflows/call-build-linux-x86_64-packages.yml' + - 'utils/releasetools/build-config.json' workflow_dispatch: inputs: version: @@ -11,6 +16,7 @@ on: required: true permissions: + id-token: write contents: read jobs: @@ -20,8 +26,8 @@ jobs: runs-on: ubuntu-latest outputs: version: ${{ steps.get_version.outputs.VERSION }} + is_test: ${{ steps.check-if-testing.outputs.IS_TEST }} steps: - - run: | echo "Version: ${{ inputs.version || github.ref_name }}" shell: bash @@ -32,8 +38,13 @@ jobs: - name: Get the version id: get_version run: | - VERSION="${INPUT_VERSION}" + if [[ "${{ github.event_name }}" == "push" ]]; then + VERSION=${{ github.ref_name }} + else + VERSION="${INPUT_VERSION}" + fi if [ -z "${VERSION}" ]; then + echo "Error: No version specified" exit 1 fi echo "VERSION=$VERSION" >> $GITHUB_OUTPUT @@ -43,6 +54,16 @@ jobs: # only ever be a tag INPUT_VERSION: ${{ inputs.version || github.ref_name }} + - name: Check if we are testing + id: check-if-testing + run: | + if [[ "${{ github.event_name }}" == "push" ]]; then + echo "IS_TEST=true" >> $GITHUB_OUTPUT + else + echo "IS_TEST=false" >> $GITHUB_OUTPUT + fi + shell: bash + generate-build-matrix: name: Generating build matrix runs-on: ubuntu-latest @@ -56,7 +77,7 @@ jobs: - uses: ./.github/actions/generate-package-build-matrix id: set-matrix with: - ref: ${{ inputs.version || github.ref_name }} + ref: ${{ needs.release-build-get-meta.outputs.version }} 
release-build-linux-x86-packages: needs: @@ -67,11 +88,10 @@ jobs: version: ${{ needs.release-build-get-meta.outputs.version }} ref: ${{ inputs.version || github.ref_name }} build_matrix: ${{ needs.generate-build-matrix.outputs.x86_64-build-matrix }} + region: us-west-2 secrets: - token: ${{ secrets.GITHUB_TOKEN }} - bucket: ${{ secrets.AWS_S3_BUCKET }} - access_key_id: ${{ secrets.AWS_S3_ACCESS_KEY_ID }} - secret_access_key: ${{ secrets.AWS_S3_ACCESS_KEY }} + bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_TEST_BUCKET || secrets.AWS_S3_BUCKET }} + role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} release-build-linux-arm-packages: needs: @@ -82,8 +102,7 @@ jobs: version: ${{ needs.release-build-get-meta.outputs.version }} ref: ${{ inputs.version || github.ref_name }} build_matrix: ${{ needs.generate-build-matrix.outputs.arm64-build-matrix }} + region: us-west-2 secrets: - token: ${{ secrets.GITHUB_TOKEN }} - bucket: ${{ secrets.AWS_S3_BUCKET }} - access_key_id: ${{ secrets.AWS_S3_ACCESS_KEY_ID }} - secret_access_key: ${{ secrets.AWS_S3_ACCESS_KEY }} + bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_TEST_BUCKET || secrets.AWS_S3_BUCKET }} + role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} diff --git a/.github/workflows/call-build-linux-arm-packages.yml b/.github/workflows/call-build-linux-arm-packages.yml index 2a7bcc533f..65445a83c8 100644 --- a/.github/workflows/call-build-linux-arm-packages.yml +++ b/.github/workflows/call-build-linux-arm-packages.yml @@ -15,21 +15,20 @@ on: description: The build targets to produce as a JSON matrix. type: string required: true + region: + description: The AWS region to push packages into. + type: string + required: true secrets: - token: - description: The Github token or similar to authenticate with. + bucket_name: + description: The S3 bucket to push packages into. + required: true + role_to_assume: + description: The role to assume for the S3 bucket. required: true - bucket: - description: The name of the S3 bucket to push packages into. - required: false - access_key_id: - description: The S3 access key id for the bucket. - required: false - secret_access_key: - description: The S3 secret access key for the bucket. - required: false permissions: + id-token: write contents: read jobs: @@ -46,6 +45,12 @@ jobs: with: ref: ${{ inputs.version }} + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ inputs.region }} + role-to-assume: ${{ secrets.role_to_assume }} + - name: Make Valkey uses: uraimo/run-on-arch-action@v2 with: @@ -65,15 +70,5 @@ jobs: mkdir -p packages-files cp -rfv $TAR_FILE_NAME.tar* packages-files/ - - name: Install AWS cli. - run: | - sudo apt-get install -y awscli - - - name: Configure AWS credentials - run: | - aws configure set region us-west-2 - aws configure set aws_access_key_id ${{ secrets.access_key_id }} - aws configure set aws_secret_access_key ${{ secrets.secret_access_key }} - - name: Sync to S3 - run: aws s3 sync packages-files s3://${{secrets.bucket}}/releases/ + run: aws s3 sync packages-files s3://${{ secrets.bucket_name }}/releases/ diff --git a/.github/workflows/call-build-linux-x86-packages.yml b/.github/workflows/call-build-linux-x86-packages.yml index 9e438fa61a..a603c53c13 100644 --- a/.github/workflows/call-build-linux-x86-packages.yml +++ b/.github/workflows/call-build-linux-x86-packages.yml @@ -15,21 +15,20 @@ on: description: The build targets to produce as a JSON matrix. 
type: string required: true + region: + description: The AWS region to upload the packages to. + type: string + required: true secrets: - token: - description: The Github token or similar to authenticate with. + bucket_name: + description: The name of the S3 bucket to upload the packages to. + required: true + role_to_assume: + description: The role to assume for the S3 bucket. + required: true - bucket: - description: The name of the S3 bucket to push packages into. - required: false - access_key_id: - description: The S3 access key id for the bucket. - required: false - secret_access_key: - description: The S3 secret access key for the bucket. - required: false permissions: + id-token: write contents: read jobs: @@ -46,6 +45,12 @@ jobs: with: ref: ${{ inputs.version }} + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ inputs.region }} + role-to-assume: ${{ secrets.role_to_assume }} + - name: Install dependencies run: sudo apt-get update && sudo apt-get install -y build-essential libssl-dev libsystemd-dev @@ -63,15 +68,5 @@ jobs: mkdir -p packages-files cp -rfv $TAR_FILE_NAME.tar* packages-files/ - - name: Install AWS cli. - run: | - sudo apt-get install -y awscli - - - name: Configure AWS credentials - run: | - aws configure set region us-west-2 - aws configure set aws_access_key_id ${{ secrets.access_key_id }} - aws configure set aws_secret_access_key ${{ secrets.secret_access_key }} - - name: Sync to S3 - run: aws s3 sync packages-files s3://${{secrets.bucket}}/releases/ + run: aws s3 sync packages-files s3://${{ secrets.bucket_name }}/releases/ From fa97d89f766371e05f8329d88422b7db7845e5da Mon Sep 17 00:00:00 2001 From: Roshan Khatri <117414976+roshkhatri@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:46:35 -0800 Subject: [PATCH 007/101] Fix Valkey binary build workflow, version support changes. (#1429) This change makes the binary build run on the target ubuntu version. This PR also deprecated ubuntu18, and valkey will now support: - X86: - Ubuntu 20 - Ubuntu 22 - Ubuntu 24 - ARM: - Ubuntu 20 - Ubuntu 22 Removed ARM ubuntu 24 as the action we are using for ARM builds does not support Ubuntu 24. 
--------- Signed-off-by: Roshan Khatri --- .../call-build-linux-x86-packages.yml | 2 +- utils/releasetools/build-config.json | 21 +++++++------------ 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/.github/workflows/call-build-linux-x86-packages.yml b/.github/workflows/call-build-linux-x86-packages.yml index a603c53c13..4e68bf85f0 100644 --- a/.github/workflows/call-build-linux-x86-packages.yml +++ b/.github/workflows/call-build-linux-x86-packages.yml @@ -35,7 +35,7 @@ jobs: build-valkey: # Capture source tarball and generate checksum for it name: Build package ${{ matrix.distro.target }} ${{ matrix.distro.arch }} - runs-on: "ubuntu-latest" + runs-on: ${{matrix.distro.target}} strategy: fail-fast: false matrix: ${{ fromJSON(inputs.build_matrix) }} diff --git a/utils/releasetools/build-config.json b/utils/releasetools/build-config.json index 5e39fae70f..f64bf601ca 100644 --- a/utils/releasetools/build-config.json +++ b/utils/releasetools/build-config.json @@ -1,29 +1,24 @@ { "linux_targets": [ + { "arch": "x86_64", - "target": "ubuntu18.04", + "target": "ubuntu-20.04", "type": "deb", - "platform": "bionic" + "platform": "focal" }, { "arch": "x86_64", - "target": "ubuntu20.04", + "target": "ubuntu-22.04", "type": "deb", - "platform": "focal" + "platform": "jammy" }, { "arch": "x86_64", - "target": "ubuntu24.04", + "target": "ubuntu-24.04", "type": "deb", "platform": "noble" }, - { - "arch": "arm64", - "target": "ubuntu18.04", - "type": "deb", - "platform": "bionic" - }, { "arch": "arm64", "target": "ubuntu20.04", @@ -32,9 +27,9 @@ }, { "arch": "arm64", - "target": "ubuntu24.04", + "target": "ubuntu22.04", "type": "deb", - "platform": "noble" + "platform": "jammy" } ] } \ No newline at end of file From efa90ff10eade0236ce1230a1fa24c1f7451c731 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Thu, 12 Dec 2024 14:55:57 -0800 Subject: [PATCH 008/101] defrag: eliminate persistent kvstore pointer and edge case fixes (#1430) This update addresses several issues in defrag: 1. In the defrag redesign (https://github.com/valkey-io/valkey/pull/1242), a bug was introduced where `server.cronloops` was no longer being incremented in the `whileBlockedCron()`. This resulted in some memory statistics not being updated while blocked. 2. In the test case for AOF loading, we were seeing errors due to defrag latencies. However, running the math, the latencies are justified given the extremely high CPU target of the testcase. Adjusted the expected latency check to allow longer latencies for this case where defrag is undergoing starvation while AOF loading is in progress. 3. A "stage" is passed a "target". For the main dictionary and expires, we were passing in a `kvstore*`. However, on flushall or swapdb, the pointer may change. It's safer and more stable to use an index for the DB (a DBID). Then if the pointer changes, we can detect the change, and simply abort the stage. (If there's still fragmentation to deal with, we'll pick it up again on the next cycle.) 4. We always start a new stage on a new defrag cycle. This gives the new stage time to run, and prevents latency issues for certain stages which don't operate incrementally. However, often several stages will require almost no work, and this will leave a chunk of our CPU allotment unused. This is mainly an issue in starvation situations (like AOF loading or LUA script) - where defrag is running infrequently, with a large duty-cycle. This change allows a new stage to be initiated if we still have a standard duty-cycle remaining. 
(This can happen during starvation situations where the planned duty cycle is larger than the standard cycle. Most likely this isn't a concern for real scenarios, but it was observed in testing.) 5. Minor comment correction in `server.h` Signed-off-by: Jim Brunner --- src/defrag.c | 67 ++++++++++++++++++++++-------------- src/server.c | 6 ++++ src/server.h | 3 +- tests/unit/memefficiency.tcl | 8 +++-- 4 files changed, 53 insertions(+), 31 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index be7ff07510..8c1ad29de2 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -121,7 +121,7 @@ typedef doneStatus (*kvstoreHelperPreContinueFn)(monotime endtime, void *privdat // Private data for main dictionary keys typedef struct { kvstoreIterState kvstate; - serverDb *db; + int dbid; } defragKeysCtx; static_assert(offsetof(defragKeysCtx, kvstate) == 0, "defragStageKvstoreHelper requires this"); @@ -736,7 +736,7 @@ static void defragModule(serverDb *db, robj *obj) { /* for each key we scan in the main dict, this function will attempt to defrag * all the various pointers it has. */ static void defragKey(defragKeysCtx *ctx, robj **elemref) { - serverDb *db = ctx->db; + serverDb *db = &server.db[ctx->dbid]; int slot = ctx->kvstate.slot; robj *newob, *ob; unsigned char *newzl; @@ -920,7 +920,7 @@ static doneStatus defragLaterStep(monotime endtime, void *privdata) { robj *ob = found; long long key_defragged = server.stat_active_defrag_hits; - bool timeout = (defragLaterItem(ob, &defrag_later_cursor, endtime, ctx->db->id) == 1); + bool timeout = (defragLaterItem(ob, &defrag_later_cursor, endtime, ctx->dbid) == 1); if (key_defragged != server.stat_active_defrag_hits) { server.stat_active_defrag_key_hits++; } else { @@ -963,7 +963,10 @@ static doneStatus defragStageKvstoreHelper(monotime endtime, state.cursor = 0; return DEFRAG_NOT_DONE; } - serverAssert(kvs == state.kvs); // Shouldn't change during the stage + if (kvs != state.kvs) { + // There has been a change of the kvs (flushdb, swapdb, etc.). Just complete the stage. + return DEFRAG_DONE; + } unsigned int iterations = 0; unsigned long long prev_defragged = server.stat_active_defrag_hits; @@ -1013,26 +1016,30 @@ static doneStatus defragStageKvstoreHelper(monotime endtime, } -// Note: target is a DB, (not a KVS like most stages) +// Target is a DBID static doneStatus defragStageDbKeys(monotime endtime, void *target, void *privdata) { UNUSED(privdata); - serverDb *db = (serverDb *)target; + int dbid = (uintptr_t)target; + serverDb *db = &server.db[dbid]; static defragKeysCtx ctx; // STATIC - this persists if (endtime == 0) { - ctx.db = db; + ctx.dbid = dbid; // Don't return yet. Call the helper with endtime==0 below. 
} - serverAssert(ctx.db == db); + serverAssert(ctx.dbid == dbid); return defragStageKvstoreHelper(endtime, db->keys, dbKeysScanCallback, defragLaterStep, &ctx); } +// Target is a DBID static doneStatus defragStageExpiresKvstore(monotime endtime, void *target, void *privdata) { UNUSED(privdata); - return defragStageKvstoreHelper(endtime, (kvstore *)target, + int dbid = (uintptr_t)target; + serverDb *db = &server.db[dbid]; + return defragStageKvstoreHelper(endtime, db->expires, scanHashtableCallbackCountScanned, NULL, NULL); } @@ -1226,29 +1233,38 @@ static long long activeDefragTimeProc(struct aeEventLoop *eventLoop, long long i } monotime starttime = getMonotonicUs(); - monotime endtime = starttime + computeDefragCycleUs(); + int dutyCycleUs = computeDefragCycleUs(); + monotime endtime = starttime + dutyCycleUs; + bool haveMoreWork = true; mstime_t latency; latencyStartMonitor(latency); - if (!defrag.current_stage) { - defrag.current_stage = listNodeValue(listFirst(defrag.remaining_stages)); - listDelNode(defrag.remaining_stages, listFirst(defrag.remaining_stages)); - // Initialize the stage with endtime==0 - doneStatus status = defrag.current_stage->stage_fn(0, defrag.current_stage->target, defrag.current_stage->privdata); - serverAssert(status == DEFRAG_NOT_DONE); // Initialization should always return DEFRAG_NOT_DONE - } + do { + if (!defrag.current_stage) { + defrag.current_stage = listNodeValue(listFirst(defrag.remaining_stages)); + listDelNode(defrag.remaining_stages, listFirst(defrag.remaining_stages)); + // Initialize the stage with endtime==0 + doneStatus status = defrag.current_stage->stage_fn(0, defrag.current_stage->target, defrag.current_stage->privdata); + serverAssert(status == DEFRAG_NOT_DONE); // Initialization should always return DEFRAG_NOT_DONE + } - doneStatus status = defrag.current_stage->stage_fn(endtime, defrag.current_stage->target, defrag.current_stage->privdata); - if (status == DEFRAG_DONE) { - zfree(defrag.current_stage); - defrag.current_stage = NULL; - } + doneStatus status = defrag.current_stage->stage_fn(endtime, defrag.current_stage->target, defrag.current_stage->privdata); + if (status == DEFRAG_DONE) { + zfree(defrag.current_stage); + defrag.current_stage = NULL; + } + + haveMoreWork = (defrag.current_stage || listLength(defrag.remaining_stages) > 0); + /* If we've completed a stage early, and still have a standard time allotment remaining, + * we'll start another stage. This can happen when defrag is running infrequently, and + * starvation protection has increased the duty-cycle. 
*/ + } while (haveMoreWork && getMonotonicUs() <= endtime - server.active_defrag_cycle_us); latencyEndMonitor(latency); latencyAddSampleIfNeeded("active-defrag-cycle", latency); - if (defrag.current_stage || listLength(defrag.remaining_stages) > 0) { + if (haveMoreWork) { return computeDelayMs(endtime); } else { endDefragCycle(true); @@ -1287,9 +1303,8 @@ static void beginDefragCycle(void) { defrag.remaining_stages = listCreate(); for (int dbid = 0; dbid < server.dbnum; dbid++) { - serverDb *db = &server.db[dbid]; - addDefragStage(defragStageDbKeys, db, NULL); - addDefragStage(defragStageExpiresKvstore, db->expires, NULL); + addDefragStage(defragStageDbKeys, (void *)(uintptr_t)dbid, NULL); + addDefragStage(defragStageExpiresKvstore, (void *)(uintptr_t)dbid, NULL); } static getClientChannelsFnWrapper getClientPubSubChannelsFn = {getClientPubSubChannels}; diff --git a/src/server.c b/src/server.c index 518ecad603..e495730fe2 100644 --- a/src/server.c +++ b/src/server.c @@ -1669,6 +1669,12 @@ void whileBlockedCron(void) { * latency monitor if this function is called too often. */ if (server.blocked_last_cron >= server.mstime) return; + /* Increment server.cronloops so that run_with_period works. */ + long hz_ms = 1000 / server.hz; + int cronloops = (server.mstime - server.blocked_last_cron + (hz_ms - 1)) / hz_ms; // rounding up + server.blocked_last_cron += cronloops * hz_ms; + server.cronloops += cronloops; + mstime_t latency; latencyStartMonitor(latency); diff --git a/src/server.h b/src/server.h index 3ba7a61b7d..88afb57c81 100644 --- a/src/server.h +++ b/src/server.h @@ -1900,8 +1900,7 @@ struct valkeyServer { int sanitize_dump_payload; /* Enables deep sanitization for ziplist and listpack in RDB and RESTORE. */ int skip_checksum_validation; /* Disable checksum validation for RDB and RESTORE payload. */ int jemalloc_bg_thread; /* Enable jemalloc background thread */ - int active_defrag_configuration_changed; /* defrag configuration has been changed and need to reconsider - * active_defrag_running in computeDefragCycles. */ + int active_defrag_configuration_changed; /* Config changed; need to recompute active_defrag_cpu_percent. */ size_t active_defrag_ignore_bytes; /* minimum amount of fragmentation waste to start active defrag */ int active_defrag_threshold_lower; /* minimum percentage of fragmentation to start active defrag */ int active_defrag_threshold_upper; /* maximum percentage of fragmentation at which we use maximum effort */ diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index ce74b7c618..78a68a682d 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -172,10 +172,12 @@ run_solo {defrag} { # make sure the defragger did enough work to keep the fragmentation low during loading. # we cannot check that it went all the way down, since we don't wait for full defrag cycle to complete. assert {$frag < 1.4} - # since the AOF contains simple (fast) SET commands (and the cron during loading runs every 1024 commands), - # it'll still not block the loading for long periods of time. + # The AOF contains simple (fast) SET commands (and the cron during loading runs every 1024 commands). + # Even so, defrag can get starved for periods exceeding 100ms. Using 200ms for test stability, and + # a 75% CPU requirement (as set above), we should allow up to 600ms latency + # (as total time = 200 non duty + 600 duty = 800ms, and 75% of 800ms is 600ms). 
if {!$::no_latency} { - assert {$max_latency <= 40} + assert {$max_latency <= 600} } } } ;# Active defrag - AOF loading From c962a3e1496f1ca67ef361e9d36e7c12e37b9659 Mon Sep 17 00:00:00 2001 From: Thalia Archibald Date: Fri, 13 Dec 2024 02:05:19 -0800 Subject: [PATCH 009/101] Check length before reading in `stringmatchlen` (#1431) Fixes four cases where `stringmatchlen` could overrun the pattern if it is not terminated with NUL. These commits are cherry-picked from my [fork](https://github.com/thaliaarchi/antirez-stringmatch) which extracts `stringmatch` as a library and compares it to other projects by antirez which use the same matcher. Signed-off-by: Thalia Archibald --- src/util.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/util.c b/src/util.c index 0b7af2d3fa..6d99d47e5a 100644 --- a/src/util.c +++ b/src/util.c @@ -104,23 +104,23 @@ static int stringmatchlen_impl(const char *pattern, pattern++; patternLen--; - not_op = pattern[0] == '^'; + not_op = patternLen && pattern[0] == '^'; if (not_op) { pattern++; patternLen--; } match = 0; while (1) { - if (pattern[0] == '\\' && patternLen >= 2) { + if (patternLen >= 2 && pattern[0] == '\\') { pattern++; patternLen--; if (pattern[0] == string[0]) match = 1; - } else if (pattern[0] == ']') { - break; } else if (patternLen == 0) { pattern--; patternLen++; break; + } else if (pattern[0] == ']') { + break; } else if (patternLen >= 3 && pattern[1] == '-') { int start = pattern[0]; int end = pattern[2]; @@ -173,7 +173,7 @@ static int stringmatchlen_impl(const char *pattern, pattern++; patternLen--; if (stringLen == 0) { - while (*pattern == '*') { + while (patternLen && *pattern == '*') { pattern++; patternLen--; } From 2b733719b1e09c979a9157c8e92547d093178ae7 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 14 Dec 2024 05:32:54 +0800 Subject: [PATCH 010/101] Skip build-release-packages CI job in forks (#1438) The CI job was introduced in #1363, we should skip it in forks. Signed-off-by: Binbin --- .github/workflows/build-release-packages.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml index 44e012d658..c7d5c8fe54 100644 --- a/.github/workflows/build-release-packages.yml +++ b/.github/workflows/build-release-packages.yml @@ -23,6 +23,7 @@ jobs: # This job provides the version metadata from the tag for the other jobs to use. release-build-get-meta: name: Get metadata to build + if: github.repository == 'valkey-io/valkey' runs-on: ubuntu-latest outputs: version: ${{ steps.get_version.outputs.VERSION }} @@ -66,6 +67,7 @@ jobs: generate-build-matrix: name: Generating build matrix + if: github.repository == 'valkey-io/valkey' runs-on: ubuntu-latest outputs: x86_64-build-matrix: ${{ steps.set-matrix.outputs.x86_64-build-matrix }} From 630327cbecc5ba28a0a82357a7ecc4e212825d6f Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 14 Dec 2024 06:26:20 +0800 Subject: [PATCH 011/101] Fix wrong file name in build-release-packages.yml (#1437) Introduced in #1363, the file name does not match. 
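For reference, the resulting trigger filter then names the reusable
workflow files that actually exist in the tree (an abridged sketch of
the YAML after this change; the diff below is the authoritative edit):

    paths:
      - '.github/workflows/build-release-packages.yml'
      - '.github/workflows/call-build-linux-arm-packages.yml'
      # was 'call-build-linux-x86_64-packages.yml', which does not exist
      - '.github/workflows/call-build-linux-x86-packages.yml'
      - 'utils/releasetools/build-config.json'

With the stale name in the filter, pushes touching the x86 packaging
workflow would never have re-triggered this workflow.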
Signed-off-by: Binbin --- .github/workflows/build-release-packages.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml index c7d5c8fe54..6c54971bcd 100644 --- a/.github/workflows/build-release-packages.yml +++ b/.github/workflows/build-release-packages.yml @@ -7,7 +7,7 @@ on: paths: - '.github/workflows/build-release-packages.yml' - '.github/workflows/call-build-linux-arm-packages.yml' - - '.github/workflows/call-build-linux-x86_64-packages.yml' + - '.github/workflows/call-build-linux-x86-packages.yml' - 'utils/releasetools/build-config.json' workflow_dispatch: inputs: From 5bf8a6b40a0af683cf6dbaa3b5d9911c5267bcad Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Sat, 14 Dec 2024 10:13:04 -0800 Subject: [PATCH 012/101] Avoid importing memory aligned malloc (#1442) We deprecate the usage of classic malloc and free, but under certain circumstances they might get imported from intrinsics. The original thought is we should just override malloc and free to use zmalloc and zfree, but I think we should continue to deprecate it to avoid accidental imports of allocations. Closes https://github.com/valkey-io/valkey/issues/1434. --------- Signed-off-by: Madelyn Olson --- src/hyperloglog.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/hyperloglog.c b/src/hyperloglog.c index f0390b3e1e..6056bc0098 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -36,6 +36,9 @@ #include #ifdef HAVE_AVX2 +/* Define __MM_MALLOC_H to prevent importing the memory aligned + * allocation functions, which we don't use. */ +#define __MM_MALLOC_H #include #endif From 8faf3c3c17e6340e70659ad24157c48fc76ae449 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Sat, 14 Dec 2024 10:14:01 -0800 Subject: [PATCH 013/101] Synchronously delete data during defrag tests (#1443) The creation of fragmentation is delayed when we use lazy-free. You can induce some of the active-defrag tests to fail by artificially adding a delay in the lazyfree process, similar to the issues seen in #1433 and issues like https://github.com/valkey-io/valkey/actions/runs/12267010712/job/34226304803#step:7:6538. The solution is to always do sync free during tests. Might close https://github.com/valkey-io/valkey/issues/1433. Signed-off-by: Madelyn Olson --- tests/unit/memefficiency.tcl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index 78a68a682d..8f6e5e8dd3 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -47,6 +47,8 @@ run_solo {defrag} { r config set active-defrag-ignore-bytes 2mb r config set maxmemory 100mb r config set maxmemory-policy allkeys-lru + r config set lazyfree-lazy-user-del no + r config set lazyfree-lazy-user-flush no populate 700000 asdf1 150 populate 100 asdf1 150 0 false 1000 From 3931ee48c392fae4eb23184a10afb5a111cf69d5 Mon Sep 17 00:00:00 2001 From: Rain Valentine Date: Sat, 14 Dec 2024 11:53:48 -0800 Subject: [PATCH 014/101] Replace dict with new hashtable for sets datatype (#1176) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new `hashtable` provides faster lookups and uses less memory than `dict`. A TCL test case "SRANDMEMBER with a dict containing long chain" is deleted because it's covered by a hashtable unit test "test_random_entry_with_long_chain", which is already present. 
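For reviewers, a rough sketch of the set fast path in terms of the new
API, using only calls that appear in this diff (server context and error
handling omitted; this is an illustration, not the exact t_set.c code):

    hashtable *ht = hashtableCreate(&setHashtableType);

    /* Insert: the table takes ownership of the sds on success; on a
     * duplicate, hashtableAdd() returns 0 and we free our own copy. */
    sds ele = sdsnewlen("member", 6);
    if (!hashtableAdd(ht, ele)) sdsfree(ele);

    /* Membership test: build a temporary sds key, then free it. */
    sds probe = sdsnewlen("member", 6);
    int is_member = hashtableFind(ht, probe, NULL);
    sdsfree(probe);
    (void)is_member;

    /* Iteration: entries are bare sds strings, with no dictEntry
     * wrapper around them. */
    hashtableIterator iter;
    hashtableInitIterator(&iter, ht);
    void *entry;
    while (hashtableNext(&iter, &entry)) {
        sds member = entry;
        (void)member;
    }
    hashtableResetIterator(&iter);

    hashtableRelease(ht); /* frees entries via the type's destructor */

Storing elements as bare sds pointers instead of dictEntry-wrapped keys
is where much of the memory saving comes from.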
This change also moves some logic from dismissMemory (object.c) to zmadvise_dontneed (zmalloc.c), so the hashtable implementation which needs the dismiss functionality doesn't need to depend on object.c and server.h. This PR follows #1186. --------- Signed-off-by: Rain Valentine Signed-off-by: Viktor Söderqvist Co-authored-by: Viktor Söderqvist --- src/db.c | 72 +++++++++++------ src/debug.c | 29 ++++--- src/defrag.c | 42 ++++++---- src/hashtable.c | 10 ++- src/hashtable.h | 3 +- src/lazyfree.c | 6 +- src/module.c | 39 ++++++--- src/object.c | 56 +++++++------ src/rdb.c | 42 +++++----- src/server.c | 57 +++++-------- src/server.h | 9 ++- src/t_set.c | 172 ++++++++++++++++++++-------------------- src/t_zset.c | 24 +++--- src/zmalloc.c | 19 ++++- src/zmalloc.h | 2 +- tests/unit/info.tcl | 8 +- tests/unit/type/set.tcl | 107 +------------------------ 17 files changed, 326 insertions(+), 371 deletions(-) diff --git a/src/db.c b/src/db.c index 2bd40ba74b..1223d00c8d 100644 --- a/src/db.c +++ b/src/db.c @@ -978,7 +978,7 @@ void keysScanCallback(void *privdata, void *entry) { /* This callback is used by scanGenericCommand in order to collect elements * returned by the dictionary iterator into a list. */ -void scanCallback(void *privdata, const dictEntry *de) { +void dictScanCallback(void *privdata, const dictEntry *de) { scanData *data = (scanData *)privdata; list *keys = data->keys; robj *o = data->o; @@ -998,9 +998,7 @@ void scanCallback(void *privdata, const dictEntry *de) { } } - if (o->type == OBJ_SET) { - key = keysds; - } else if (o->type == OBJ_HASH) { + if (o->type == OBJ_HASH) { key = keysds; if (!data->only_keys) { val = dictGetVal(de); @@ -1013,13 +1011,33 @@ void scanCallback(void *privdata, const dictEntry *de) { val = sdsnewlen(buf, len); } } else { - serverPanic("Type not handled in SCAN callback."); + serverPanic("Type not handled in dict SCAN callback."); } listAddNodeTail(keys, key); if (val) listAddNodeTail(keys, val); } +void hashtableScanCallback(void *privdata, void *entry) { + scanData *data = (scanData *)privdata; + robj *o = data->o; + list *keys = data->keys; + data->sampled++; + + /* currently only implemented for SET scan */ + serverAssert(o && o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HASHTABLE); + sds key = (sds)entry; /* Specific for OBJ_SET */ + + /* Filter element if it does not match the pattern. */ + if (data->pattern) { + if (!stringmatchlen(data->pattern, sdslen(data->pattern), key, sdslen(key), 0)) { + return; + } + } + + listAddNodeTail(keys, key); +} + /* Try to parse a SCAN cursor stored at object 'o': * if the cursor is valid, store it as unsigned integer into *cursor and * returns C_OK. Otherwise return C_ERR and send an error to the @@ -1083,7 +1101,6 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { sds typename = NULL; long long type = LLONG_MAX; int patlen = 0, use_pattern = 0, only_keys = 0; - dict *ht; /* Object must be NULL (to iterate keys names), or the type of the object * must be Set, Sorted Set, or Hash. */ @@ -1152,34 +1169,35 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * just return everything inside the object in a single call, setting the * cursor to zero to signal the end of the iteration. */ - /* Handle the case of a hash table. */ - ht = NULL; + /* Handle the case of kvstore, dict or hashtable. 
*/ + dict *dict_table = NULL; + hashtable *hashtable_table = NULL; + int shallow_copied_list_items = 0; if (o == NULL) { - ht = NULL; - } else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HT) { - ht = o->ptr; + shallow_copied_list_items = 1; + } else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable_table = o->ptr; + shallow_copied_list_items = 1; } else if (o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_HT) { - ht = o->ptr; + dict_table = o->ptr; + shallow_copied_list_items = 1; } else if (o->type == OBJ_ZSET && o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; - ht = zs->dict; + dict_table = zs->dict; + /* scanning ZSET allocates temporary strings even though it's a dict */ + shallow_copied_list_items = 0; } list *keys = listCreate(); - /* Set a free callback for the contents of the collected keys list. - * For the main keyspace dict, and when we scan a key that's dict encoded - * (we have 'ht'), we don't need to define free method because the strings - * in the list are just a shallow copy from the pointer in the dictEntry. - * When scanning a key with other encodings (e.g. listpack), we need to - * free the temporary strings we add to that list. - * The exception to the above is ZSET, where we do allocate temporary - * strings even when scanning a dict. */ - if (o && (!ht || o->type == OBJ_ZSET)) { + /* Set a free callback for the contents of the collected keys list if they + * are deep copied temporary strings. We must not free them if they are just + * a shallow copy - a pointer to the actual data in the data structure */ + if (!shallow_copied_list_items) { listSetFreeMethod(keys, (void (*)(void *))sdsfree); } - /* For main dictionary scan or data structure using hashtable. */ - if (!o || ht) { + /* For main hash table scan or scannable data structure. */ + if (!o || dict_table || hashtable_table) { /* We set the max number of iterations to ten times the specified * COUNT, so if the hash table is in a pathological state (very * sparsely populated) we avoid to block too much time at the cost @@ -1188,7 +1206,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { /* We pass scanData which have three pointers to the callback: * 1. data.keys: the list to which it will add new elements; - * 2. data.o: the object containing the dictionary so that + * 2. data.o: the object containing the hash table so that * it is possible to fetch more data in a type-dependent way; * 3. data.type: the specified type scan in the db, LLONG_MAX means * type matching is no needed; @@ -1219,8 +1237,10 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * If cursor is empty, we should try exploring next non-empty slot. 
*/ if (o == NULL) { cursor = kvstoreScan(c->db->keys, cursor, onlydidx, keysScanCallback, NULL, &data); + } else if (dict_table) { + cursor = dictScan(dict_table, cursor, dictScanCallback, &data); } else { - cursor = dictScan(ht, cursor, scanCallback, &data); + cursor = hashtableScan(hashtable_table, cursor, hashtableScanCallback, &data); } } while (cursor && maxiterations-- && data.sampled < count); } else if (o->type == OBJ_SET) { diff --git a/src/debug.c b/src/debug.c index d63d12f762..4efe12e237 100644 --- a/src/debug.c +++ b/src/debug.c @@ -916,30 +916,35 @@ void debugCommand(client *c) { addReplyVerbatim(c, stats, sdslen(stats), "txt"); sdsfree(stats); } else if (!strcasecmp(c->argv[1]->ptr, "htstats-key") && c->argc >= 3) { - robj *o; - dict *ht = NULL; int full = 0; - if (c->argc >= 4 && !strcasecmp(c->argv[3]->ptr, "full")) full = 1; - if ((o = objectCommandLookupOrReply(c, c->argv[2], shared.nokeyerr)) == NULL) return; + robj *o = objectCommandLookupOrReply(c, c->argv[2], shared.nokeyerr); + if (o == NULL) return; - /* Get the hash table reference from the object, if possible. */ + /* Get the dict reference from the object, if possible. */ + dict *d = NULL; + hashtable *ht = NULL; switch (o->encoding) { case OBJ_ENCODING_SKIPLIST: { zset *zs = o->ptr; - ht = zs->dict; + d = zs->dict; } break; - case OBJ_ENCODING_HT: ht = o->ptr; break; + case OBJ_ENCODING_HT: d = o->ptr; break; + case OBJ_ENCODING_HASHTABLE: ht = o->ptr; break; } - if (ht == NULL) { - addReplyError(c, "The value stored at the specified key is not " - "represented using an hash table"); - } else { + if (d != NULL) { char buf[4096]; - dictGetStats(buf, sizeof(buf), ht, full); + dictGetStats(buf, sizeof(buf), d, full); addReplyVerbatim(c, buf, strlen(buf), "txt"); + } else if (ht != NULL) { + char buf[4096]; + hashtableGetStats(buf, sizeof(buf), ht, full); + addReplyVerbatim(c, buf, strlen(buf), "txt"); + } else { + addReplyError(c, "The value stored at the specified key is not " + "represented using an hash table"); } } else if (!strcasecmp(c->argv[1]->ptr, "change-repl-id") && c->argc == 2) { serverLog(LL_NOTICE, "Changing replication IDs after receiving DEBUG change-repl-id"); diff --git a/src/defrag.c b/src/defrag.c index 8c1ad29de2..8e7fc8449e 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -34,6 +34,7 @@ */ #include "server.h" +#include "hashtable.h" #include "script.h" #include @@ -379,6 +380,20 @@ static void activeDefragSdsDict(dict *d, int val_type) { } while (cursor != 0); } +void activeDefragSdsHashtableCallback(void *privdata, void *entry_ref) { + UNUSED(privdata); + sds *sds_ref = (sds *)entry_ref; + sds new_sds = activeDefragSds(*sds_ref); + if (new_sds != NULL) *sds_ref = new_sds; +} + +void activeDefragSdsHashtable(hashtable *ht) { + unsigned long cursor = 0; + do { + cursor = hashtableScanDefrag(ht, cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); + } while (cursor != 0); +} + /* Defrag a list of ptr, sds or robj string values */ static void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { quicklistNode *newnode, *node = *node_ref; @@ -497,11 +512,9 @@ static void scanCallbackCountScanned(void *privdata, const dictEntry *de) { } static void scanLaterSet(robj *ob, unsigned long *cursor) { - if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HT) return; - dict *d = ob->ptr; - dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, - .defragKey = (dictDefragAllocFunction *)activeDefragSds}; - *cursor = dictScanDefrag(d, 
*cursor, scanCallbackCountScanned, &defragfns, NULL); + if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HASHTABLE) return; + hashtable *ht = ob->ptr; + *cursor = hashtableScanDefrag(ht, *cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); } static void scanLaterHash(robj *ob, unsigned long *cursor) { @@ -560,15 +573,16 @@ static void defragHash(robj *ob) { } static void defragSet(robj *ob) { - dict *d, *newd; - serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HT); - d = ob->ptr; - if (dictSize(d) > server.active_defrag_max_scan_fields) + serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HASHTABLE); + hashtable *ht = ob->ptr; + if (hashtableSize(ht) > server.active_defrag_max_scan_fields) { defragLater(ob); - else - activeDefragSdsDict(d, DEFRAG_SDS_DICT_NO_VAL); - /* defrag the dict struct and tables */ - if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd; + } else { + activeDefragSdsHashtable(ht); + } + /* defrag the hashtable struct and tables */ + hashtable *newHashtable = hashtableDefragTables(ht, activeDefragAlloc); + if (newHashtable) ob->ptr = newHashtable; } /* Defrag callback for radix tree iterator, called for each node, @@ -766,7 +780,7 @@ static void defragKey(defragKeysCtx *ctx, robj **elemref) { serverPanic("Unknown list encoding"); } } else if (ob->type == OBJ_SET) { - if (ob->encoding == OBJ_ENCODING_HT) { + if (ob->encoding == OBJ_ENCODING_HASHTABLE) { defragSet(ob); } else if (ob->encoding == OBJ_ENCODING_INTSET || ob->encoding == OBJ_ENCODING_LISTPACK) { void *newptr, *ptr = ob->ptr; diff --git a/src/hashtable.c b/src/hashtable.c index 9d963b9ddc..11ba360800 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -1023,7 +1023,7 @@ void *hashtableMetadata(hashtable *ht) { } /* Returns the number of entries stored. */ -size_t hashtableSize(hashtable *ht) { +size_t hashtableSize(const hashtable *ht) { return ht->used[0] + ht->used[1]; } @@ -1180,6 +1180,14 @@ hashtable *hashtableDefragTables(hashtable *ht, void *(*defragfn)(void *)) { return ht1; } +/* Used for releasing memory to OS to avoid unnecessary CoW. Called when we've + * forked and memory won't be used again. See zmadvise_dontneed() */ +void dismissHashtable(hashtable *ht) { + for (int i = 0; i < 2; i++) { + zmadvise_dontneed(ht->tables[i], numBuckets(ht->bucket_exp[i]) * sizeof(bucket *)); + } +} + /* Returns 1 if an entry was found matching the key. Also points *found to it, * if found is provided. Returns 0 if no matching entry was found. 
*/ int hashtableFind(hashtable *ht, const void *key, void **found) { diff --git a/src/hashtable.h b/src/hashtable.h index 242531df8f..4291cf5a5d 100644 --- a/src/hashtable.h +++ b/src/hashtable.h @@ -108,7 +108,7 @@ void hashtableRelease(hashtable *ht); void hashtableEmpty(hashtable *ht, void(callback)(hashtable *)); hashtableType *hashtableGetType(hashtable *ht); void *hashtableMetadata(hashtable *ht); -size_t hashtableSize(hashtable *ht); +size_t hashtableSize(const hashtable *ht); size_t hashtableBuckets(hashtable *ht); size_t hashtableChainedBuckets(hashtable *ht, int table); size_t hashtableMemUsage(hashtable *ht); @@ -123,6 +123,7 @@ int hashtableTryExpand(hashtable *ht, size_t size); int hashtableExpandIfNeeded(hashtable *ht); int hashtableShrinkIfNeeded(hashtable *ht); hashtable *hashtableDefragTables(hashtable *ht, void *(*defragfn)(void *)); +void dismissHashtable(hashtable *ht); /* Entries */ int hashtableFind(hashtable *ht, const void *key, void **found); diff --git a/src/lazyfree.c b/src/lazyfree.c index 14a4454d7a..4b4c7f06ad 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -116,9 +116,9 @@ size_t lazyfreeGetFreeEffort(robj *key, robj *obj, int dbid) { if (obj->type == OBJ_LIST && obj->encoding == OBJ_ENCODING_QUICKLIST) { quicklist *ql = obj->ptr; return ql->len; - } else if (obj->type == OBJ_SET && obj->encoding == OBJ_ENCODING_HT) { - dict *ht = obj->ptr; - return dictSize(ht); + } else if (obj->type == OBJ_SET && obj->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = obj->ptr; + return hashtableSize(ht); } else if (obj->type == OBJ_ZSET && obj->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = obj->ptr; return zs->zsl->length; diff --git a/src/module.c b/src/module.c index 9bcf68646e..36283e2c73 100644 --- a/src/module.c +++ b/src/module.c @@ -11017,20 +11017,20 @@ typedef struct { ValkeyModuleScanKeyCB fn; } ScanKeyCBData; -static void moduleScanKeyCallback(void *privdata, const dictEntry *de) { +static void moduleScanKeyDictCallback(void *privdata, const dictEntry *de) { ScanKeyCBData *data = privdata; sds key = dictGetKey(de); robj *o = data->key->value; robj *field = createStringObject(key, sdslen(key)); robj *value = NULL; - if (o->type == OBJ_SET) { - value = NULL; - } else if (o->type == OBJ_HASH) { + if (o->type == OBJ_HASH) { sds val = dictGetVal(de); value = createStringObject(val, sdslen(val)); } else if (o->type == OBJ_ZSET) { double *val = (double *)dictGetVal(de); value = createStringObjectFromLongDouble(*val, 0); + } else { + serverPanic("unexpected object type"); } data->fn(data->key, field, value, data->user_data); @@ -11038,6 +11038,17 @@ static void moduleScanKeyCallback(void *privdata, const dictEntry *de) { if (value) decrRefCount(value); } +static void moduleScanKeyHashtableCallback(void *privdata, void *entry) { + ScanKeyCBData *data = privdata; + robj *o = data->key->value; + serverAssert(o->type == OBJ_SET); + sds key = entry; + robj *field = createStringObject(key, sdslen(key)); + + data->fn(data->key, field, NULL, data->user_data); + decrRefCount(field); +} + /* Scan api that allows a module to scan the elements in a hash, set or sorted set key * * Callback for scan implementation. 
@@ -11091,14 +11102,15 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul errno = EINVAL; return 0; } - dict *ht = NULL; + dict *d = NULL; + hashtable *ht = NULL; robj *o = key->value; if (o->type == OBJ_SET) { - if (o->encoding == OBJ_ENCODING_HT) ht = o->ptr; + if (o->encoding == OBJ_ENCODING_HASHTABLE) ht = o->ptr; } else if (o->type == OBJ_HASH) { - if (o->encoding == OBJ_ENCODING_HT) ht = o->ptr; + if (o->encoding == OBJ_ENCODING_HT) d = o->ptr; } else if (o->type == OBJ_ZSET) { - if (o->encoding == OBJ_ENCODING_SKIPLIST) ht = ((zset *)o->ptr)->dict; + if (o->encoding == OBJ_ENCODING_SKIPLIST) d = ((zset *)o->ptr)->dict; } else { errno = EINVAL; return 0; @@ -11108,9 +11120,16 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul return 0; } int ret = 1; - if (ht) { + if (d) { + ScanKeyCBData data = {key, privdata, fn}; + cursor->cursor = dictScan(d, cursor->cursor, moduleScanKeyDictCallback, &data); + if (cursor->cursor == 0) { + cursor->done = 1; + ret = 0; + } + } else if (ht) { ScanKeyCBData data = {key, privdata, fn}; - cursor->cursor = dictScan(ht, cursor->cursor, moduleScanKeyCallback, &data); + cursor->cursor = hashtableScan(ht, cursor->cursor, moduleScanKeyHashtableCallback, &data); if (cursor->cursor == 0) { cursor->done = 1; ret = 0; diff --git a/src/object.c b/src/object.c index ac1c26adf9..15363f31b8 100644 --- a/src/object.c +++ b/src/object.c @@ -429,9 +429,9 @@ robj *createListListpackObject(void) { } robj *createSetObject(void) { - dict *d = dictCreate(&setDictType); - robj *o = createObject(OBJ_SET, d); - o->encoding = OBJ_ENCODING_HT; + hashtable *ht = hashtableCreate(&setHashtableType); + robj *o = createObject(OBJ_SET, ht); + o->encoding = OBJ_ENCODING_HASHTABLE; return o; } @@ -506,7 +506,7 @@ void freeListObject(robj *o) { void freeSetObject(robj *o) { switch (o->encoding) { - case OBJ_ENCODING_HT: dictRelease((dict *)o->ptr); break; + case OBJ_ENCODING_HASHTABLE: hashtableRelease((hashtable *)o->ptr); break; case OBJ_ENCODING_INTSET: case OBJ_ENCODING_LISTPACK: zfree(o->ptr); break; default: serverPanic("Unknown set encoding type"); @@ -622,23 +622,23 @@ void dismissListObject(robj *o, size_t size_hint) { /* See dismissObject() */ void dismissSetObject(robj *o, size_t size_hint) { - if (o->encoding == OBJ_ENCODING_HT) { - dict *set = o->ptr; - serverAssert(dictSize(set) != 0); + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = o->ptr; + serverAssert(hashtableSize(ht) != 0); /* We iterate all nodes only when average member size is bigger than a * page size, and there's a high chance we'll actually dismiss something. */ - if (size_hint / dictSize(set) >= server.page_size) { - dictEntry *de; - dictIterator *di = dictGetIterator(set); - while ((de = dictNext(di)) != NULL) { - dismissSds(dictGetKey(de)); + if (size_hint / hashtableSize(ht) >= server.page_size) { + hashtableIterator iter; + hashtableInitIterator(&iter, ht); + void *next; + while (hashtableNext(&iter, &next)) { + sds item = next; + dismissSds(item); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } - /* Dismiss hash table memory. 
*/ - dismissMemory(set->ht_table[0], DICTHT_SIZE(set->ht_size_exp[0]) * sizeof(dictEntry *)); - dismissMemory(set->ht_table[1], DICTHT_SIZE(set->ht_size_exp[1]) * sizeof(dictEntry *)); + dismissHashtable(ht); } else if (o->encoding == OBJ_ENCODING_INTSET) { dismissMemory(o->ptr, intsetBlobLen((intset *)o->ptr)); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { @@ -728,7 +728,7 @@ void dismissStreamObject(robj *o, size_t size_hint) { * modifies any keys due to write traffic, it'll cause CoW which consume * physical memory. In the child process, after serializing the key and value, * the data is definitely not accessed again, so to avoid unnecessary CoW, we - * try to release their memory back to OS. see dismissMemory(). + * try to release their memory back to OS. see zmadvise_dontneed(). * * Because of the cost of iterating all node/field/member/entry of complex data * types, we iterate and dismiss them only when approximate average we estimate @@ -1109,6 +1109,7 @@ char *strEncoding(int encoding) { case OBJ_ENCODING_RAW: return "raw"; case OBJ_ENCODING_INT: return "int"; case OBJ_ENCODING_HT: return "hashtable"; + case OBJ_ENCODING_HASHTABLE: return "hashtable"; case OBJ_ENCODING_QUICKLIST: return "quicklist"; case OBJ_ENCODING_LISTPACK: return "listpack"; case OBJ_ENCODING_INTSET: return "intset"; @@ -1160,17 +1161,20 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { serverPanic("Unknown list encoding"); } } else if (o->type == OBJ_SET) { - if (o->encoding == OBJ_ENCODING_HT) { - d = o->ptr; - di = dictGetIterator(d); - asize = sizeof(*o) + sizeof(dict) + (sizeof(struct dictEntry *) * dictBuckets(d)); - while ((de = dictNext(di)) != NULL && samples < sample_size) { - ele = dictGetKey(de); - elesize += dictEntryMemUsage(de) + sdsAllocSize(ele); + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = o->ptr; + asize = sizeof(*o) + hashtableMemUsage(ht); + + hashtableIterator iter; + hashtableInitIterator(&iter, ht); + void *next; + while (hashtableNext(&iter, &next) && samples < sample_size) { + sds element = next; + elesize += sdsAllocSize(element); samples++; } - dictReleaseIterator(di); - if (samples) asize += (double)elesize / samples * dictSize(d); + hashtableResetIterator(&iter); + if (samples) asize += (double)elesize / samples * hashtableSize(ht); } else if (o->encoding == OBJ_ENCODING_INTSET) { asize = sizeof(*o) + zmalloc_size(o->ptr); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { diff --git a/src/rdb.c b/src/rdb.c index 6e990736bc..5fb77a2897 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -692,7 +692,7 @@ int rdbSaveObjectType(rio *rdb, robj *o) { case OBJ_SET: if (o->encoding == OBJ_ENCODING_INTSET) return rdbSaveType(rdb, RDB_TYPE_SET_INTSET); - else if (o->encoding == OBJ_ENCODING_HT) + else if (o->encoding == OBJ_ENCODING_HASHTABLE) return rdbSaveType(rdb, RDB_TYPE_SET); else if (o->encoding == OBJ_ENCODING_LISTPACK) return rdbSaveType(rdb, RDB_TYPE_SET_LISTPACK); @@ -876,26 +876,26 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { } } else if (o->type == OBJ_SET) { /* Save a set value */ - if (o->encoding == OBJ_ENCODING_HT) { - dict *set = o->ptr; - dictIterator *di = dictGetIterator(set); - dictEntry *de; + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *set = o->ptr; - if ((n = rdbSaveLen(rdb, dictSize(set))) == -1) { - dictReleaseIterator(di); + if ((n = rdbSaveLen(rdb, hashtableSize(set))) == -1) { return -1; } nwritten += n; - while ((de = dictNext(di)) != NULL) { - sds ele = dictGetKey(de); + 
hashtableIterator iterator; + hashtableInitIterator(&iterator, set); + void *next; + while (hashtableNext(&iterator, &next)) { + sds ele = next; if ((n = rdbSaveRawString(rdb, (unsigned char *)ele, sdslen(ele))) == -1) { - dictReleaseIterator(di); + hashtableResetIterator(&iterator); return -1; } nwritten += n; } - dictReleaseIterator(di); + hashtableResetIterator(&iterator); } else if (o->encoding == OBJ_ENCODING_INTSET) { size_t l = intsetBlobLen((intset *)o->ptr); @@ -1909,8 +1909,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { o = createSetObject(); /* It's faster to expand the dict to the right size asap in order * to avoid rehashing */ - if (len > DICT_HT_INITIAL_SIZE && dictTryExpand(o->ptr, len) != DICT_OK) { - rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); + if (!hashtableTryExpand(o->ptr, len)) { + rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len); decrRefCount(o); return NULL; } @@ -1949,8 +1949,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { * of many small ones. It's OK since lpSafeToAdd doesn't * care about individual elements, only the total size. */ setTypeConvert(o, OBJ_ENCODING_LISTPACK); - } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HT, len, 0) != C_OK) { - rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); + } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HASHTABLE, len, 0) != C_OK) { + rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len); sdsfree(sdsele); decrRefCount(o); return NULL; @@ -1970,8 +1970,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { return NULL; } o->ptr = lpAppend(o->ptr, (unsigned char *)sdsele, elelen); - } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HT, len, 0) != C_OK) { - rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); + } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HASHTABLE, len, 0) != C_OK) { + rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len); sdsfree(sdsele); decrRefCount(o); return NULL; @@ -1980,8 +1980,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { /* This will also be called when the set was just converted * to a regular hash table encoded set. 
*/ - if (o->encoding == OBJ_ENCODING_HT) { - if (dictAdd((dict *)o->ptr, sdsele, NULL) != DICT_OK) { + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + if (!hashtableAdd((hashtable *)o->ptr, sdsele)) { rdbReportCorruptRDB("Duplicate set members detected"); decrRefCount(o); sdsfree(sdsele); @@ -2356,7 +2356,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } o->type = OBJ_SET; o->encoding = OBJ_ENCODING_INTSET; - if (intsetLen(o->ptr) > server.set_max_intset_entries) setTypeConvert(o, OBJ_ENCODING_HT); + if (intsetLen(o->ptr) > server.set_max_intset_entries) setTypeConvert(o, OBJ_ENCODING_HASHTABLE); break; case RDB_TYPE_SET_LISTPACK: if (deep_integrity_validation) server.stat_dump_payload_sanitizations++; @@ -2376,7 +2376,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { decrRefCount(o); goto emptykey; } - if (setTypeSize(o) > server.set_max_listpack_entries) setTypeConvert(o, OBJ_ENCODING_HT); + if (setTypeSize(o) > server.set_max_listpack_entries) setTypeConvert(o, OBJ_ENCODING_HASHTABLE); break; case RDB_TYPE_ZSET_ZIPLIST: { unsigned char *lp = lpNew(encoded_len); diff --git a/src/server.c b/src/server.c index e495730fe2..da06884eb1 100644 --- a/src/server.c +++ b/src/server.c @@ -372,6 +372,7 @@ void dictDictDestructor(void *val) { dictRelease((dict *)val); } +/* Returns 1 when keys match */ int dictSdsKeyCompare(const void *key1, const void *key2) { int l1, l2; l1 = sdslen((sds)key1); @@ -380,6 +381,12 @@ int dictSdsKeyCompare(const void *key1, const void *key2) { return memcmp(key1, key2, l1) == 0; } +/* Returns 0 when keys match */ +int hashtableSdsKeyCompare(const void *key1, const void *key2) { + const sds sds1 = (const sds)key1, sds2 = (const sds)key2; + return sdslen(sds1) != sdslen(sds2) || sdscmp(sds1, sds2); +} + size_t dictSdsEmbedKey(unsigned char *buf, size_t buf_len, const void *key, uint8_t *key_offset) { return sdscopytobuffer(buf, buf_len, (sds)key, key_offset); } @@ -542,17 +549,11 @@ dictType objectKeyHeapPointerValueDictType = { NULL /* allow to expand */ }; -/* Set dictionary type. Keys are SDS strings, values are not used. */ -dictType setDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - NULL, /* val destructor */ - NULL, /* allow to expand */ - .no_value = 1, /* no values in this dict */ - .keys_are_odd = 1 /* an SDS string is always an odd pointer */ -}; +/* Set hashtable type. 
Items are SDS strings */ +hashtableType setHashtableType = { + .hashFunction = dictSdsHash, + .keyCompare = hashtableSdsKeyCompare, + .entryDestructor = dictSdsDestructor}; /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */ dictType zsetDictType = { @@ -572,11 +573,6 @@ const void *hashtableObjectGetKey(const void *entry) { return objectGetKey(entry); } -int hashtableSdsKeyCompare(const void *key1, const void *key2) { - const sds sds1 = (const sds)key1, sds2 = (const sds)key2; - return sdslen(sds1) != sdslen(sds2) || sdscmp(sds1, sds2); -} - int hashtableObjKeyCompare(const void *key1, const void *key2) { const robj *o1 = key1, *o2 = key2; return hashtableSdsKeyCompare(o1->ptr, o2->ptr); @@ -645,6 +641,11 @@ dictType sdsReplyDictType = { NULL /* allow to expand */ }; +/* Hashtable type without destructor */ +hashtableType sdsReplyHashtableType = { + .hashFunction = dictSdsCaseHash, + .keyCompare = hashtableSdsKeyCompare}; + /* Keylist hash table type has unencoded Objects as keys and * lists as values. It's used for blocking operations (BLPOP) and to * map swapped keys to a list of clients waiting for this keys to be loaded. */ @@ -6528,27 +6529,7 @@ void sendChildInfo(childInfoType info_type, size_t keys, char *pname) { sendChildInfoGeneric(info_type, keys, -1, pname); } -/* Try to release pages back to the OS directly (bypassing the allocator), - * in an effort to decrease CoW during fork. For small allocations, we can't - * release any full page, so in an effort to avoid getting the size of the - * allocation from the allocator (malloc_size) when we already know it's small, - * we check the size_hint. If the size is not already known, passing a size_hint - * of 0 will lead the checking the real size of the allocation. - * Also please note that the size may be not accurate, so in order to make this - * solution effective, the judgement for releasing memory pages should not be - * too strict. */ -void dismissMemory(void *ptr, size_t size_hint) { - if (ptr == NULL) return; - - /* madvise(MADV_DONTNEED) can not release pages if the size of memory - * is too small, we try to release only for the memory which the size - * is more than half of page size. */ - if (size_hint && size_hint <= server.page_size / 2) return; - - zmadvise_dontneed(ptr); -} - -/* Dismiss big chunks of memory inside a client structure, see dismissMemory() */ +/* Dismiss big chunks of memory inside a client structure, see zmadvise_dontneed() */ void dismissClientMemory(client *c) { /* Dismiss client query buffer and static reply buffer. */ dismissMemory(c->buf, c->buf_usable_size); @@ -6579,7 +6560,7 @@ void dismissClientMemory(client *c) { /* In the child process, we don't need some buffers anymore, and these are * likely to change in the parent when there's heavy write traffic. * We dismiss them right away, to avoid CoW. - * see dismissMemory(). */ + * see zmadvise_dontneed(). */ void dismissMemoryInChild(void) { /* madvise(MADV_DONTNEED) may not work if Transparent Huge Pages is enabled. */ if (server.thp_enabled) return; diff --git a/src/server.h b/src/server.h index 88afb57c81..b07144de92 100644 --- a/src/server.h +++ b/src/server.h @@ -83,6 +83,8 @@ typedef long long ustime_t; /* microsecond time type. */ #include "connection.h" /* Connection abstraction */ #include "memory_prefetch.h" +#define dismissMemory zmadvise_dontneed + #define VALKEYMODULE_CORE 1 typedef struct serverObject robj; #include "valkeymodule.h" /* Modules API defines. 
*/ @@ -873,6 +875,7 @@ struct ValkeyModuleDigest { #define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of listpacks */ #define OBJ_ENCODING_STREAM 10 /* Encoded as a radix tree of listpacks */ #define OBJ_ENCODING_LISTPACK 11 /* Encoded as a listpack */ +#define OBJ_ENCODING_HASHTABLE 12 /* Encoded as a hashtable */ #define LRU_BITS 24 #define LRU_CLOCK_MAX ((1 << LRU_BITS) - 1) /* Max value of obj->lru */ @@ -2635,7 +2638,7 @@ typedef struct { robj *subject; int encoding; int ii; /* intset iterator */ - dictIterator *di; + hashtableIterator *hashtable_iterator; unsigned char *lpi; /* listpack iterator */ } setTypeIterator; @@ -2666,7 +2669,7 @@ extern struct valkeyServer server; extern struct sharedObjectsStruct shared; extern dictType objectKeyPointerValueDictType; extern dictType objectKeyHeapPointerValueDictType; -extern dictType setDictType; +extern hashtableType setHashtableType; extern dictType BenchmarkDictType; extern dictType zsetDictType; extern hashtableType kvstoreKeysHashtableType; @@ -2681,6 +2684,7 @@ extern dictType objToDictDictType; extern hashtableType kvstoreChannelHashtableType; extern dictType modulesDictType; extern dictType sdsReplyDictType; +extern hashtableType sdsReplyHashtableType; extern dictType keylistDictType; extern dict *modules; @@ -3375,7 +3379,6 @@ void rejectCommandFormat(client *c, const char *fmt, ...); void *activeDefragAlloc(void *ptr); robj *activeDefragStringOb(robj *ob); void dismissSds(sds s); -void dismissMemory(void *ptr, size_t size_hint); void dismissMemoryInChild(void); #define RESTART_SERVER_NONE 0 diff --git a/src/t_set.c b/src/t_set.c index 997fa2f5c9..4279baf82f 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -28,6 +28,7 @@ */ #include "server.h" +#include "hashtable.h" #include "intset.h" /* Compact integer set structure */ /*----------------------------------------------------------------------------- @@ -50,7 +51,7 @@ robj *setTypeCreate(sds value, size_t size_hint) { /* We may oversize the set by using the hint if the hint is not accurate, * but we will assume this is acceptable to maximize performance. */ robj *o = createSetObject(); - dictExpand(o->ptr, size_hint); + hashtableExpand(o->ptr, size_hint); return o; } @@ -59,7 +60,7 @@ robj *setTypeCreate(sds value, size_t size_hint) { void setTypeMaybeConvert(robj *set, size_t size_hint) { if ((set->encoding == OBJ_ENCODING_LISTPACK && size_hint > server.set_max_listpack_entries) || (set->encoding == OBJ_ENCODING_INTSET && size_hint > server.set_max_intset_entries)) { - setTypeConvertAndExpand(set, OBJ_ENCODING_HT, size_hint, 1); + setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, size_hint, 1); } } @@ -74,7 +75,7 @@ static size_t intsetMaxEntries(void) { /* Converts intset to HT if it contains too many entries. */ static void maybeConvertIntset(robj *subject) { serverAssert(subject->encoding == OBJ_ENCODING_INTSET); - if (intsetLen(subject->ptr) > intsetMaxEntries()) setTypeConvert(subject, OBJ_ENCODING_HT); + if (intsetLen(subject->ptr) > intsetMaxEntries()) setTypeConvert(subject, OBJ_ENCODING_HASHTABLE); } /* When you know all set elements are integers, call this to convert the set to @@ -91,7 +92,7 @@ static void maybeConvertToIntset(robj *set) { while (setTypeNext(si, &str, &len, &llval) != -1) { if (str) { /* If the element is returned as a string, we may be able to convert - * it to integer. This happens for OBJ_ENCODING_HT. */ + * it to integer. This happens for OBJ_ENCODING_HASHTABLE. 
*/ serverAssert(string2ll(str, len, (long long *)&llval)); } uint8_t success = 0; @@ -134,20 +135,21 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd } serverAssert(str); - if (set->encoding == OBJ_ENCODING_HT) { + if (set->encoding == OBJ_ENCODING_HASHTABLE) { /* Avoid duping the string if it is an sds string. */ sds sdsval = str_is_sds ? (sds)str : sdsnewlen(str, len); - dict *ht = set->ptr; - void *position = dictFindPositionForInsert(ht, sdsval, NULL); - if (position) { + hashtable *ht = set->ptr; + hashtablePosition position; + if (hashtableFindPositionForInsert(ht, sdsval, &position, NULL)) { /* Key doesn't already exist in the set. Add it but dup the key. */ if (sdsval == str) sdsval = sdsdup(sdsval); - dictInsertAtPosition(ht, sdsval, position); + hashtableInsertAtPosition(ht, sdsval, &position); + return 1; } else if (sdsval != str) { /* String is already a member. Free our temporary sds copy. */ sdsfree(sdsval); + return 0; } - return (position != NULL); } else if (set->encoding == OBJ_ENCODING_LISTPACK) { unsigned char *lp = set->ptr; unsigned char *p = lpFirst(lp); @@ -166,8 +168,8 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd set->ptr = lp; } else { /* Size limit is reached. Convert to hashtable and add. */ - setTypeConvertAndExpand(set, OBJ_ENCODING_HT, lpLength(lp) + 1, 1); - serverAssert(dictAdd(set->ptr, sdsnewlen(str, len), NULL) == DICT_OK); + setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, lpLength(lp) + 1, 1); + serverAssert(hashtableAdd(set->ptr, sdsnewlen(str, len))); } return 1; } @@ -204,10 +206,10 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd set->ptr = lp; return 1; } else { - setTypeConvertAndExpand(set, OBJ_ENCODING_HT, intsetLen(set->ptr) + 1, 1); + setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, intsetLen(set->ptr) + 1, 1); /* The set *was* an intset and this value is not integer - * encodable, so dictAdd should always work. */ - serverAssert(dictAdd(set->ptr, sdsnewlen(str, len), NULL) == DICT_OK); + * encodable, so hashtableAdd should always work. */ + serverAssert(hashtableAdd(set->ptr, sdsnewlen(str, len))); return 1; } } @@ -242,9 +244,9 @@ int setTypeRemoveAux(robj *setobj, char *str, size_t len, int64_t llval, int str str_is_sds = 0; } - if (setobj->encoding == OBJ_ENCODING_HT) { + if (setobj->encoding == OBJ_ENCODING_HASHTABLE) { sds sdsval = str_is_sds ? 
(sds)str : sdsnewlen(str, len); - int deleted = (dictDelete(setobj->ptr, sdsval) == DICT_OK); + int deleted = hashtableDelete(setobj->ptr, sdsval); if (sdsval != str) sdsfree(sdsval); /* free temp copy */ return deleted; } else if (setobj->encoding == OBJ_ENCODING_LISTPACK) { @@ -298,11 +300,11 @@ int setTypeIsMemberAux(robj *set, char *str, size_t len, int64_t llval, int str_ } else if (set->encoding == OBJ_ENCODING_INTSET) { long long llval; return string2ll(str, len, &llval) && intsetFind(set->ptr, llval); - } else if (set->encoding == OBJ_ENCODING_HT && str_is_sds) { - return dictFind(set->ptr, (sds)str) != NULL; - } else if (set->encoding == OBJ_ENCODING_HT) { + } else if (set->encoding == OBJ_ENCODING_HASHTABLE && str_is_sds) { + return hashtableFind(set->ptr, (sds)str, NULL); + } else if (set->encoding == OBJ_ENCODING_HASHTABLE) { sds sdsval = sdsnewlen(str, len); - int result = dictFind(set->ptr, sdsval) != NULL; + int result = hashtableFind(set->ptr, sdsval, NULL); sdsfree(sdsval); return result; } else { @@ -314,8 +316,8 @@ setTypeIterator *setTypeInitIterator(robj *subject) { setTypeIterator *si = zmalloc(sizeof(setTypeIterator)); si->subject = subject; si->encoding = subject->encoding; - if (si->encoding == OBJ_ENCODING_HT) { - si->di = dictGetIterator(subject->ptr); + if (si->encoding == OBJ_ENCODING_HASHTABLE) { + si->hashtable_iterator = hashtableCreateIterator(subject->ptr); } else if (si->encoding == OBJ_ENCODING_INTSET) { si->ii = 0; } else if (si->encoding == OBJ_ENCODING_LISTPACK) { @@ -327,7 +329,7 @@ setTypeIterator *setTypeInitIterator(robj *subject) { } void setTypeReleaseIterator(setTypeIterator *si) { - if (si->encoding == OBJ_ENCODING_HT) dictReleaseIterator(si->di); + if (si->encoding == OBJ_ENCODING_HASHTABLE) hashtableReleaseIterator(si->hashtable_iterator); zfree(si); } @@ -340,7 +342,7 @@ void setTypeReleaseIterator(setTypeIterator *si) { * (str and len) or (llele) depending on whether the value is stored as a string * or as an integer internally. * - * If OBJ_ENCODING_HT is returned, then str points to an sds string and can be + * If OBJ_ENCODING_HASHTABLE is returned, then str points to an sds string and can be * used as such. If OBJ_ENCODING_INTSET, then llele is populated and str is * pointed to NULL. If OBJ_ENCODING_LISTPACK is returned, the value can be * either a string or an integer. If *str is not NULL, then str and len are @@ -353,10 +355,10 @@ void setTypeReleaseIterator(setTypeIterator *si) { * * When there are no more elements -1 is returned. */ int setTypeNext(setTypeIterator *si, char **str, size_t *len, int64_t *llele) { - if (si->encoding == OBJ_ENCODING_HT) { - dictEntry *de = dictNext(si->di); - if (de == NULL) return -1; - *str = dictGetKey(de); + if (si->encoding == OBJ_ENCODING_HASHTABLE) { + void *next; + if (!hashtableNext(si->hashtable_iterator, &next)) return -1; + *str = next; *len = sdslen(*str); *llele = -123456789; /* Not needed. Defensive. */ } else if (si->encoding == OBJ_ENCODING_INTSET) { @@ -406,15 +408,16 @@ sds setTypeNextObject(setTypeIterator *si) { * object. The return value of the function is the object->encoding * field of the object and can be used by the caller to check if the * int64_t pointer or the str and len pointers were populated, as for - * setTypeNext. If OBJ_ENCODING_HT is returned, str is pointed to a + * setTypeNext. If OBJ_ENCODING_HASHTABLE is returned, str is pointed to a * string which is actually an sds string and it can be used as such. 
* * Note that both the str, len and llele pointers should be passed and cannot * be NULL. If str is set to NULL, the value is an integer stored in llele. */ int setTypeRandomElement(robj *setobj, char **str, size_t *len, int64_t *llele) { - if (setobj->encoding == OBJ_ENCODING_HT) { - dictEntry *de = dictGetFairRandomKey(setobj->ptr); - *str = dictGetKey(de); + if (setobj->encoding == OBJ_ENCODING_HASHTABLE) { + void *entry = NULL; + hashtableFairRandomEntry(setobj->ptr, &entry); + *str = entry; *len = sdslen(*str); *llele = -123456789; /* Not needed. Defensive. */ } else if (setobj->encoding == OBJ_ENCODING_INTSET) { @@ -457,14 +460,14 @@ robj *setTypePopRandom(robj *set) { obj = createStringObject(str, len); else obj = createStringObjectFromLongLong(llele); - setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HT); + setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE); } return obj; } unsigned long setTypeSize(const robj *subject) { - if (subject->encoding == OBJ_ENCODING_HT) { - return dictSize((const dict *)subject->ptr); + if (subject->encoding == OBJ_ENCODING_HASHTABLE) { + return hashtableSize((const hashtable *)subject->ptr); } else if (subject->encoding == OBJ_ENCODING_INTSET) { return intsetLen((const intset *)subject->ptr); } else if (subject->encoding == OBJ_ENCODING_LISTPACK) { @@ -474,7 +477,7 @@ unsigned long setTypeSize(const robj *subject) { } } -/* Convert the set to specified encoding. The resulting dict (when converting +/* Convert the set to specified encoding. The resulting hashtable (when converting * to a hash table) is presized to hold the number of elements in the original * set. */ void setTypeConvert(robj *setobj, int enc) { @@ -489,28 +492,28 @@ int setTypeConvertAndExpand(robj *setobj, int enc, unsigned long cap, int panic) setTypeIterator *si; serverAssertWithInfo(NULL, setobj, setobj->type == OBJ_SET && setobj->encoding != enc); - if (enc == OBJ_ENCODING_HT) { - dict *d = dictCreate(&setDictType); + if (enc == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = hashtableCreate(&setHashtableType); sds element; - /* Presize the dict to avoid rehashing */ + /* Presize the hashtable to avoid rehashing */ if (panic) { - dictExpand(d, cap); - } else if (dictTryExpand(d, cap) != DICT_OK) { - dictRelease(d); + hashtableExpand(ht, cap); + } else if (!hashtableTryExpand(ht, cap)) { + hashtableRelease(ht); return C_ERR; } /* To add the elements we extract integers and create Objects */ si = setTypeInitIterator(setobj); while ((element = setTypeNextObject(si)) != NULL) { - serverAssert(dictAdd(d, element, NULL) == DICT_OK); + serverAssert(hashtableAdd(ht, element)); } setTypeReleaseIterator(si); freeSetObject(setobj); /* frees the internals but not setobj itself */ - setobj->encoding = OBJ_ENCODING_HT; - setobj->ptr = d; + setobj->encoding = OBJ_ENCODING_HASHTABLE; + setobj->ptr = ht; } else if (enc == OBJ_ENCODING_LISTPACK) { /* Preallocate the minimum two bytes per element (enc/value + backlen) */ size_t estcap = cap * 2; @@ -568,10 +571,10 @@ robj *setTypeDup(robj *o) { memcpy(new_lp, lp, sz); set = createObject(OBJ_SET, new_lp); set->encoding = OBJ_ENCODING_LISTPACK; - } else if (o->encoding == OBJ_ENCODING_HT) { + } else if (o->encoding == OBJ_ENCODING_HASHTABLE) { set = createSetObject(); - dict *d = o->ptr; - dictExpand(set->ptr, dictSize(d)); + hashtable *ht = o->ptr; + hashtableExpand(set->ptr, hashtableSize(ht)); si = setTypeInitIterator(o); char *str; size_t len; @@ -891,8 +894,8 @@ void spopWithCountCommand(client *c) { if 
(!newset) { newset = str ? createSetListpackObject() : createIntsetObject(); } - setTypeAddAux(newset, str, len, llele, encoding == OBJ_ENCODING_HT); - setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HT); + setTypeAddAux(newset, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE); + setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE); } } @@ -1001,8 +1004,6 @@ void srandmemberWithCountCommand(client *c) { size_t len; int64_t llele; - dict *d; - if (getRangeLongFromObjectOrReply(c, c->argv[2], -LONG_MAX, LONG_MAX, &l, NULL) != C_OK) return; if (l >= 0) { count = (unsigned long)l; @@ -1111,8 +1112,8 @@ void srandmemberWithCountCommand(client *c) { return; } - /* For CASE 3 and CASE 4 we need an auxiliary dictionary. */ - d = dictCreate(&sdsReplyDictType); + /* For CASE 3 and CASE 4 we need an auxiliary hashtable. */ + hashtable *ht = hashtableCreate(&sdsReplyHashtableType); /* CASE 3: * The number of elements inside the set is not greater than @@ -1126,29 +1127,25 @@ void srandmemberWithCountCommand(client *c) { if (count * SRANDMEMBER_SUB_STRATEGY_MUL > size) { setTypeIterator *si; - /* Add all the elements into the temporary dictionary. */ + /* Add all the elements into the temporary hashtable. */ si = setTypeInitIterator(set); - dictExpand(d, size); + hashtableExpand(ht, size); while (setTypeNext(si, &str, &len, &llele) != -1) { - int retval = DICT_ERR; - if (str == NULL) { - retval = dictAdd(d, sdsfromlonglong(llele), NULL); + serverAssert(hashtableAdd(ht, (void *)sdsfromlonglong(llele))); } else { - retval = dictAdd(d, sdsnewlen(str, len), NULL); + serverAssert(hashtableAdd(ht, (void *)sdsnewlen(str, len))); } - serverAssert(retval == DICT_OK); } setTypeReleaseIterator(si); - serverAssert(dictSize(d) == size); + serverAssert(hashtableSize(ht) == size); /* Remove random elements to reach the right count. */ while (size > count) { - dictEntry *de; - de = dictGetFairRandomKey(d); - dictUnlink(d, dictGetKey(de)); - sdsfree(dictGetKey(de)); - dictFreeUnlinkedEntry(d, de); + void *element; + hashtableFairRandomEntry(ht, &element); + hashtableDelete(ht, element); + sdsfree((sds)element); size--; } } @@ -1161,7 +1158,7 @@ void srandmemberWithCountCommand(client *c) { unsigned long added = 0; sds sdsele; - dictExpand(d, count); + hashtableExpand(ht, count); while (added < count) { setTypeRandomElement(set, &str, &len, &llele); if (str == NULL) { @@ -1172,7 +1169,7 @@ void srandmemberWithCountCommand(client *c) { /* Try to add the object to the dictionary. If it already exists * free it, otherwise increment the number of objects we have * in the result dictionary. */ - if (dictAdd(d, sdsele, NULL) == DICT_OK) + if (hashtableAdd(ht, sdsele)) added++; else sdsfree(sdsele); @@ -1181,14 +1178,15 @@ void srandmemberWithCountCommand(client *c) { /* CASE 3 & 4: send the result to the user. 
*/ { - dictIterator *di; - dictEntry *de; + hashtableIterator iter; + hashtableInitIterator(&iter, ht); addReplyArrayLen(c, count); - di = dictGetIterator(d); - while ((de = dictNext(di)) != NULL) addReplyBulkSds(c, dictGetKey(de)); - dictReleaseIterator(di); - dictRelease(d); + serverAssert(count == hashtableSize(ht)); + void *element; + while (hashtableNext(&iter, &element)) addReplyBulkSds(c, (sds)element); + hashtableResetIterator(&iter); + hashtableRelease(ht); } } @@ -1336,7 +1334,7 @@ void sinterGenericCommand(client *c, while ((encoding = setTypeNext(si, &str, &len, &intobj)) != -1) { for (j = 1; j < setnum; j++) { if (sets[j] == sets[0]) continue; - if (!setTypeIsMemberAux(sets[j], str, len, intobj, encoding == OBJ_ENCODING_HT)) break; + if (!setTypeIsMemberAux(sets[j], str, len, intobj, encoding == OBJ_ENCODING_HASHTABLE)) break; } /* Only take action when all sets contain the member */ @@ -1355,7 +1353,7 @@ void sinterGenericCommand(client *c, } else { if (str && only_integers) { /* It may be an integer although we got it as a string. */ - if (encoding == OBJ_ENCODING_HT && string2ll(str, len, (long long *)&intobj)) { + if (encoding == OBJ_ENCODING_HASHTABLE && string2ll(str, len, (long long *)&intobj)) { if (dstset->encoding == OBJ_ENCODING_LISTPACK || dstset->encoding == OBJ_ENCODING_INTSET) { /* Adding it as an integer is more efficient. */ str = NULL; @@ -1365,7 +1363,7 @@ void sinterGenericCommand(client *c, only_integers = 0; } } - setTypeAddAux(dstset, str, len, intobj, encoding == OBJ_ENCODING_HT); + setTypeAddAux(dstset, str, len, intobj, encoding == OBJ_ENCODING_HASHTABLE); } } } @@ -1467,7 +1465,7 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke /* For a SET's encoding, according to the factory method setTypeCreate(), currently have 3 types: * 1. OBJ_ENCODING_INTSET * 2. OBJ_ENCODING_LISTPACK - * 3. OBJ_ENCODING_HT + * 3. OBJ_ENCODING_HASHTABLE * 'dstset_encoding' is used to determine which kind of encoding to use when initialize 'dstset'. * * If all sets are all OBJ_ENCODING_INTSET encoding or 'dstkey' is not null, keep 'dstset' @@ -1478,8 +1476,8 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke * the hashtable is more efficient when find and compare than the listpack. The corresponding * time complexity are O(1) vs O(n). */ if (!dstkey && dstset_encoding == OBJ_ENCODING_INTSET && - (setobj->encoding == OBJ_ENCODING_LISTPACK || setobj->encoding == OBJ_ENCODING_HT)) { - dstset_encoding = OBJ_ENCODING_HT; + (setobj->encoding == OBJ_ENCODING_LISTPACK || setobj->encoding == OBJ_ENCODING_HASHTABLE)) { + dstset_encoding = OBJ_ENCODING_HASHTABLE; } sets[j] = setobj; if (j > 0 && sets[0] == sets[j]) { @@ -1536,7 +1534,7 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke si = setTypeInitIterator(sets[j]); while ((encoding = setTypeNext(si, &str, &len, &llval)) != -1) { - cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } setTypeReleaseIterator(si); } @@ -1556,11 +1554,11 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke for (j = 1; j < setnum; j++) { if (!sets[j]) continue; /* no key is an empty set. */ if (sets[j] == sets[0]) break; /* same set! 
*/ - if (setTypeIsMemberAux(sets[j], str, len, llval, encoding == OBJ_ENCODING_HT)) break; + if (setTypeIsMemberAux(sets[j], str, len, llval, encoding == OBJ_ENCODING_HASHTABLE)) break; } if (j == setnum) { /* There is no other set with this element. Add it. */ - cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } } setTypeReleaseIterator(si); @@ -1578,9 +1576,9 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke si = setTypeInitIterator(sets[j]); while ((encoding = setTypeNext(si, &str, &len, &llval)) != -1) { if (j == 0) { - cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } else { - cardinality -= setTypeRemoveAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality -= setTypeRemoveAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } } setTypeReleaseIterator(si); diff --git a/src/t_zset.c b/src/t_zset.c index 105d57b7c3..e8c5a369b7 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -2069,9 +2069,7 @@ typedef struct { int ii; } is; struct { - dict *dict; - dictIterator *di; - dictEntry *de; + hashtableIterator *iter; } ht; struct { unsigned char *lp; @@ -2126,10 +2124,8 @@ void zuiInitIterator(zsetopsrc *op) { if (op->encoding == OBJ_ENCODING_INTSET) { it->is.is = op->subject->ptr; it->is.ii = 0; - } else if (op->encoding == OBJ_ENCODING_HT) { - it->ht.dict = op->subject->ptr; - it->ht.di = dictGetIterator(op->subject->ptr); - it->ht.de = dictNext(it->ht.di); + } else if (op->encoding == OBJ_ENCODING_HASHTABLE) { + it->ht.iter = hashtableCreateIterator(op->subject->ptr); } else if (op->encoding == OBJ_ENCODING_LISTPACK) { it->lp.lp = op->subject->ptr; it->lp.p = lpFirst(it->lp.lp); @@ -2166,8 +2162,8 @@ void zuiClearIterator(zsetopsrc *op) { iterset *it = &op->iter.set; if (op->encoding == OBJ_ENCODING_INTSET) { UNUSED(it); /* skip */ - } else if (op->encoding == OBJ_ENCODING_HT) { - dictReleaseIterator(it->ht.di); + } else if (op->encoding == OBJ_ENCODING_HASHTABLE) { + hashtableReleaseIterator(it->ht.iter); } else if (op->encoding == OBJ_ENCODING_LISTPACK) { UNUSED(it); } else { @@ -2235,13 +2231,11 @@ int zuiNext(zsetopsrc *op, zsetopval *val) { /* Move to next element. */ it->is.ii++; - } else if (op->encoding == OBJ_ENCODING_HT) { - if (it->ht.de == NULL) return 0; - val->ele = dictGetKey(it->ht.de); + } else if (op->encoding == OBJ_ENCODING_HASHTABLE) { + void *next; + if (!hashtableNext(it->ht.iter, &next)) return 0; + val->ele = next; val->score = 1.0; - - /* Move to next element. */ - it->ht.de = dictNext(it->ht.di); } else if (op->encoding == OBJ_ENCODING_LISTPACK) { if (it->lp.p == NULL) return 0; val->estr = lpGetValue(it->lp.p, &val->elen, &val->ell); diff --git a/src/zmalloc.c b/src/zmalloc.c index a696111e47..b1de4f2af1 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -451,15 +451,25 @@ void zmalloc_set_oom_handler(void (*oom_handler)(size_t)) { zmalloc_oom_handler = oom_handler; } -/* Use 'MADV_DONTNEED' to release memory to operating system quickly. - * We do that in a fork child process to avoid CoW when the parent modifies - * these shared pages. 
*/
-void zmadvise_dontneed(void *ptr) {
+/* Try to release pages back to the OS directly using 'MADV_DONTNEED' (bypassing
+ * the allocator) in a fork child process to avoid CoW when the parent modifies
+ * those shared pages. For small allocations, we can't release any full page,
+ * so in an effort to avoid getting the size of the allocation from the
+ * allocator (malloc_size) when we already know it's small, we check the
+ * size_hint. If the size is not already known, passing a size_hint of 0 will
+ * lead to checking the real size of the allocation.
+ * Also please note that the size may not be accurate, so in order to make this
+ * solution effective, the judgement for releasing memory pages should not be
+ * too strict. */
+void zmadvise_dontneed(void *ptr, size_t size_hint) {
 #if defined(USE_JEMALLOC) && defined(__linux__)
+    if (ptr == NULL) return;
+
     static size_t page_size = 0;
     if (page_size == 0) page_size = sysconf(_SC_PAGESIZE);
     size_t page_size_mask = page_size - 1;
 
+    if (size_hint && size_hint / 2 < page_size) return;
     size_t real_size = zmalloc_size(ptr);
     if (real_size < page_size) return;
 
@@ -473,6 +483,7 @@ void zmadvise_dontneed(void *ptr) {
 }
 #else
     (void)(ptr);
+    (void)(size_hint);
 #endif
 }
 
diff --git a/src/zmalloc.h b/src/zmalloc.h
index 38c2bae864..68b4df63aa 100644
--- a/src/zmalloc.h
+++ b/src/zmalloc.h
@@ -139,7 +139,7 @@ size_t zmalloc_get_smap_bytes_by_field(char *field, long pid);
 size_t zmalloc_get_memory_size(void);
 void zlibc_free(void *ptr);
 void zlibc_trim(void);
-void zmadvise_dontneed(void *ptr);
+void zmadvise_dontneed(void *ptr, size_t size_hint);
 
 #ifndef HAVE_MALLOC_SIZE
 size_t zmalloc_size(void *ptr);
diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
index cf7f633a8c..e50faba62b 100644
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@@ -515,10 +515,10 @@ start_server {tags {"info" "external:skip"}} {
         set info_mem [r info memory]
         set mem_stats [r memory stats]
         assert_equal [getInfoProperty $info_mem mem_overhead_db_hashtable_rehashing] {0}
-        # overhead.db.hashtable.lut = memory overhead of hashset including hashset struct and tables
-        set hashset_overhead [dict get $mem_stats overhead.db.hashtable.lut]
-        if {$hashset_overhead < 140} {
-            # 32-bit version (hashset struct + 1 bucket of 64 bytes)
+        # overhead.db.hashtable.lut = memory overhead of hashtable including hashtable struct and tables
+        set hashtable_overhead [dict get $mem_stats overhead.db.hashtable.lut]
+        if {$hashtable_overhead < 140} {
+            # 32-bit version (hashtable struct + 1 bucket of 64 bytes)
             set bits 32
         } else {
             set bits 64
diff --git a/tests/unit/type/set.tcl b/tests/unit/type/set.tcl
index 944c3d3d98..1871ec9b4d 100644
--- a/tests/unit/type/set.tcl
+++ b/tests/unit/type/set.tcl
@@ -33,6 +33,7 @@ start_server {
             assert_equal {0 1} [r smismember myset bla foo]
             assert_equal {0} [r smismember myset bla]
             assert_equal "bar $initelems($type)" [lsort [r smembers myset]]
+            r memory usage myset
         }
     }
 
@@ -51,6 +52,7 @@ start_server {
         assert_equal {0 1} [r smismember myset 18 16]
         assert_equal {0} [r smismember myset 18]
         assert_equal {16 17} [lsort [r smembers myset]]
+        r memory usage myset
     }
 
     test {SMISMEMBER SMEMBERS SCARD against non set} {
@@ -1029,111 +1031,6 @@ foreach type {single multiple single_multiple} {
         r srem $myset {*}$members
     }
 
-    proc verify_rehashing_completed_key {myset table_size keys} {
-        set htstats [r debug HTSTATS-KEY $myset]
-        assert {![string match {*rehashing target*} $htstats]}
-        return {[string match {*table size: $table_size*number of elements: $keys*} $htstats]}
} - - test "SRANDMEMBER with a dict containing long chain" { - set origin_save [config_get_set save ""] - set origin_max_lp [config_get_set set-max-listpack-entries 0] - set origin_save_delay [config_get_set rdb-key-save-delay 2147483647] - - # 1) Create a hash set with 100000 members. - set members {} - for {set i 0} {$i < 100000} {incr i} { - lappend members [format "m:%d" $i] - } - create_set myset $members - - # 2) Wait for the hash set rehashing to finish. - while {[is_rehashing myset]} { - r srandmember myset 100 - } - - # 3) Turn off the rehashing of this set, and remove the members to 500. - r bgsave - rem_hash_set_top_N myset [expr {[r scard myset] - 500}] - assert_equal [r scard myset] 500 - - # 4) Kill RDB child process to restart rehashing. - set pid1 [get_child_pid 0] - catch {exec kill -9 $pid1} - waitForBgsave r - - # 5) Let the set hash to start rehashing - r spop myset 1 - assert [is_rehashing myset] - - # 6) Verify that when rdb saving is in progress, rehashing will still be performed (because - # the ratio is extreme) by waiting for it to finish during an active bgsave. - r bgsave - - while {[is_rehashing myset]} { - r srandmember myset 1 - } - if {$::verbose} { - puts [r debug HTSTATS-KEY myset full] - } - - set pid1 [get_child_pid 0] - catch {exec kill -9 $pid1} - waitForBgsave r - - # 7) Check that eventually, SRANDMEMBER returns all elements. - array set allmyset {} - foreach ele [r smembers myset] { - set allmyset($ele) 1 - } - unset -nocomplain auxset - set iterations 1000 - while {$iterations != 0} { - incr iterations -1 - set res [r srandmember myset -10] - foreach ele $res { - set auxset($ele) 1 - } - if {[lsort [array names allmyset]] eq - [lsort [array names auxset]]} { - break; - } - } - assert {$iterations != 0} - - # 8) Remove the members to 30 in order to calculate the value of Chi-Square Distribution, - # otherwise we would need more iterations. - rem_hash_set_top_N myset [expr {[r scard myset] - 30}] - assert_equal [r scard myset] 30 - - # Hash set rehashing would be completed while removing members from the `myset` - # We also check the size and members in the hash table. - verify_rehashing_completed_key myset 64 30 - - # Now that we have a hash set with only one long chain bucket. - set htstats [r debug HTSTATS-KEY myset full] - assert {[regexp {different slots: ([0-9]+)} $htstats - different_slots]} - assert {[regexp {max chain length: ([0-9]+)} $htstats - max_chain_length]} - assert {$different_slots == 1 && $max_chain_length == 30} - - # 9) Use positive count (PATH 4) to get 10 elements (out of 30) each time. 
-        unset -nocomplain allkey
-        set iterations 1000
-        while {$iterations != 0} {
-            incr iterations -1
-            set res [r srandmember myset 10]
-            foreach ele $res {
-                lappend allkey $ele
-            }
-        }
-        # validate even distribution of random sampling (df = 29, 73 means 0.00001 probability)
-        assert_lessthan [chi_square_value $allkey] 73
-
-        r config set save $origin_save
-        r config set set-max-listpack-entries $origin_max_lp
-        r config set rdb-key-save-delay $origin_save_delay
-    } {OK} {needs:debug slow}
-
     proc setup_move {} {
         r del myset3{t} myset4{t}
         create_set myset1{t} {1 a b}

From aa35b89456d871c9c0292dcba07d9452540e1aad Mon Sep 17 00:00:00 2001
From: Binbin
Date: Sun, 15 Dec 2024 12:09:53 +0800
Subject: [PATCH 015/101] Automatic failover vote is not limited by two times
 the node timeout (#1356)

This is a follow-up of #1305; we have now decided to apply the same
change to automatic failover as well, that is, to move forward with
removing it for both automatic and manual failovers.

Quote from Ping during the review:
Note that we already debounce transient primary failures with node
timeout, ensuring failover is only triggered after sustained outages.
Election timing is naturally staggered by replica spacing, making the
likelihood of simultaneous elections from replicas of the same shard
very low. The one-vote-per-epoch rule further throttles retries and
ensures orderly elections. On top of that, quorum-based primary failure
confirmation, cluster-state convergence, and slot ownership validation
are all built into the process.

Quote from Madelyn during the review:
It's against the specific primary. It's to prevent double failovers. If
a primary just took over we don't want someone else to try to take over
and give the new primary some amount of time to take over. I have not
seen this issue though, it might have been over optimizing? The double
failure mode, where a node fails and then another node fails within the
nodetimeout, also doesn't seem that common either though.

So the conclusion is that we all agreed to remove it completely; it
will make the code a lot simpler. And if there are other specific edge
cases we are missing, we will fix them in another way. See discussion
#1305 for more information.

Signed-off-by: Binbin
---
 src/cluster_legacy.c                   | 19 -------------
 src/cluster_legacy.h                   |  2 --
 tests/unit/cluster/manual-failover.tcl | 39 ++++++++++++++++++++------
 3 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index d1c6dd0094..418070f69c 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -1505,7 +1505,6 @@ clusterNode *createClusterNode(char *nodename, int flags) {
     node->cport = 0;
     node->tls_port = 0;
     node->fail_reports = listCreate();
-    node->voted_time = 0;
     node->orphaned_time = 0;
     node->repl_offset_time = 0;
     node->repl_offset = 0;
@@ -4396,23 +4395,6 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
         return;
     }
 
-    /* We did not voted for a replica about this primary for two
-     * times the node timeout. This is not strictly needed for correctness
-     * of the algorithm but makes the base case more linear.
-     *
-     * This limitation does not restrict manual failover. If a user initiates
-     * a manual failover, we need to allow it to vote, otherwise the manual
-     * failover may time out.
*/ - if (!force_ack && mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): " - "can't vote for any replica of %.40s (%s) within %lld milliseconds", - node->name, node->human_nodename, - node->replicaof->name, node->replicaof->human_nodename, - (long long)((server.cluster_node_timeout * 2) - (mstime() - node->replicaof->voted_time))); - return; - } - /* The replica requesting the vote must have a configEpoch for the claimed * slots that is >= the one of the primaries currently serving the same * slots in the current configuration. */ @@ -4434,7 +4416,6 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* We can vote for this replica. */ server.cluster->lastVoteEpoch = server.cluster->currentEpoch; - if (!force_ack) node->replicaof->voted_time = mstime(); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); clusterSendFailoverAuth(node); serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", node->name, node->human_nodename, diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index fb317038d6..d3e1c3459e 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -341,8 +341,6 @@ struct _clusterNode { mstime_t pong_received; /* Unix time we received the pong */ mstime_t data_received; /* Unix time we received any data */ mstime_t fail_time; /* Unix time when FAIL flag was set */ - mstime_t voted_time; /* Last time we voted for a replica of this primary in non manual - * failover scenarios. */ mstime_t repl_offset_time; /* Unix time we received offset for this node */ mstime_t orphaned_time; /* Starting time of orphaned primary condition */ mstime_t inbound_link_freed_time; /* Last time we freed the inbound link for this node. diff --git a/tests/unit/cluster/manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl index 220ffc3eaf..dbcbb26380 100644 --- a/tests/unit/cluster/manual-failover.tcl +++ b/tests/unit/cluster/manual-failover.tcl @@ -189,11 +189,6 @@ start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK 6 set CLUSTER_PACKET_TYPE_NONE -1 - # Setting a large timeout to make sure we hit the voted_time limit. - R 0 config set cluster-node-timeout 150000 - R 1 config set cluster-node-timeout 150000 - R 2 config set cluster-node-timeout 150000 - # Let replica drop FAILOVER_AUTH_ACK so that the election won't # get the enough votes and the election will time out. R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK @@ -229,10 +224,6 @@ start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval pause_process [srv 0 pid] wait_for_cluster_state fail - # Setting a large timeout to make sure we hit the voted_time limit. - R 1 config set cluster-node-timeout 150000 - R 2 config set cluster-node-timeout 150000 - # R 3 performs an automatic failover and it will work. 
        R 3 config set cluster-replica-no-failover no
 
         wait_for_condition 1000 50 {
@@ -272,6 +263,36 @@ start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval
     }
 } ;# start_cluster
 
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} {
+    test "Automatic failover vote is not limited by two times the node timeout - mixed failover" {
+        R 3 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave} &&
+            [s -3 role] eq {master}
+        } else {
+            fail "The first failover does not happen"
+        }
+        wait_for_cluster_propagation
+
+        R 0 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {master} &&
+            [s -3 role] eq {slave}
+        } else {
+            fail "The second failover does not happen"
+        }
+        wait_for_cluster_propagation
+
+        # Let R 3 trigger the automatic failover
+        pause_process [srv 0 pid]
+        wait_for_condition 1000 50 {
+            [s -3 role] eq {master}
+        } else {
+            fail "The third failover does not happen"
+        }
+    }
+} ;# start_cluster
+
 start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} {
     test "Manual failover will reset the on-going election" {
         set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST 5

From 4b7e07f84be96dc1a85b14b7a014b5a7e253752f Mon Sep 17 00:00:00 2001
From: Binbin
Date: Mon, 16 Dec 2024 13:43:48 +0800
Subject: [PATCH 016/101] Drop the MEET packet if the link node is in handshake
 state (#1436)

After #1307 got merged, we noticed an assert happening in
setClusterNodeToInboundClusterLink:
```
=== ASSERTION FAILED ===
==> '!link->node' is not true
```
In #778, we call setClusterNodeToInboundClusterLink to attach the node
to the link during MEET processing, so if we receive another MEET
packet in a short time while the node is still in the handshake state,
we hit this assert and crash the server.

If the link is bound to a node, the node is in the handshake state, and
we receive a MEET packet, it may be that the sender sent multiple MEET
packets, so here we drop the MEET to avoid the assert in
setClusterNodeToInboundClusterLink. The assert fires when the other
node sends a MEET packet because it detects that there is no inbound
link: this node creates a new node in HANDSHAKE state (with a random
node name) and responds with a PONG. The other node receives the PONG
and removes the CLUSTER_NODE_MEET flag. This node is supposed to open
an outbound connection to the other node in the next cron cycle, but
before this happens, the other node re-sends a MEET on the same link
because it still detects no inbound connection.

Note that in getNodeFromLinkAndMsg, the node in the handshake state has
a random name and is not truly "known", so we don't know the sender.
Dropping the MEET packet prevents us from creating a random node,
avoids incorrect link binding, and avoids having a duplicate MEET
packet eliminate the handshake state.
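For reference, a rough sketch of the assert site this guard protects,
simplified from setClusterNodeToInboundClusterLink in cluster_legacy.c
(the exact body may differ; the inbound_link bookkeeping is an
assumption here):

```c
/* Sketch only: bind an inbound link to a node. Processing a second MEET on a
 * link that already has a handshake node bound would re-enter this function
 * with link->node != NULL and trip the first assert, i.e. the crash quoted
 * above. */
static void setClusterNodeToInboundClusterLink(clusterNode *node, clusterLink *link) {
    serverAssert(!link->node);         /* '!link->node' is not true */
    serverAssert(!node->inbound_link); /* assumed companion invariant */
    link->node = node;
    node->inbound_link = link;
}
```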
Signed-off-by: Binbin
---
 src/cluster_legacy.c | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 418070f69c..9ddcf6678d 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3003,7 +3003,8 @@ int clusterIsValidPacket(clusterLink *link) {
     }
 
     if (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2) {
-        serverLog(LL_WARNING, "Dropping packet that matches debug drop filter");
+        serverLog(LL_WARNING, "Dropping packet of type %s that matches debug drop filter",
+                  clusterGetMessageTypeString(type));
         return 0;
     }
 
@@ -3094,7 +3095,7 @@ int clusterProcessPacket(clusterLink *link) {
     if (server.debug_cluster_close_link_on_packet_drop &&
         (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2)) {
         freeClusterLink(link);
-        serverLog(LL_WARNING, "Closing link for matching packet type %hu", type);
+        serverLog(LL_WARNING, "Closing link for matching packet type %s", clusterGetMessageTypeString(type));
        return 0;
     }
     return 1;
@@ -3110,8 +3111,8 @@ int clusterProcessPacket(clusterLink *link) {
         freeClusterLink(link);
         serverLog(
             LL_NOTICE,
-            "Closing link for node that sent a lightweight message of type %hu as its first message on the link",
-            type);
+            "Closing link for node that sent a lightweight message of type %s as its first message on the link",
+            clusterGetMessageTypeString(type));
         return 0;
     }
     clusterNode *sender = link->node;
@@ -3120,6 +3121,27 @@ int clusterProcessPacket(clusterLink *link) {
         return 1;
     }
 
+    if (type == CLUSTERMSG_TYPE_MEET && link->node && nodeInHandshake(link->node)) {
+        /* If the link is bound to a node and the node is in the handshake state, and we receive
+         * a MEET packet, it may be that the sender sent multiple MEET packets, so here we drop
+         * the MEET to avoid the assert in setClusterNodeToInboundClusterLink. The assert fires
+         * when the other node sends a MEET packet because it detects that there is no inbound
+         * link: this node creates a new node in HANDSHAKE state (with a random node name), and
+         * responds with a PONG. The other node receives the PONG and removes the CLUSTER_NODE_MEET
+         * flag. This node is supposed to open an outbound connection to the other node in the next
+         * cron cycle, but before this happens, the other node re-sends a MEET on the same link
+         * because it still detects no inbound connection. We improved the re-send logic of MEET in
+         * #1441; now we only re-send the MEET packet once every handshake timeout period.
+         *
+         * Note that in getNodeFromLinkAndMsg, the node in the handshake state has a random name
+         * and is not truly "known", so we don't know the sender. Dropping the MEET packet prevents
+         * us from creating a random node, avoids incorrect link binding, and avoids having a
+         * duplicate MEET packet eliminate the handshake state. */
+        serverLog(LL_NOTICE, "Dropping MEET packet from node %.40s because the node is already in handshake state",
+                  link->node->name);
+        return 1;
+    }
+
     uint16_t flags = ntohs(hdr->flags);
     uint64_t sender_claimed_current_epoch = 0, sender_claimed_config_epoch = 0;
     clusterNode *sender = getNodeFromLinkAndMsg(link, hdr);

From d76ca02c3a019ccc994a7316605c0def46d75721 Mon Sep 17 00:00:00 2001
From: Roshan Khatri <117414976+roshkhatri@users.noreply.github.com>
Date: Mon, 16 Dec 2024 13:01:34 -0800
Subject: [PATCH 017/101] Fix the secret for the test bucket.
 (#1447)

We have set the secret as `AWS_S3_TEST_BUCKET` for the test bucket and
I missed it in the initial review.

Signed-off-by: Roshan Khatri
---
 .github/workflows/build-release-packages.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml
index 6c54971bcd..3f1ca2627b 100644
--- a/.github/workflows/build-release-packages.yml
+++ b/.github/workflows/build-release-packages.yml
@@ -59,8 +59,10 @@ jobs:
         id: check-if-testing
         run: |
           if [[ "${{ github.event_name }}" == "push" ]]; then
+            echo "This is a test workflow -> We will upload to the Test S3 Bucket"
             echo "IS_TEST=true" >> $GITHUB_OUTPUT
           else
+            echo "This is a Release workflow -> We will upload to the Release S3 Bucket"
             echo "IS_TEST=false" >> $GITHUB_OUTPUT
           fi
         shell: bash
@@ -92,7 +94,7 @@ jobs:
       build_matrix: ${{ needs.generate-build-matrix.outputs.x86_64-build-matrix }}
       region: us-west-2
     secrets:
-      bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_TEST_BUCKET || secrets.AWS_S3_BUCKET }}
+      bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_S3_TEST_BUCKET || secrets.AWS_S3_BUCKET }}
       role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}

   release-build-linux-arm-packages:
@@ -106,5 +108,5 @@ jobs:
       build_matrix: ${{ needs.generate-build-matrix.outputs.arm64-build-matrix }}
       region: us-west-2
     secrets:
-      bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_TEST_BUCKET || secrets.AWS_S3_BUCKET }}
+      bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_S3_TEST_BUCKET || secrets.AWS_S3_BUCKET }}
       role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}

From bc4f865acac3579fccac5d97128eee90c17d3be8 Mon Sep 17 00:00:00 2001
From: xbasel <103044017+xbasel@users.noreply.github.com>
Date: Tue, 17 Dec 2024 18:04:27 +0200
Subject: [PATCH 018/101] Fix test_reclaimFilePageCache to avoid tmpfs (#1379)

Avoid tmpfs as fadvise(FADV_DONTNEED) has no effect on memory-backed
filesystems.
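As a standalone illustration of why the test must skip tmpfs (a
Linux-only sketch, not part of the patch; the file path and messages
are made up):

```c
/* On tmpfs the page cache *is* the file's backing store, so
 * posix_fadvise(POSIX_FADV_DONTNEED) cannot reclaim those pages, and an
 * assertion that cache was reclaimed can never hold there. */
#include <stdio.h>
#include <sys/vfs.h>
#include <linux/magic.h>

int main(void) {
    struct statfs st;
    if (statfs("/tmp", &st) != 0) {
        perror("statfs");
        return 1;
    }
    if (st.f_type == TMPFS_MAGIC)
        puts("/tmp is tmpfs: FADV_DONTNEED is a no-op, skip the reclaim test");
    else
        puts("/tmp is disk-backed: the reclaim test is meaningful");
    return 0;
}
```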
Fixes https://github.com/valkey-io/valkey/issues/897

---------

Signed-off-by: Ran Shidlansik
Signed-off-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: Ran Shidlansik
---
 src/unit/test_util.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/unit/test_util.c b/src/unit/test_util.c
index 4558c38c3b..9858318e06 100644
--- a/src/unit/test_util.c
+++ b/src/unit/test_util.c
@@ -6,6 +6,11 @@
 #include "../util.h"
 #include "test_help.h"
 
+#if defined(__linux__)
+#include <sys/vfs.h>
+#include <linux/magic.h>
+#endif
+
 int test_string2ll(int argc, char **argv, int flags) {
     UNUSED(argc);
     UNUSED(argv);
@@ -291,6 +296,15 @@ int test_reclaimFilePageCache(int argc, char **argv, int flags) {
     if (flags & UNIT_TEST_VALGRIND) return 0;
 
 #if defined(__linux__)
+    struct statfs stats;
+
+    /* Check if /tmp is memory-backed (e.g., tmpfs) */
+    if (statfs("/tmp", &stats) == 0) {
+        if (stats.f_type == TMPFS_MAGIC) { // tmpfs: FADV_DONTNEED has no effect, skip
+            return 0;
+        }
+    }
+
     char *tmpfile = "/tmp/redis-reclaim-cache-test";
     int fd = open(tmpfile, O_RDWR | O_CREAT, 0644);
     TEST_ASSERT(fd >= 0);

From 5c414bf784d1319e7d4f2705a77cc582642db36a Mon Sep 17 00:00:00 2001
From: ranshid <88133677+ranshid@users.noreply.github.com>
Date: Tue, 17 Dec 2024 19:07:55 +0200
Subject: [PATCH 019/101] Introduce FORCE_DEFRAG compilation option to allow
 activedefrag run when allocator is not jemalloc (#1303)

Introduce a compile-time option to force activedefrag to run even when
jemalloc is not used as the allocator. This is in order to be able to
run tests with defrag enabled while using memory instrumentation tools.

fixes: https://github.com/valkey-io/valkey/issues/1241

---------

Signed-off-by: ranshid
Signed-off-by: Ran Shidlansik
Signed-off-by: Madelyn Olson
Signed-off-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: Madelyn Olson
---
 .github/workflows/daily.yml | 46 +++++++++++++++++++++++++++++
 CMakeLists.txt              |  1 +
 deps/CMakeLists.txt         |  4 ++-
 src/CMakeLists.txt          |  6 ++++
 src/Makefile                |  5 ++++
 src/allocator_defrag.c      | 59 ++++++++++++++++++++++++++++++++++---
 src/allocator_defrag.h      | 10 ++++---
 src/config.c                |  2 +-
 src/defrag.c                | 28 ------------------
 src/server.h                |  5 ++++
 tests/support/server.tcl    |  5 ++++
 tests/test_helper.tcl       |  4 +++
 tests/unit/info.tcl         |  2 +-
 13 files changed, 138 insertions(+), 39 deletions(-)

diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml
index c06d73440d..44386f5ffd 100644
--- a/.github/workflows/daily.yml
+++ b/.github/workflows/daily.yml
@@ -689,6 +689,52 @@ jobs:
         if: true && !contains(github.event.inputs.skiptests, 'unittest')
         run: ./src/valkey-unit-tests --accurate
 
+  test-sanitizer-force-defrag:
+    runs-on: ubuntu-latest
+    if: |
+      (github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') ||
+      (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) &&
+      !contains(github.event.inputs.skipjobs, 'sanitizer')
+    timeout-minutes: 14400
+    strategy:
+      fail-fast: false
+    steps:
+      - name: prep
+        if: github.event_name == 'workflow_dispatch'
+        run: |
+          echo "GITHUB_REPOSITORY=${{github.event.inputs.use_repo}}" >> $GITHUB_ENV
+          echo "GITHUB_HEAD_REF=${{github.event.inputs.use_git_ref}}" >> $GITHUB_ENV
+          echo "skipjobs: ${{github.event.inputs.skipjobs}}"
+          echo "skiptests: ${{github.event.inputs.skiptests}}"
+          echo "test_args: ${{github.event.inputs.test_args}}"
+          echo "cluster_test_args: 
${{github.event.inputs.cluster_test_args}}"
+      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        with:
+          repository: ${{ env.GITHUB_REPOSITORY }}
+          ref: ${{ env.GITHUB_HEAD_REF }}
+      - name: make
+        run: make all-with-unit-tests OPT=-O3 SANITIZER=address DEBUG_FORCE_DEFRAG=yes USE_JEMALLOC=no SERVER_CFLAGS='-Werror'
+      - name: testprep
+        run: |
+          sudo apt-get update
+          sudo apt-get install tcl8.6 tclx -y
+      - name: test
+        if: true && !contains(github.event.inputs.skiptests, 'valkey')
+        run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}}
+      - name: module api test
+        if: true && !contains(github.event.inputs.skiptests, 'modules')
+        run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}}
+      - name: sentinel tests
+        if: true && !contains(github.event.inputs.skiptests, 'sentinel')
+        run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}}
+      - name: cluster tests
+        if: true && !contains(github.event.inputs.skiptests, 'cluster')
+        run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}}
+      - name: unittest
+        if: true && !contains(github.event.inputs.skiptests, 'unittest')
+        run: ./src/valkey-unit-tests
+
   test-rpm-distros-jemalloc:
     if: |
       (github.event_name == 'workflow_dispatch' ||

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 77d0c4e7d8..55b18cb994 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,3 +41,4 @@ unset(BUILD_UNIT_TESTS CACHE)
 unset(BUILD_TEST_MODULES CACHE)
 unset(BUILD_EXAMPLE_MODULES CACHE)
 unset(USE_TLS CACHE)
+unset(DEBUG_FORCE_DEFRAG CACHE)

diff --git a/deps/CMakeLists.txt b/deps/CMakeLists.txt
index c904b94031..3f5b04dc22 100644
--- a/deps/CMakeLists.txt
+++ b/deps/CMakeLists.txt
@@ -1,4 +1,6 @@
-add_subdirectory(jemalloc)
+if (USE_JEMALLOC)
+    add_subdirectory(jemalloc)
+endif ()
 add_subdirectory(lua)
 
 # Set hiredis options. We need to disable the defaults set in the OPTION(..) we do this by setting them in the CACHE

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b87dff3db0..90d7e25cf4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -22,6 +22,12 @@ if (VALKEY_RELEASE_BUILD)
     set_property(TARGET valkey-server PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
 endif ()
 
+if (DEBUG_FORCE_DEFRAG)
+    message(STATUS "Forcing Active Defrag run on valkey-server")
+    target_compile_definitions(valkey-server PRIVATE DEBUG_FORCE_DEFRAG)
+    target_compile_definitions(valkey-server PRIVATE HAVE_DEFRAG)
+endif ()
+
 if (BUILD_SANITIZER)
     # 'BUILD_SANITIZER' is defined in ValkeySetup module (based on user input)
     # If defined, the variables 'VALKEY_SANITAIZER_CFLAGS' and 'VALKEY_SANITAIZER_LDFLAGS'

diff --git a/src/Makefile b/src/Makefile
index 8552deb3d9..e52f4f08d3 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -130,6 +130,11 @@ ifdef REDIS_LDFLAGS
     SERVER_LDFLAGS := $(REDIS_LDFLAGS)
 endif
 
+# Special case of forcing defrag to run even though we have no Jemalloc support
+ifeq ($(DEBUG_FORCE_DEFRAG), yes)
+    SERVER_CFLAGS +=-DHAVE_DEFRAG -DDEBUG_FORCE_DEFRAG
+endif
+
 FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(SERVER_CFLAGS)
 FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(SERVER_LDFLAGS) $(DEBUG)
 FINAL_LIBS=-lm

diff --git a/src/allocator_defrag.c b/src/allocator_defrag.c
index b2330c95e0..5e805b3044 100644
--- a/src/allocator_defrag.c
+++ b/src/allocator_defrag.c
@@ -43,12 +43,10 @@
 * the other component to ensure both are using the same allocator configuration.
*/
-#include
+#include "server.h"
 #include "serverassert.h"
 #include "allocator_defrag.h"
 
-#define UNUSED(x) (void)(x)
-
 #if defined(HAVE_DEFRAG) && defined(USE_JEMALLOC)
 
 #define STRINGIFY_(x) #x
@@ -402,8 +400,56 @@ int allocatorShouldDefrag(void *ptr) {
                                        je_cb.bin_info[binind].nregs - SLAB_NFREE(out, 0));
 }
 
-#else
+/* Utility function to get the fragmentation ratio from jemalloc.
+ * It is critical to do that by comparing only heap maps that belong to
+ * jemalloc, and skip ones the jemalloc keeps as spare. Since we use this
+ * fragmentation ratio in order to decide if a defrag action should be taken
+ * or not, a false detection can cause the defragmenter to waste a lot of CPU
+ * without the possibility of getting any results. */
+float getAllocatorFragmentation(size_t *out_frag_bytes) {
+    size_t resident, active, allocated, frag_smallbins_bytes;
+    zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL);
+    frag_smallbins_bytes = allocatorDefragGetFragSmallbins();
+    /* Calculate the fragmentation ratio as the proportion of wasted memory in small
+     * bins (which are defraggable) relative to the total allocated memory (including large bins).
+     * This is because otherwise, if most of the memory usage is large bins, we may show high percentage,
+     * despite the fact it's not a lot of memory for the user. */
+    float frag_pct = (float)frag_smallbins_bytes / allocated * 100;
+    float rss_pct = ((float)resident / allocated) * 100 - 100;
+    size_t rss_bytes = resident - allocated;
+    if (out_frag_bytes) *out_frag_bytes = frag_smallbins_bytes;
+    serverLog(LL_DEBUG, "allocated=%zu, active=%zu, resident=%zu, frag=%.2f%% (%.2f%% rss), frag_bytes=%zu (%zu rss)",
+              allocated, active, resident, frag_pct, rss_pct, frag_smallbins_bytes, rss_bytes);
+    return frag_pct;
+}
+
+#elif defined(DEBUG_FORCE_DEFRAG)
+int allocatorDefragInit(void) {
+    return 0;
+}
+void allocatorDefragFree(void *ptr, size_t size) {
+    UNUSED(size);
+    zfree(ptr);
+}
+__attribute__((malloc)) void *allocatorDefragAlloc(size_t size) {
+    return zmalloc(size);
+}
+unsigned long allocatorDefragGetFragSmallbins(void) {
+    return 0;
+}
+
+int allocatorShouldDefrag(void *ptr) {
+    UNUSED(ptr);
+    return 1;
+}
+
+float getAllocatorFragmentation(size_t *out_frag_bytes) {
+    *out_frag_bytes = server.active_defrag_ignore_bytes + 1;
+    return server.active_defrag_threshold_upper;
+}
+
+#else
 int allocatorDefragInit(void) {
     return -1;
 }
@@ -423,4 +469,9 @@ int allocatorShouldDefrag(void *ptr) {
     UNUSED(ptr);
     return 0;
 }
+
+float getAllocatorFragmentation(size_t *out_frag_bytes) {
+    UNUSED(out_frag_bytes);
+    return 0;
+}
 #endif

diff --git a/src/allocator_defrag.h b/src/allocator_defrag.h
index 7fb56208b6..7947bef72c 100644
--- a/src/allocator_defrag.h
+++ b/src/allocator_defrag.h
@@ -5,10 +5,11 @@
 #include <jemalloc/jemalloc.h>
 /* We can enable the server defrag capabilities only if we are using Jemalloc
  * and the version that has the experimental.utilization namespace in mallctl .
*/ -#if defined(JEMALLOC_VERSION_MAJOR) && \ - (JEMALLOC_VERSION_MAJOR > 5 || \ - (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR > 2) || \ - (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR == 2 && JEMALLOC_VERSION_BUGFIX >= 1)) +#if (defined(JEMALLOC_VERSION_MAJOR) && \ + (JEMALLOC_VERSION_MAJOR > 5 || \ + (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR > 2) || \ + (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR == 2 && JEMALLOC_VERSION_BUGFIX >= 1))) || \ + defined(DEBUG_FORCE_DEFRAG) #define HAVE_DEFRAG #endif #endif @@ -18,5 +19,6 @@ void allocatorDefragFree(void *ptr, size_t size); __attribute__((malloc)) void *allocatorDefragAlloc(size_t size); unsigned long allocatorDefragGetFragSmallbins(void); int allocatorShouldDefrag(void *ptr); +float getAllocatorFragmentation(size_t *out_frag_bytes); #endif /* __ALLOCATOR_DEFRAG_H */ diff --git a/src/config.c b/src/config.c index bcfa465e1f..f08b79ebbd 100644 --- a/src/config.c +++ b/src/config.c @@ -3186,7 +3186,7 @@ standardConfig static_configs[] = { createBoolConfig("replica-read-only", "slave-read-only", DEBUG_CONFIG | MODIFIABLE_CONFIG, server.repl_replica_ro, 1, NULL, NULL), createBoolConfig("replica-ignore-maxmemory", "slave-ignore-maxmemory", MODIFIABLE_CONFIG, server.repl_replica_ignore_maxmemory, 1, NULL, NULL), createBoolConfig("jemalloc-bg-thread", NULL, MODIFIABLE_CONFIG, server.jemalloc_bg_thread, 1, NULL, updateJemallocBgThread), - createBoolConfig("activedefrag", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.active_defrag_enabled, 0, isValidActiveDefrag, NULL), + createBoolConfig("activedefrag", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.active_defrag_enabled, CONFIG_ACTIVE_DEFRAG_DEFAULT, isValidActiveDefrag, NULL), createBoolConfig("syslog-enabled", NULL, IMMUTABLE_CONFIG, server.syslog_enabled, 0, NULL, NULL), createBoolConfig("cluster-enabled", NULL, IMMUTABLE_CONFIG, server.cluster_enabled, 0, NULL, NULL), createBoolConfig("appendonly", NULL, MODIFIABLE_CONFIG | DENY_LOADING_CONFIG, server.aof_enabled, 0, NULL, updateAppendonly), diff --git a/src/defrag.c b/src/defrag.c index 8e7fc8449e..6522d9aa7b 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -149,11 +149,6 @@ static_assert(offsetof(defragPubSubCtx, kvstate) == 0, "defragStageKvstoreHelper static list *defrag_later; static unsigned long defrag_later_cursor; - -/* this method was added to jemalloc in order to help us understand which - * pointers are worthwhile moving and which aren't */ -int je_get_defrag_hint(void *ptr); - /* Defrag function which allocates and copies memory if needed, but DOESN'T free the old block. * It is the responsibility of the caller to free the old block if a non-NULL value (new block) * is returned. (Returns NULL if no relocation was needed.) @@ -824,29 +819,6 @@ static void dbKeysScanCallback(void *privdata, void *elemref) { server.stat_active_defrag_scanned++; } -/* Utility function to get the fragmentation ratio from jemalloc. - * It is critical to do that by comparing only heap maps that belong to - * jemalloc, and skip ones the jemalloc keeps as spare. Since we use this - * fragmentation ratio in order to decide if a defrag action should be taken - * or not, a false detection can cause the defragmenter to waste a lot of CPU - * without the possibility of getting any results. 
*/ -static float getAllocatorFragmentation(size_t *out_frag_bytes) { - size_t resident, active, allocated, frag_smallbins_bytes; - zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL); - frag_smallbins_bytes = allocatorDefragGetFragSmallbins(); - /* Calculate the fragmentation ratio as the proportion of wasted memory in small - * bins (which are defraggable) relative to the total allocated memory (including large bins). - * This is because otherwise, if most of the memory usage is large bins, we may show high percentage, - * despite the fact it's not a lot of memory for the user. */ - float frag_pct = (float)frag_smallbins_bytes / allocated * 100; - float rss_pct = ((float)resident / allocated) * 100 - 100; - size_t rss_bytes = resident - allocated; - if (out_frag_bytes) *out_frag_bytes = frag_smallbins_bytes; - serverLog(LL_DEBUG, "allocated=%zu, active=%zu, resident=%zu, frag=%.2f%% (%.2f%% rss), frag_bytes=%zu (%zu rss)", - allocated, active, resident, frag_pct, rss_pct, frag_smallbins_bytes, rss_bytes); - return frag_pct; -} - /* Defrag scan callback for a pubsub channels hashtable. */ static void defragPubsubScanCallback(void *privdata, void *elemref) { defragPubSubCtx *ctx = privdata; diff --git a/src/server.h b/src/server.h index b07144de92..eb19d18c8d 100644 --- a/src/server.h +++ b/src/server.h @@ -148,6 +148,11 @@ struct hdr_histogram; #define DEFAULT_WAIT_BEFORE_RDB_CLIENT_FREE 60 /* Grace period in seconds for replica main \ * channel to establish psync. */ #define LOADING_PROCESS_EVENTS_INTERVAL_DEFAULT 100 /* Default: 0.1 seconds */ +#if !defined(DEBUG_FORCE_DEFRAG) +#define CONFIG_ACTIVE_DEFRAG_DEFAULT 0 +#else +#define CONFIG_ACTIVE_DEFRAG_DEFAULT 1 +#endif /* Bucket sizes for client eviction pools. Each bucket stores clients with * memory usage of up to twice the size of the bucket below it. */ diff --git a/tests/support/server.tcl b/tests/support/server.tcl index 7257339042..8c545d900a 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -221,6 +221,11 @@ proc tags_acceptable {tags err_return} { return 0 } + if {$::debug_defrag && [lsearch $tags "debug_defrag:skip"] >= 0} { + set err "Not supported on server compiled with DEBUG_FORCE_DEFRAG option" + return 0 + } + if {$::singledb && [lsearch $tags "singledb:skip"] >= 0} { set err "Not supported on singledb" return 0 diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 1f0658071a..8a4125e48d 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -92,6 +92,7 @@ set ::large_memory 0 set ::log_req_res 0 set ::force_resp3 0 set ::solo_tests_count 0 +set ::debug_defrag 0 # Set to 1 when we are running in client mode. The server test uses a # server-client model to run tests simultaneously. The server instance @@ -607,6 +608,7 @@ proc print_help_screen {} { "--ignore-encoding Don't validate object encoding." "--ignore-digest Don't use debug digest validations." "--large-memory Run tests using over 100mb." + "--debug-defrag Indicate the test is running against server compiled with DEBUG_FORCE_DEFRAG option" "--help Print this help screen." 
} "\n"] } @@ -748,6 +750,8 @@ for {set j 0} {$j < [llength $argv]} {incr j} { set ::ignoreencoding 1 } elseif {$opt eq {--ignore-digest}} { set ::ignoredigest 1 + } elseif {$opt eq {--debug-defrag}} { + set ::debug_defrag 1 } elseif {$opt eq {--help}} { print_help_screen exit 0 diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index e50faba62b..a27043fa88 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -10,7 +10,7 @@ proc latency_percentiles_usec {cmd} { return [latencyrstat_percentiles $cmd r] } -start_server {tags {"info" "external:skip"}} { +start_server {tags {"info" "external:skip" "debug_defrag:skip"}} { start_server {} { test {latencystats: disable/enable} { From 1e20853898d05d7caef842bfbe84223c4e54f267 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20Sz=C3=A9pe?= Date: Wed, 18 Dec 2024 02:45:43 +0100 Subject: [PATCH 020/101] Discover and fix new typos (#1446) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upgrade `typos` and fix corresponding typos --------- Signed-off-by: Viktor Szépe --- .github/workflows/spell-check.yml | 2 +- src/geohash_helper.c | 2 +- src/server.c | 4 +- src/server.h | 2 +- src/zmalloc.c | 2 +- tests/integration/aof-multi-part.tcl | 84 +++++++++---------- tests/integration/aof.tcl | 4 +- .../integration/dual-channel-replication.tcl | 12 +-- tests/support/aofmanifest.tcl | 4 +- tests/support/test.tcl | 4 +- 10 files changed, 60 insertions(+), 60 deletions(-) diff --git a/.github/workflows/spell-check.yml b/.github/workflows/spell-check.yml index 69d9b9cb6a..14db670b24 100644 --- a/.github/workflows/spell-check.yml +++ b/.github/workflows/spell-check.yml @@ -26,7 +26,7 @@ jobs: uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Install typos - uses: taiki-e/install-action@cd5df4de2e75f3b819ba55f780f7bb8cd4a05a41 # v2.32.2 + uses: taiki-e/install-action@fe9759bf4432218c779595708e80a1aadc85cedc # v2.46.10 with: tool: typos diff --git a/src/geohash_helper.c b/src/geohash_helper.c index aa4b4743a6..c05c2f2634 100644 --- a/src/geohash_helper.c +++ b/src/geohash_helper.c @@ -48,7 +48,7 @@ /// @brief The usual PI/180 constant const double DEG_TO_RAD = 0.017453292519943295769236907684886; -/// @brief Earth's quatratic mean radius for WGS-84 +/// @brief Earth's quadratic mean radius for WGS-84 const double EARTH_RADIUS_IN_METERS = 6372797.560856; const double MERCATOR_MAX = 20037726.37; diff --git a/src/server.c b/src/server.c index da06884eb1..db39970632 100644 --- a/src/server.c +++ b/src/server.c @@ -1702,7 +1702,7 @@ static void sendGetackToReplicas(void) { robj *argv[3]; argv[0] = shared.replconf; argv[1] = shared.getack; - argv[2] = shared.special_asterick; /* Not used argument. */ + argv[2] = shared.special_asterisk; /* Not used argument. 
*/ replicationFeedReplicas(-1, argv, 3); } @@ -2088,7 +2088,7 @@ void createSharedObjects(void) { shared.load = createStringObject("LOAD", 4); shared.createconsumer = createStringObject("CREATECONSUMER", 14); shared.getack = createStringObject("GETACK", 6); - shared.special_asterick = createStringObject("*", 1); + shared.special_asterisk = createStringObject("*", 1); shared.special_equals = createStringObject("=", 1); shared.redacted = makeObjectShared(createStringObject("(redacted)", 10)); diff --git a/src/server.h b/src/server.h index eb19d18c8d..b0e4ae1050 100644 --- a/src/server.h +++ b/src/server.h @@ -1444,7 +1444,7 @@ struct sharedObjectsStruct { *rpoplpush, *lmove, *blmove, *zpopmin, *zpopmax, *emptyscan, *multi, *exec, *left, *right, *hset, *srem, *xgroup, *xclaim, *script, *replconf, *eval, *persist, *set, *pexpireat, *pexpire, *time, *pxat, *absttl, *retrycount, *force, *justid, *entriesread, *lastid, *ping, *setid, *keepttl, *load, *createconsumer, *getack, - *special_asterick, *special_equals, *default_username, *redacted, *ssubscribebulk, *sunsubscribebulk, + *special_asterisk, *special_equals, *default_username, *redacted, *ssubscribebulk, *sunsubscribebulk, *smessagebulk, *select[PROTO_SHARED_SELECT_CMDS], *integers[OBJ_SHARED_INTEGERS], *mbulkhdr[OBJ_SHARED_BULKHDR_LEN], /* "*\r\n" */ *bulkhdr[OBJ_SHARED_BULKHDR_LEN], /* "$\r\n" */ diff --git a/src/zmalloc.c b/src/zmalloc.c index b1de4f2af1..3abf9a31a0 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -762,7 +762,7 @@ void zlibc_trim(void) { /* For proc_pidinfo() used later in zmalloc_get_smap_bytes_by_field(). * Note that this file cannot be included in zmalloc.h because it includes * a Darwin queue.h file where there is a "LIST_HEAD" macro (!) defined - * conficting with user code. */ + * conflicting with user code. 
*/ #include #endif diff --git a/tests/integration/aof-multi-part.tcl b/tests/integration/aof-multi-part.tcl index 5c4f24b7d4..9a23031c08 100644 --- a/tests/integration/aof-multi-part.tcl +++ b/tests/integration/aof-multi-part.tcl @@ -4,11 +4,11 @@ set server_path [tmpdir server.multi.aof] set aof_dirname "appendonlydir" set aof_basename "appendonly.aof" set aof_dirpath "$server_path/$aof_dirname" -set aof_base1_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_sufix$::aof_format_suffix" -set aof_base2_file "$server_path/$aof_dirname/${aof_basename}.2$::base_aof_sufix$::aof_format_suffix" -set aof_incr1_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_sufix$::aof_format_suffix" -set aof_incr2_file "$server_path/$aof_dirname/${aof_basename}.2$::incr_aof_sufix$::aof_format_suffix" -set aof_incr3_file "$server_path/$aof_dirname/${aof_basename}.3$::incr_aof_sufix$::aof_format_suffix" +set aof_base1_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_suffix$::aof_format_suffix" +set aof_base2_file "$server_path/$aof_dirname/${aof_basename}.2$::base_aof_suffix$::aof_format_suffix" +set aof_incr1_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_suffix$::aof_format_suffix" +set aof_incr2_file "$server_path/$aof_dirname/${aof_basename}.2$::incr_aof_suffix$::aof_format_suffix" +set aof_incr3_file "$server_path/$aof_dirname/${aof_basename}.3$::incr_aof_suffix$::aof_format_suffix" set aof_manifest_file "$server_path/$aof_dirname/${aof_basename}$::manifest_suffix" set aof_old_name_old_path "$server_path/$aof_basename" set aof_old_name_new_path "$aof_dirpath/$aof_basename" @@ -705,7 +705,7 @@ tags {"external:skip"} { set client [valkey [srv host] [srv port] 0 $::tls] wait_done_loading $client - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"] assert_aof_manifest_content $aof_manifest_file { {file appendonly.aof.1.base.rdb seq 1 type b} @@ -728,7 +728,7 @@ tags {"external:skip"} { set client [valkey [srv host] [srv port] 0 $::tls] wait_done_loading $client - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::aof_format_suffix}"] assert_aof_manifest_content $aof_manifest_file { {file appendonly.aof.1.base.aof seq 1 type b} @@ -750,7 +750,7 @@ tags {"external:skip"} { start_server_aof [list dir $server_path aof-use-rdb-preamble no] { wait_done_loading r - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::aof_format_suffix}"] assert_aof_manifest_content $aof_manifest_file { {file appendonly.aof.1.base.aof seq 1 type b} @@ -827,8 +827,8 @@ tags {"external:skip"} { # Check we really have these files assert_equal 1 [check_file_exist $aof_dirpath $aof_manifest_name] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] r 
bgrewriteaof waitForBgrewriteaof r @@ -842,13 +842,13 @@ tags {"external:skip"} { assert_equal 1 [check_file_exist $aof_dirpath $aof_manifest_name] # Wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] stop_write_load $load_handle0 wait_load_handlers_disconnected @@ -901,11 +901,11 @@ tags {"external:skip"} { {file appendonly.aof.5.incr.aof seq 5 type i} } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_suffix}${::aof_format_suffix}"] stop_write_load $load_handle0 wait_load_handlers_disconnected @@ -936,17 +936,17 @@ tags {"external:skip"} { # Wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_suffix}${::aof_format_suffix}"] == 0 
} else { fail "Failed to delete history AOF" } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_suffix}${::aof_format_suffix}"] set d1 [r debug digest] r debug loadaof @@ -965,10 +965,10 @@ tags {"external:skip"} { {file appendonly.aof.4.base.rdb seq 4 type b} } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_sufix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_suffix}${::rdb_format_suffix}"] wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.7${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.7${::incr_aof_suffix}${::aof_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } @@ -990,13 +990,13 @@ tags {"external:skip"} { # Wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_sufix}${::rdb_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_suffix}${::rdb_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] } test "AOF enable/disable auto gc" { @@ -1018,10 +1018,10 @@ tags {"external:skip"} { {file appendonly.aof.3.incr.aof seq 3 type i} } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] r config set aof-disable-auto-gc no @@ -1033,10 +1033,10 @@ tags {"external:skip"} { # wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath 
"${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } @@ -1192,7 +1192,7 @@ tags {"external:skip"} { waitForBgrewriteaof r # Can create New INCR AOF - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.10${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.10${::incr_aof_suffix}${::aof_format_suffix}"] assert_aof_manifest_content $aof_manifest_file { {file appendonly.aof.11.base.rdb seq 11 type b} @@ -1248,7 +1248,7 @@ tags {"external:skip"} { # Make sure manifest file is not created assert_equal 0 [check_file_exist $aof_dirpath $aof_manifest_name] # Make sure BASE AOF is not created - assert_equal 0 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"] + assert_equal 0 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"] # Make sure the next AOFRW has started wait_for_condition 1000 50 { diff --git a/tests/integration/aof.tcl b/tests/integration/aof.tcl index 33c7c12d4b..3a666bbd15 100644 --- a/tests/integration/aof.tcl +++ b/tests/integration/aof.tcl @@ -4,8 +4,8 @@ set server_path [tmpdir server.aof] set aof_dirname "appendonlydir" set aof_basename "appendonly.aof" set aof_dirpath "$server_path/$aof_dirname" -set aof_base_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_sufix$::aof_format_suffix" -set aof_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_sufix$::aof_format_suffix" +set aof_base_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_suffix$::aof_format_suffix" +set aof_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_suffix$::aof_format_suffix" set aof_manifest_file "$server_path/$aof_dirname/$aof_basename$::manifest_suffix" tags {"aof external:skip"} { diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index e417dad6c9..8191b9f699 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -355,8 +355,8 @@ start_server {tags {"dual-channel-replication external:skip"}} { verify_replica_online $primary 0 500 verify_replica_online $primary 1 500 - wait_for_value_to_propegate_to_replica $primary $replica1 "key1" - wait_for_value_to_propegate_to_replica $primary $replica2 "key1" + wait_for_value_to_propagate_to_replica $primary $replica1 "key1" + wait_for_value_to_propagate_to_replica $primary $replica2 "key1" assert {[s 0 total_forks] eq "1" } } @@ -374,8 +374,8 @@ start_server {tags {"dual-channel-replication external:skip"}} { $replica2 replicaof $primary_host $primary_port verify_replica_online $primary 0 500 verify_replica_online $primary 1 500 - wait_for_value_to_propegate_to_replica $primary $replica1 "key2" - wait_for_value_to_propegate_to_replica $primary $replica2 "key2" + wait_for_value_to_propagate_to_replica $primary $replica1 "key2" + wait_for_value_to_propagate_to_replica $primary $replica2 "key2" 
        wait_for_condition 50 1000 {
            [status $replica1 master_link_status] == "up"
        } else {
            fail "Replica is not synced"
        }
-        wait_for_value_to_propegate_to_replica $primary $replica1 "key3"
+        wait_for_value_to_propagate_to_replica $primary $replica1 "key3"
 
         # Verify that we did not use dual-channel-replication sync
         assert {[status $primary sync_partial_ok] == $cur_psync}
@@ -483,7 +483,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
         } else {
             fail "Replica is not synced"
         }
-        wait_for_value_to_propegate_to_replica $primary $replica "key1"
+        wait_for_value_to_propagate_to_replica $primary $replica "key1"
         # Confirm the occurrence of a race condition.
         wait_for_log_messages -1 {"* Psync established after rdb load*"} 0 2000 1
     }
diff --git a/tests/support/aofmanifest.tcl b/tests/support/aofmanifest.tcl
index 308d1172aa..fc20bacc99 100644
--- a/tests/support/aofmanifest.tcl
+++ b/tests/support/aofmanifest.tcl
@@ -1,5 +1,5 @@
-set ::base_aof_sufix ".base"
-set ::incr_aof_sufix ".incr"
+set ::base_aof_suffix ".base"
+set ::incr_aof_suffix ".incr"
 set ::manifest_suffix ".manifest"
 set ::aof_format_suffix ".aof"
 set ::rdb_format_suffix ".rdb"
diff --git a/tests/support/test.tcl b/tests/support/test.tcl
index 262dc66041..3fd74d0387 100644
--- a/tests/support/test.tcl
+++ b/tests/support/test.tcl
@@ -160,12 +160,12 @@ proc verify_replica_online {master replica_idx max_retry} {
     }
 }
 
-proc wait_for_value_to_propegate_to_replica {master replica key} {
+proc wait_for_value_to_propagate_to_replica {master replica key} {
     set val [$master get $key]
     wait_for_condition 50 500 {
         ([$replica get $key] eq $val)
     } else {
-        error "Key $key did not propegate. Expected $val but got [$replica get $key]"
+        error "Key $key did not propagate. Expected $val but got [$replica get $key]"
     }
 }
 
From cbe08dd0f29a5d27027a9b21695c83e5e1cc9972 Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Tue, 17 Dec 2024 17:48:53 -0800
Subject: [PATCH 021/101] Fix undefined behavior defined by ASAN (#1451)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ASan now supports making sure you are passing in the correct pointer
type, which seems useful, but we can't support it since we pass in an
incorrect pointer in several places. This is most commonly done with
generic free functions, where we simply cast it to the correct type.
It's not a lot of code to clean up, so it seems appropriate to clean it
up instead of disabling the check.

---------

Signed-off-by: Madelyn Olson
Co-authored-by: Viktor Söderqvist
---
 src/acl.c                | 20 ++++++++++----------
 src/adlist.c             |  6 ++++++
 src/adlist.h             |  1 +
 src/call_reply.c         |  2 +-
 src/db.c                 |  2 +-
 src/defrag.c             |  2 +-
 src/eval.c               |  4 ++--
 src/functions.c          |  2 +-
 src/listpack.c           |  6 ++++++
 src/listpack.h           |  1 +
 src/module.c             |  2 +-
 src/networking.c         |  2 +-
 src/replication.c        |  2 +-
 src/t_stream.c           | 19 +++++++++++++++----
 src/unit/test_listpack.c |  2 +-
 src/unit/test_ziplist.c  |  2 +-
 16 files changed, 50 insertions(+), 25 deletions(-)

diff --git a/src/acl.c b/src/acl.c
index cfcf102887..d1f970a805 100644
--- a/src/acl.c
+++ b/src/acl.c
@@ -297,11 +297,6 @@ int ACLListMatchSds(void *a, void *b) {
     return sdscmp(a, b) == 0;
 }
 
-/* Method to free list elements from ACL users password/patterns lists. */
-void ACLListFreeSds(void *item) {
-    sdsfree(item);
-}
-
 /* Method to duplicate list elements from ACL users password/patterns lists.
*/ void *ACLListDupSds(void *item) { return sdsdup(item); @@ -374,7 +369,7 @@ aclSelector *ACLCreateSelector(int flags) { listSetFreeMethod(selector->patterns, ACLListFreeKeyPattern); listSetDupMethod(selector->patterns, ACLListDupKeyPattern); listSetMatchMethod(selector->channels, ACLListMatchSds); - listSetFreeMethod(selector->channels, ACLListFreeSds); + listSetFreeMethod(selector->channels, sdsfreeVoid); listSetDupMethod(selector->channels, ACLListDupSds); memset(selector->allowed_commands, 0, sizeof(selector->allowed_commands)); @@ -445,7 +440,7 @@ user *ACLCreateUser(const char *name, size_t namelen) { u->passwords = listCreate(); u->acl_string = NULL; listSetMatchMethod(u->passwords, ACLListMatchSds); - listSetFreeMethod(u->passwords, ACLListFreeSds); + listSetFreeMethod(u->passwords, sdsfreeVoid); listSetDupMethod(u->passwords, ACLListDupSds); u->selectors = listCreate(); @@ -489,6 +484,11 @@ void ACLFreeUser(user *u) { zfree(u); } +/* Used for generic free functions. */ +static void ACLFreeUserVoid(void *u) { + ACLFreeUser(u); +} + /* When a user is deleted we need to cycle the active * connections in order to kill all the pending ones that * are authenticated with such user. */ @@ -2445,12 +2445,12 @@ sds ACLLoadFromFile(const char *filename) { c->user = new_user; } - if (user_channels) raxFreeWithCallback(user_channels, (void (*)(void *))listRelease); - raxFreeWithCallback(old_users, (void (*)(void *))ACLFreeUser); + if (user_channels) raxFreeWithCallback(user_channels, listReleaseVoid); + raxFreeWithCallback(old_users, ACLFreeUserVoid); sdsfree(errors); return NULL; } else { - raxFreeWithCallback(Users, (void (*)(void *))ACLFreeUser); + raxFreeWithCallback(Users, ACLFreeUserVoid); Users = old_users; errors = sdscat(errors, "WARNING: ACL errors detected, no change to the previously active ACL rules was performed"); diff --git a/src/adlist.c b/src/adlist.c index 11b152592b..0dc77cc038 100644 --- a/src/adlist.c +++ b/src/adlist.c @@ -77,6 +77,12 @@ void listRelease(list *list) { zfree(list); } +/* Just like listRelease, but takes the list as a (void *). + * Useful as generic free callback. */ +void listReleaseVoid(void *l) { + listRelease((list *)l); +} + /* Add a new node to the list, to head, containing the specified 'value' * pointer as value. * diff --git a/src/adlist.h b/src/adlist.h index bfc4280434..c642c1c791 100644 --- a/src/adlist.h +++ b/src/adlist.h @@ -72,6 +72,7 @@ typedef struct list { /* Prototypes */ list *listCreate(void); void listRelease(list *list); +void listReleaseVoid(void *list); void listEmpty(list *list); list *listAddNodeHead(list *list, void *value); list *listAddNodeTail(list *list, void *value); diff --git a/src/call_reply.c b/src/call_reply.c index 00d196081e..dc981b8be8 100644 --- a/src/call_reply.c +++ b/src/call_reply.c @@ -559,7 +559,7 @@ CallReply *callReplyCreateError(sds reply, void *private_data) { sdsfree(reply); } list *deferred_error_list = listCreate(); - listSetFreeMethod(deferred_error_list, (void (*)(void *))sdsfree); + listSetFreeMethod(deferred_error_list, sdsfreeVoid); listAddNodeTail(deferred_error_list, sdsnew(err_buff)); return callReplyCreate(err_buff, deferred_error_list, private_data); } diff --git a/src/db.c b/src/db.c index 1223d00c8d..e31d7e7f7f 100644 --- a/src/db.c +++ b/src/db.c @@ -1193,7 +1193,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * are deep copied temporary strings. 
We must not free them if they are just * a shallow copy - a pointer to the actual data in the data structure */ if (!shallow_copied_list_items) { - listSetFreeMethod(keys, (void (*)(void *))sdsfree); + listSetFreeMethod(keys, sdsfreeVoid); } /* For main hash table scan or scannable data structure. */ diff --git a/src/defrag.c b/src/defrag.c index 6522d9aa7b..e9f40d4fab 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -421,7 +421,7 @@ static void activeDefragQuickListNodes(quicklist *ql) { static void defragLater(robj *obj) { if (!defrag_later) { defrag_later = listCreate(); - listSetFreeMethod(defrag_later, (void (*)(void *))sdsfree); + listSetFreeMethod(defrag_later, sdsfreeVoid); defrag_later_cursor = 0; } sds key = sdsdup(objectGetKey(obj)); diff --git a/src/eval.c b/src/eval.c index a9c50cdf90..e9fac531f5 100644 --- a/src/eval.c +++ b/src/eval.c @@ -204,7 +204,7 @@ void scriptingInit(int setup) { * and we need to free them respectively. */ lctx.lua_scripts = dictCreate(&shaScriptObjectDictType); lctx.lua_scripts_lru_list = listCreate(); - listSetFreeMethod(lctx.lua_scripts_lru_list, (void (*)(void *))sdsfree); + listSetFreeMethod(lctx.lua_scripts_lru_list, sdsfreeVoid); lctx.lua_scripts_mem = 0; luaRegisterServerAPI(lua); @@ -777,7 +777,7 @@ void ldbInit(void) { ldb.conn = NULL; ldb.active = 0; ldb.logs = listCreate(); - listSetFreeMethod(ldb.logs, (void (*)(void *))sdsfree); + listSetFreeMethod(ldb.logs, sdsfreeVoid); ldb.children = listCreate(); ldb.src = NULL; ldb.lines = 0; diff --git a/src/functions.c b/src/functions.c index b694e35252..feb82d4ab7 100644 --- a/src/functions.c +++ b/src/functions.c @@ -348,7 +348,7 @@ libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx *functions_l } else { if (!old_libraries_list) { old_libraries_list = listCreate(); - listSetFreeMethod(old_libraries_list, (void (*)(void *))engineLibraryFree); + listSetFreeMethod(old_libraries_list, engineLibraryDispose); } libraryUnlink(functions_lib_ctx_dst, old_li); listAddNodeTail(old_libraries_list, old_li); diff --git a/src/listpack.c b/src/listpack.c index 2dfb321f56..76c2f9ea38 100644 --- a/src/listpack.c +++ b/src/listpack.c @@ -250,6 +250,12 @@ void lpFree(unsigned char *lp) { lp_free(lp); } +/* Same as lpFree, but useful for when you are passing the listpack + * into a generic free function that expects (void *) */ +void lpFreeVoid(void *lp) { + lp_free((unsigned char *)lp); +} + /* Shrink the memory to fit. */ unsigned char *lpShrinkToFit(unsigned char *lp) { size_t size = lpGetTotalBytes(lp); diff --git a/src/listpack.h b/src/listpack.h index aa7636143f..b143797261 100644 --- a/src/listpack.h +++ b/src/listpack.h @@ -56,6 +56,7 @@ typedef struct { unsigned char *lpNew(size_t capacity); void lpFree(unsigned char *lp); +void lpFreeVoid(void *lp); unsigned char *lpShrinkToFit(unsigned char *lp); unsigned char * lpInsertString(unsigned char *lp, unsigned char *s, uint32_t slen, unsigned char *p, int where, unsigned char **newp); diff --git a/src/module.c b/src/module.c index 36283e2c73..541ae490ab 100644 --- a/src/module.c +++ b/src/module.c @@ -10399,7 +10399,7 @@ ValkeyModuleServerInfoData *VM_GetServerInfo(ValkeyModuleCtx *ctx, const char *s * context instead of passing NULL. 
*/ void VM_FreeServerInfo(ValkeyModuleCtx *ctx, ValkeyModuleServerInfoData *data) { if (ctx != NULL) autoMemoryFreed(ctx, VALKEYMODULE_AM_INFO, data); - raxFreeWithCallback(data->rax, (void (*)(void *))sdsfree); + raxFreeWithCallback(data->rax, sdsfreeVoid); zfree(data); } diff --git a/src/networking.c b/src/networking.c index 4d386d6dc4..16147ff0ba 100644 --- a/src/networking.c +++ b/src/networking.c @@ -556,7 +556,7 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { if (c->flag.module) { if (!c->deferred_reply_errors) { c->deferred_reply_errors = listCreate(); - listSetFreeMethod(c->deferred_reply_errors, (void (*)(void *))sdsfree); + listSetFreeMethod(c->deferred_reply_errors, sdsfreeVoid); } listAddNodeTail(c->deferred_reply_errors, sdsnewlen(s, len)); return; diff --git a/src/replication.c b/src/replication.c index b5ce77f5e0..3a207a1d0f 100644 --- a/src/replication.c +++ b/src/replication.c @@ -282,7 +282,7 @@ void removeReplicaFromPsyncWait(client *replica_main_client) { void resetReplicationBuffer(void) { server.repl_buffer_mem = 0; server.repl_buffer_blocks = listCreate(); - listSetFreeMethod(server.repl_buffer_blocks, (void (*)(void *))zfree); + listSetFreeMethod(server.repl_buffer_blocks, zfree); } int canFeedReplicaReplBuffer(client *replica) { diff --git a/src/t_stream.c b/src/t_stream.c index 79aa080703..17254b58dd 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -54,6 +54,7 @@ #define STREAM_LISTPACK_MAX_SIZE (1 << 30) void streamFreeCG(streamCG *cg); +void streamFreeCGVoid(void *cg); void streamFreeNACK(streamNACK *na); size_t streamReplyWithRangeFromConsumerPEL(client *c, stream *s, @@ -86,8 +87,8 @@ stream *streamNew(void) { /* Free a stream, including the listpacks stored inside the radix tree. */ void freeStream(stream *s) { - raxFreeWithCallback(s->rax, (void (*)(void *))lpFree); - if (s->cgroups) raxFreeWithCallback(s->cgroups, (void (*)(void *))streamFreeCG); + raxFreeWithCallback(s->rax, lpFreeVoid); + if (s->cgroups) raxFreeWithCallback(s->cgroups, streamFreeCGVoid); zfree(s); } @@ -2454,6 +2455,11 @@ void streamFreeConsumer(streamConsumer *sc) { zfree(sc); } +/* Used for generic free functions. */ +static void streamFreeConsumerVoid(void *sc) { + streamFreeConsumer((streamConsumer *)sc); +} + /* Create a new consumer group in the context of the stream 's', having the * specified name, last server ID and reads counter. If a consumer group with * the same name already exists NULL is returned, otherwise the pointer to the @@ -2473,11 +2479,16 @@ streamCG *streamCreateCG(stream *s, char *name, size_t namelen, streamID *id, lo /* Free a consumer group and all its associated data. */ void streamFreeCG(streamCG *cg) { - raxFreeWithCallback(cg->pel, (void (*)(void *))streamFreeNACK); - raxFreeWithCallback(cg->consumers, (void (*)(void *))streamFreeConsumer); + raxFreeWithCallback(cg->pel, zfree); + raxFreeWithCallback(cg->consumers, streamFreeConsumerVoid); zfree(cg); } +/* Used for generic free functions. */ +void streamFreeCGVoid(void *cg) { + streamFreeCG((streamCG *)cg); +} + /* Lookup the consumer group in the specified stream and returns its * pointer, otherwise if there is no such group, NULL is returned. 
*/ streamCG *streamLookupCG(stream *s, sds groupname) { diff --git a/src/unit/test_listpack.c b/src/unit/test_listpack.c index 4838fc8952..0c71da18db 100644 --- a/src/unit/test_listpack.c +++ b/src/unit/test_listpack.c @@ -1184,7 +1184,7 @@ int test_listpackStressWithRandom(int argc, char **argv, int flags) { for (i = 0; i < iteration; i++) { lp = lpNew(0); ref = listCreate(); - listSetFreeMethod(ref, (void (*)(void *))sdsfree); + listSetFreeMethod(ref, sdsfreeVoid); len = rand() % 256; /* Create lists */ diff --git a/src/unit/test_ziplist.c b/src/unit/test_ziplist.c index d2f7ebe69c..58687d81fc 100644 --- a/src/unit/test_ziplist.c +++ b/src/unit/test_ziplist.c @@ -645,7 +645,7 @@ int test_ziplistStressWithRandomPayloadsOfDifferentEncoding(int argc, char **arg for (i = 0; i < iteration; i++) { zl = ziplistNew(); ref = listCreate(); - listSetFreeMethod(ref, (void (*)(void *))sdsfree); + listSetFreeMethod(ref, sdsfreeVoid); len = rand() % 256; /* Create lists */ From c72089932c75c9ad6722f17e62bc6299ab142721 Mon Sep 17 00:00:00 2001 From: uriyage <78144248+uriyage@users.noreply.github.com> Date: Wed, 18 Dec 2024 09:03:30 +0200 Subject: [PATCH 022/101] Offload TLS negotiation to I/O threads (#1338) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## TLS Negotiation Offloading to I/O Threads ### Overview This PR introduces the ability to offload TLS handshake negotiations to I/O threads, significantly improving performance under high TLS connection loads. ### Key Changes - Added infrastructure to offload TLS negotiations to I/O threads - Refactored SSL event handling to allow I/O threads modify conn flags. - Introduced new connection flag to identify client connections ### Performance Impact Testing with 650 clients with SET commands and 160 new TLS connections per second in the background: #### Throughput Impact of new TLS connections - **With Offloading**: Minimal impact (1050K → 990K ops/sec) - **Without Offloading**: Significant drop (1050K → 670K ops/sec) #### New Connection Rate - **With Offloading**: - 1,757 conn/sec - **Without Offloading**: - 477 conn/sec ### Implementation Details 1. **Main Thread**: - Initiates negotiation-offload jobs to I/O threads - Adds connections to pending-read clients list (using existing read offload mechanism) - Post-negotiation handling: - Creates read/write events if needed for incomplete negotiations - Calls accept handler for completed negotiations 2. 
**I/O Thread**: - Performs TLS negotiation - Updates connection flags based on negotiation result Related issue:https://github.com/valkey-io/valkey/issues/761 --------- Signed-off-by: Uri Yagelnik Signed-off-by: ranshid <88133677+ranshid@users.noreply.github.com> Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com> Co-authored-by: Madelyn Olson --- .github/workflows/daily.yml | 38 ++++++++++ src/connection.h | 5 +- src/io_threads.c | 52 ++++++++++++++ src/io_threads.h | 1 + src/networking.c | 6 ++ src/server.c | 2 + src/server.h | 1 + src/tls.c | 139 ++++++++++++++++++------------------ 8 files changed, 174 insertions(+), 70 deletions(-) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index 44386f5ffd..e1d577b51b 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -375,6 +375,44 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster --io-threads ${{github.event.inputs.cluster_test_args}} + test-ubuntu-tls-io-threads: + runs-on: ubuntu-latest + if: | + (github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || + (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && + !contains(github.event.inputs.skipjobs, 'tls') && !contains(github.event.inputs.skipjobs, 'iothreads') + timeout-minutes: 14400 + steps: + - name: prep + if: github.event_name == 'workflow_dispatch' + run: | + echo "GITHUB_REPOSITORY=${{github.event.inputs.use_repo}}" >> $GITHUB_ENV + echo "GITHUB_HEAD_REF=${{github.event.inputs.use_git_ref}}" >> $GITHUB_ENV + echo "skipjobs: ${{github.event.inputs.skipjobs}}" + echo "skiptests: ${{github.event.inputs.skiptests}}" + echo "test_args: ${{github.event.inputs.test_args}}" + echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}" + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + repository: ${{ env.GITHUB_REPOSITORY }} + ref: ${{ env.GITHUB_HEAD_REF }} + - name: make + run: | + make BUILD_TLS=yes SERVER_CFLAGS='-Werror' + - name: testprep + run: | + sudo apt-get install tcl8.6 tclx tcl-tls + ./utils/gen-test-certs.sh + - name: test + if: true && !contains(github.event.inputs.skiptests, 'valkey') + run: | + ./runtest --io-threads --tls --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}} + - name: cluster tests + if: true && !contains(github.event.inputs.skiptests, 'cluster') + run: | + ./runtest-cluster --io-threads --tls ${{github.event.inputs.cluster_test_args}} + test-ubuntu-reclaim-cache: runs-on: ubuntu-latest if: | diff --git a/src/connection.h b/src/connection.h index 8a2775ee34..fd7e0910cf 100644 --- a/src/connection.h +++ b/src/connection.h @@ -54,8 +54,9 @@ typedef enum { CONN_STATE_ERROR } ConnectionState; -#define CONN_FLAG_CLOSE_SCHEDULED (1 << 0) /* Closed scheduled by a handler */ -#define CONN_FLAG_WRITE_BARRIER (1 << 1) /* Write barrier requested */ +#define CONN_FLAG_CLOSE_SCHEDULED (1 << 0) /* Closed scheduled by a handler */ +#define CONN_FLAG_WRITE_BARRIER (1 << 1) /* Write barrier requested */ +#define CONN_FLAG_ALLOW_ACCEPT_OFFLOAD (1 << 2) /* Connection accept can be offloaded to IO threads. 
*/ #define CONN_TYPE_SOCKET "tcp" #define CONN_TYPE_UNIX "unix" diff --git a/src/io_threads.c b/src/io_threads.c index 3865eb77c3..90f5b88700 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -561,3 +561,55 @@ void trySendPollJobToIOThreads(void) { aeSetPollProtect(server.el, 1); IOJobQueue_push(jq, IOThreadPoll, server.el); } + +static void ioThreadAccept(void *data) { + client *c = (client *)data; + connAccept(c->conn, NULL); + c->io_read_state = CLIENT_COMPLETED_IO; +} + +/* + * Attempts to offload an Accept operation (currently used for TLS accept) for a client + * connection to I/O threads. + * + * Returns: + * C_OK - If the accept operation was successfully queued for processing + * C_ERR - If the connection is not eligible for offloading + * + * Parameters: + * conn - The connection object to perform the accept operation on + */ +int trySendAcceptToIOThreads(connection *conn) { + if (server.io_threads_num <= 1) { + return C_ERR; + } + + if (!(conn->flags & CONN_FLAG_ALLOW_ACCEPT_OFFLOAD)) { + return C_ERR; + } + + client *c = connGetPrivateData(conn); + if (c->io_read_state != CLIENT_IDLE) { + return C_OK; + } + + if (server.active_io_threads_num <= 1) { + return C_ERR; + } + + size_t thread_id = (c->id % (server.active_io_threads_num - 1)) + 1; + IOJobQueue *job_queue = &io_jobs[thread_id]; + + if (IOJobQueue_isFull(job_queue)) { + return C_ERR; + } + + c->io_read_state = CLIENT_PENDING_IO; + c->flag.pending_read = 1; + listLinkNodeTail(server.clients_pending_io_read, &c->pending_read_list_node); + connSetPostponeUpdateState(c->conn, 1); + server.stat_io_accept_offloaded++; + IOJobQueue_push(job_queue, ioThreadAccept, c); + + return C_OK; +} diff --git a/src/io_threads.h b/src/io_threads.h index 8818f08588..a3ff582a77 100644 --- a/src/io_threads.h +++ b/src/io_threads.h @@ -13,5 +13,6 @@ int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv); void adjustIOThreadsByEventLoad(int numevents, int increase_only); void drainIOThreadsQueue(void); void trySendPollJobToIOThreads(void); +int trySendAcceptToIOThreads(connection *conn); #endif /* IO_THREADS_H */ diff --git a/src/networking.c b/src/networking.c index 16147ff0ba..9f36f24275 100644 --- a/src/networking.c +++ b/src/networking.c @@ -134,6 +134,7 @@ client *createClient(connection *conn) { if (server.tcpkeepalive) connKeepAlive(conn, server.tcpkeepalive); connSetReadHandler(conn, readQueryFromClient); connSetPrivateData(conn, c); + conn->flags |= CONN_FLAG_ALLOW_ACCEPT_OFFLOAD; } c->buf = zmalloc_usable(PROTO_REPLY_CHUNK_BYTES, &c->buf_usable_size); selectDb(c, 0); @@ -4805,9 +4806,14 @@ int processIOThreadsReadDone(void) { processed++; server.stat_io_reads_processed++; + /* Save the current conn state, as connUpdateState may modify it */ + int in_accept_state = (connGetState(c->conn) == CONN_STATE_ACCEPTING); connSetPostponeUpdateState(c->conn, 0); connUpdateState(c->conn); + /* In accept state, no client's data was read - stop here. */ + if (in_accept_state) continue; + /* On read error - stop here. 
*/ if (handleReadResult(c) == C_ERR) { continue; diff --git a/src/server.c b/src/server.c index db39970632..a0c642b541 100644 --- a/src/server.c +++ b/src/server.c @@ -2645,6 +2645,7 @@ void resetServerStats(void) { server.stat_total_reads_processed = 0; server.stat_io_writes_processed = 0; server.stat_io_freed_objects = 0; + server.stat_io_accept_offloaded = 0; server.stat_poll_processed_by_io_threads = 0; server.stat_total_writes_processed = 0; server.stat_client_qbuf_limit_disconnections = 0; @@ -5922,6 +5923,7 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "io_threaded_reads_processed:%lld\r\n", server.stat_io_reads_processed, "io_threaded_writes_processed:%lld\r\n", server.stat_io_writes_processed, "io_threaded_freed_objects:%lld\r\n", server.stat_io_freed_objects, + "io_threaded_accept_processed:%lld\r\n", server.stat_io_accept_offloaded, "io_threaded_poll_processed:%lld\r\n", server.stat_poll_processed_by_io_threads, "io_threaded_total_prefetch_batches:%lld\r\n", server.stat_total_prefetch_batches, "io_threaded_total_prefetch_entries:%lld\r\n", server.stat_total_prefetch_entries, diff --git a/src/server.h b/src/server.h index b0e4ae1050..d8497ccff5 100644 --- a/src/server.h +++ b/src/server.h @@ -1869,6 +1869,7 @@ struct valkeyServer { long long stat_io_reads_processed; /* Number of read events processed by IO threads */ long long stat_io_writes_processed; /* Number of write events processed by IO threads */ long long stat_io_freed_objects; /* Number of objects freed by IO threads */ + long long stat_io_accept_offloaded; /* Number of offloaded accepts */ long long stat_poll_processed_by_io_threads; /* Total number of poll jobs processed by IO */ long long stat_total_reads_processed; /* Total number of read events processed */ long long stat_total_writes_processed; /* Total number of write events processed */ diff --git a/src/tls.c b/src/tls.c index 48b75553de..11e6143561 100644 --- a/src/tls.c +++ b/src/tls.c @@ -32,6 +32,7 @@ #include "server.h" #include "connhelpers.h" #include "adlist.h" +#include "io_threads.h" #if (USE_OPENSSL == 1 /* BUILD_YES */) || ((USE_OPENSSL == 2 /* BUILD_MODULE */) && (BUILD_TLS_MODULE == 2)) @@ -437,16 +438,13 @@ static ConnectionType CT_TLS; * */ -typedef enum { - WANT_READ = 1, - WANT_WRITE -} WantIOType; - #define TLS_CONN_FLAG_READ_WANT_WRITE (1 << 0) #define TLS_CONN_FLAG_WRITE_WANT_READ (1 << 1) #define TLS_CONN_FLAG_FD_SET (1 << 2) #define TLS_CONN_FLAG_POSTPONE_UPDATE_STATE (1 << 3) #define TLS_CONN_FLAG_HAS_PENDING (1 << 4) +#define TLS_CONN_FLAG_ACCEPT_ERROR (1 << 5) +#define TLS_CONN_FLAG_ACCEPT_SUCCESS (1 << 6) typedef struct tls_connection { connection c; @@ -514,20 +512,26 @@ static connection *connCreateAcceptedTLS(int fd, void *priv) { return (connection *)conn; } +static int connTLSAccept(connection *_conn, ConnectionCallbackFunc accept_handler); static void tlsEventHandler(struct aeEventLoop *el, int fd, void *clientData, int mask); static void updateSSLEvent(tls_connection *conn); +static void clearTLSWantFlags(tls_connection *conn) { + conn->flags &= ~(TLS_CONN_FLAG_WRITE_WANT_READ | TLS_CONN_FLAG_READ_WANT_WRITE); +} + /* Process the return code received from OpenSSL> - * Update the want parameter with expected I/O. + * Update the conn flags with the WANT_READ/WANT_WRITE flags. * Update the connection's error state if a real error has occurred. * Returns an SSL error code, or 0 if no further handling is required. 
*/ -static int handleSSLReturnCode(tls_connection *conn, int ret_value, WantIOType *want) { +static int handleSSLReturnCode(tls_connection *conn, int ret_value) { + clearTLSWantFlags(conn); if (ret_value <= 0) { int ssl_err = SSL_get_error(conn->ssl, ret_value); switch (ssl_err) { - case SSL_ERROR_WANT_WRITE: *want = WANT_WRITE; return 0; - case SSL_ERROR_WANT_READ: *want = WANT_READ; return 0; + case SSL_ERROR_WANT_WRITE: conn->flags |= TLS_CONN_FLAG_READ_WANT_WRITE; return 0; + case SSL_ERROR_WANT_READ: conn->flags |= TLS_CONN_FLAG_WRITE_WANT_READ; return 0; case SSL_ERROR_SYSCALL: conn->c.last_errno = errno; if (conn->ssl_error) zfree(conn->ssl_error); @@ -563,11 +567,8 @@ static int updateStateAfterSSLIO(tls_connection *conn, int ret_value, int update } if (ret_value <= 0) { - WantIOType want = 0; int ssl_err; - if (!(ssl_err = handleSSLReturnCode(conn, ret_value, &want))) { - if (want == WANT_READ) conn->flags |= TLS_CONN_FLAG_WRITE_WANT_READ; - if (want == WANT_WRITE) conn->flags |= TLS_CONN_FLAG_READ_WANT_WRITE; + if (!(ssl_err = handleSSLReturnCode(conn, ret_value))) { if (update_event) updateSSLEvent(conn); errno = EAGAIN; return -1; @@ -585,19 +586,17 @@ static int updateStateAfterSSLIO(tls_connection *conn, int ret_value, int update return ret_value; } -static void registerSSLEvent(tls_connection *conn, WantIOType want) { +static void registerSSLEvent(tls_connection *conn) { int mask = aeGetFileEvents(server.el, conn->c.fd); - switch (want) { - case WANT_READ: + if (conn->flags & TLS_CONN_FLAG_WRITE_WANT_READ) { if (mask & AE_WRITABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE); if (!(mask & AE_READABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE, tlsEventHandler, conn); - break; - case WANT_WRITE: + } else if (conn->flags & TLS_CONN_FLAG_READ_WANT_WRITE) { if (mask & AE_READABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_READABLE); if (!(mask & AE_WRITABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_WRITABLE, tlsEventHandler, conn); - break; - default: serverAssert(0); break; + } else { + serverAssert(0); } } @@ -650,12 +649,47 @@ static void updateSSLEvent(tls_connection *conn) { if (!need_write && (mask & AE_WRITABLE)) aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE); } +static int TLSHandleAcceptResult(tls_connection *conn, int call_handler_on_error) { + serverAssert(conn->c.state == CONN_STATE_ACCEPTING); + if (conn->flags & TLS_CONN_FLAG_ACCEPT_SUCCESS) { + conn->c.state = CONN_STATE_CONNECTED; + } else if (conn->flags & TLS_CONN_FLAG_ACCEPT_ERROR) { + conn->c.state = CONN_STATE_ERROR; + if (!call_handler_on_error) return C_ERR; + } else { + /* Still pending accept */ + registerSSLEvent(conn); + return C_OK; + } + + /* call accept handler */ + if (!callHandler((connection *)conn, conn->c.conn_handler)) return C_ERR; + conn->c.conn_handler = NULL; + return C_OK; +} + static void updateSSLState(connection *conn_) { tls_connection *conn = (tls_connection *)conn_; + + if (conn->c.state == CONN_STATE_ACCEPTING) { + if (TLSHandleAcceptResult(conn, 1) == C_ERR || conn->c.state != CONN_STATE_CONNECTED) return; + } + updateSSLEvent(conn); updatePendingData(conn); } +static void TLSAccept(void *_conn) { + tls_connection *conn = (tls_connection *)_conn; + ERR_clear_error(); + int ret = SSL_accept(conn->ssl); + if (ret > 0) { + conn->flags |= TLS_CONN_FLAG_ACCEPT_SUCCESS; + } else if (handleSSLReturnCode(conn, ret)) { + conn->flags |= TLS_CONN_FLAG_ACCEPT_ERROR; + } +} + static void tlsHandleEvent(tls_connection *conn, int mask) { int ret, conn_error; @@ 
-676,10 +710,8 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { } ret = SSL_connect(conn->ssl); if (ret <= 0) { - WantIOType want = 0; - if (!handleSSLReturnCode(conn, ret, &want)) { - registerSSLEvent(conn, want); - + if (!handleSSLReturnCode(conn, ret)) { + registerSSLEvent(conn); /* Avoid hitting UpdateSSLEvent, which knows nothing * of what SSL_connect() wants and instead looks at our * R/W handlers. @@ -698,27 +730,7 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { conn->c.conn_handler = NULL; break; case CONN_STATE_ACCEPTING: - ERR_clear_error(); - ret = SSL_accept(conn->ssl); - if (ret <= 0) { - WantIOType want = 0; - if (!handleSSLReturnCode(conn, ret, &want)) { - /* Avoid hitting UpdateSSLEvent, which knows nothing - * of what SSL_connect() wants and instead looks at our - * R/W handlers. - */ - registerSSLEvent(conn, want); - return; - } - - /* If not handled, it's an error */ - conn->c.state = CONN_STATE_ERROR; - } else { - conn->c.state = CONN_STATE_CONNECTED; - } - - if (!callHandler((connection *)conn, conn->c.conn_handler)) return; - conn->c.conn_handler = NULL; + if (connTLSAccept((connection *)conn, NULL) == C_ERR || conn->c.state != CONN_STATE_CONNECTED) return; break; case CONN_STATE_CONNECTED: { int call_read = ((mask & AE_READABLE) && conn->c.read_handler) || @@ -740,20 +752,17 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { int invert = conn->c.flags & CONN_FLAG_WRITE_BARRIER; if (!invert && call_read) { - conn->flags &= ~TLS_CONN_FLAG_READ_WANT_WRITE; if (!callHandler((connection *)conn, conn->c.read_handler)) return; } /* Fire the writable event. */ if (call_write) { - conn->flags &= ~TLS_CONN_FLAG_WRITE_WANT_READ; if (!callHandler((connection *)conn, conn->c.write_handler)) return; } /* If we have to invert the call, fire the readable event now * after the writable one. 
*/ if (invert && call_read) { - conn->flags &= ~TLS_CONN_FLAG_READ_WANT_WRITE; if (!callHandler((connection *)conn, conn->c.read_handler)) return; } updatePendingData(conn); @@ -845,31 +854,25 @@ static void connTLSClose(connection *conn_) { static int connTLSAccept(connection *_conn, ConnectionCallbackFunc accept_handler) { tls_connection *conn = (tls_connection *)_conn; - int ret; - if (conn->c.state != CONN_STATE_ACCEPTING) return C_ERR; - ERR_clear_error(); - + int call_handler_on_error = 1; /* Try to accept */ - conn->c.conn_handler = accept_handler; - ret = SSL_accept(conn->ssl); - - if (ret <= 0) { - WantIOType want = 0; - if (!handleSSLReturnCode(conn, ret, &want)) { - registerSSLEvent(conn, want); /* We'll fire back */ - return C_OK; - } else { - conn->c.state = CONN_STATE_ERROR; - return C_ERR; - } + if (accept_handler) { + conn->c.conn_handler = accept_handler; + call_handler_on_error = 0; } - conn->c.state = CONN_STATE_CONNECTED; - if (!callHandler((connection *)conn, conn->c.conn_handler)) return C_OK; - conn->c.conn_handler = NULL; + /* We're in IO thread - just call accept and return, the main thread will handle the rest */ + if (!inMainThread()) { + TLSAccept(conn); + return C_OK; + } - return C_OK; + /* Try to offload accept to IO threads */ + if (trySendAcceptToIOThreads(_conn) == C_OK) return C_OK; + + TLSAccept(conn); + return TLSHandleAcceptResult(conn, call_handler_on_error); } static int connTLSConnect(connection *conn_, From bfad1106a11086175d3bab6703f217e25d41548d Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Wed, 18 Dec 2024 09:17:11 -0800 Subject: [PATCH 023/101] Attempt to read secondary error from info test (#1452) The test attempts to write 1MB of data in order to trigger a disconnect. Normally, the data is fully flushed and we get the error on the read (I/O error). However, it's possible we might fail the write, which leaves the client in an inconsistent state. On the next command, we finally process the I/O error on the FD. So, the simple fix is to consume any secondary errors. --------- Signed-off-by: Madelyn Olson --- tests/unit/info.tcl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index a27043fa88..11dc4e5d40 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -391,7 +391,13 @@ start_server {tags {"info" "external:skip" "debug_defrag:skip"}} { # set qbuf limit to minimum to test stat set org_qbuf_limit [lindex [r config get client-query-buffer-limit] 1] r config set client-query-buffer-limit 1048576 - catch {r set key [string repeat a 1048576]} + catch {r set key [string repeat a 2048576]} e + # We might get an error on the write path of the previous command, which won't be + # an I/O error based on how the client is designed. We will need to manually consume + # the secondary I/O error. + if {![string match "I/O error*" $e]} { + catch {r read} + } set info [r info stats] assert_equal [getInfoProperty $info client_query_buffer_limit_disconnections] {1} r config set client-query-buffer-limit $org_qbuf_limit From 84c1a44ce96328ddb36e167e0899f4626828b0fc Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Wed, 18 Dec 2024 22:18:02 -0800 Subject: [PATCH 024/101] Add a hint about the current file for TCL debugging (#1459) There are some tests that fail and give no useful information since they are outside of a test context. Now we will at least get the file we are located in. We can sort of reverse engineer where we are in the test by seeing which tests have finished in a file. 
```
[TIMEOUT]: clients state report follows.
sock6 => (SPAWNED SERVER) pid:30375 - tests/unit/info.tcl
Killing still running Valkey server 30375 - tests/unit/info.tcl
```

Signed-off-by: Madelyn Olson
---
 tests/support/server.tcl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/support/server.tcl b/tests/support/server.tcl
index 8c545d900a..bd3135e9d9 100644
--- a/tests/support/server.tcl
+++ b/tests/support/server.tcl
@@ -314,7 +314,7 @@ proc spawn_server {config_file stdout stderr args} {
     }

     # Tell the test server about this new instance.
-    send_data_packet $::test_server_fd server-spawned $pid
+    send_data_packet $::test_server_fd server-spawned "$pid - $::curfile"
     return $pid
 }

From fff1573236a9c0965fe235b68e3c547e5a36b219 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Thu, 19 Dec 2024 16:12:34 +0800
Subject: [PATCH 025/101] Minor log fixes when failover auth denied due to slot epoch (#1341)

The old reqEpoch mainly refers to requestCurrentEpoch, see:
```
if (requestCurrentEpoch < server.cluster->currentEpoch) {
    serverLog(LL_WARNING, "Failover auth denied to %.40s (%s): reqEpoch (%llu) < curEpoch(%llu)", node->name,
              node->human_nodename, (unsigned long long)requestCurrentEpoch,
              (unsigned long long)server.cluster->currentEpoch);
    return;
}
```
The log changed here, however, refers to requestConfigEpoch, so calling it
reqEpoch is misleading; change it to reqConfigEpoch to make this clear.

Signed-off-by: Binbin
---
 src/cluster_legacy.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 9ddcf6678d..bbf63d46b9 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -4430,7 +4430,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
          * by the replica requesting our vote. Refuse to vote for this replica. */
         serverLog(LL_WARNING,
                   "Failover auth denied to %.40s (%s): "
-                  "slot %d epoch (%llu) > reqEpoch (%llu)",
+                  "slot %d epoch (%llu) > reqConfigEpoch (%llu)",
                   node->name, node->human_nodename, j, (unsigned long long)server.cluster->slots[j]->configEpoch,
                   (unsigned long long)requestConfigEpoch);
         return;
@@ -4721,8 +4721,8 @@ void clusterHandleReplicaFailover(void) {
     if (server.cluster->failover_auth_sent == 0) {
         server.cluster->currentEpoch++;
         server.cluster->failover_auth_epoch = server.cluster->currentEpoch;
-        serverLog(LL_NOTICE, "Starting a failover election for epoch %llu.",
-                  (unsigned long long)server.cluster->currentEpoch);
+        serverLog(LL_NOTICE, "Starting a failover election for epoch %llu, node config epoch is %llu",
+                  (unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself));
         clusterRequestFailoverAuth();
         server.cluster->failover_auth_sent = 1;
         clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);

From 4aa656fb393fa243aade9bb7034dd7cba5b7e3fe Mon Sep 17 00:00:00 2001
From: Jungwoo Song <37579681+bluayer@users.noreply.github.com>
Date: Fri, 20 Dec 2024 01:32:31 +0900
Subject: [PATCH 026/101] Support for reading from replicas in valkey-benchmark (#1392)

**Background**
When conducting performance tests using `valkey-benchmark`, reading from
replicas was not supported. Consequently, even in cluster mode, all reads
were directed to the primary nodes. This limitation made it challenging to
obtain accurate metrics during workload stress testing for performance
measurement or before a version upgrade.

Related issue: https://github.com/valkey-io/valkey/issues/900

**Changes**
1.
Replaced the use of `CLUSTER NODES` with `CLUSTER SLOTS` when fetching cluster configuration. This allows for easier identification of replica slots. 2. Support for reading from replicas by executing the client in `READONLY` mode. 3. Support reading from replicas even during slot migrations. 4. Introduced two CLI options `--rfr` to enable reading from replicas only or all cluster nodes. A warning added to indicate that write requests might not be handled correctly when using this option. --------- Signed-off-by: bluayer Signed-off-by: bluayer Signed-off-by: Jungwoo Song <37579681+bluayer@users.noreply.github.com> Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com> --- src/valkey-benchmark.c | 354 +++++++++++++++++++---------------------- 1 file changed, 168 insertions(+), 186 deletions(-) diff --git a/src/valkey-benchmark.c b/src/valkey-benchmark.c index 57cdd6fc16..1924203ae7 100644 --- a/src/valkey-benchmark.c +++ b/src/valkey-benchmark.c @@ -77,6 +77,13 @@ struct benchmarkThread; struct clusterNode; struct serverConfig; +/* Read from replica options */ +typedef enum readFromReplica { + FROM_PRIMARY_ONLY = 0, /* default option */ + FROM_REPLICA_ONLY, + FROM_ALL +} readFromReplica; + static struct config { aeEventLoop *el; cliConnInfo conn_info; @@ -112,6 +119,7 @@ static struct config { int num_threads; struct benchmarkThread **threads; int cluster_mode; + readFromReplica read_from_replica; int cluster_node_count; struct clusterNode **cluster_nodes; struct serverConfig *redis_config; @@ -168,12 +176,6 @@ typedef struct clusterNode { int *updated_slots; /* Used by updateClusterSlotsConfiguration */ int updated_slots_count; /* Used by updateClusterSlotsConfiguration */ int replicas_count; - sds *migrating; /* An array of sds where even strings are slots and odd - * strings are the destination node IDs. */ - sds *importing; /* An array of sds where even strings are slots and odd - * strings are the source node IDs. */ - int migrating_count; /* Length of the migrating array (migrating slots*2) */ - int importing_count; /* Length of the importing array (importing slots*2) */ struct serverConfig *redis_config; } clusterNode; @@ -228,6 +230,15 @@ static int dictSdsKeyCompare(const void *key1, const void *key2) { return memcmp(key1, key2, l1) == 0; } +static dictType dtype = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + dictSdsKeyCompare, /* key compare */ + NULL, /* key destructor */ + NULL, /* val destructor */ + NULL /* allow to expand */ +}; + static redisContext *getRedisContext(const char *ip, int port, const char *hostsocket) { redisContext *ctx = NULL; redisReply *reply = NULL; @@ -710,6 +721,15 @@ static client createClient(char *cmd, size_t len, client from, int thread_id) { c->prefix_pending++; } + if (config.cluster_mode && (config.read_from_replica == FROM_REPLICA_ONLY || config.read_from_replica == FROM_ALL)) { + char *buf = NULL; + int len; + len = redisFormatCommand(&buf, "READONLY"); + c->obuf = sdscatlen(c->obuf, buf, len); + free(buf); + c->prefix_pending++; + } + c->prefixlen = sdslen(c->obuf); /* Append the request itself. 
*/ if (from) { @@ -835,7 +855,15 @@ static void showLatencyReport(void) { printf(" %d bytes payload\n", config.datasize); printf(" keep alive: %d\n", config.keepalive); if (config.cluster_mode) { - printf(" cluster mode: yes (%d primaries)\n", config.cluster_node_count); + const char *node_roles = NULL; + if (config.read_from_replica == FROM_ALL) { + node_roles = "cluster"; + } else if (config.read_from_replica == FROM_REPLICA_ONLY) { + node_roles = "replica"; + } else { + node_roles = "primary"; + } + printf(" cluster mode: yes (%d %s)\n", config.cluster_node_count, node_roles); int m; for (m = 0; m < config.cluster_node_count; m++) { clusterNode *node = config.cluster_nodes[m]; @@ -1009,26 +1037,13 @@ static clusterNode *createClusterNode(char *ip, int port) { node->slots_count = 0; node->updated_slots = NULL; node->updated_slots_count = 0; - node->migrating = NULL; - node->importing = NULL; - node->migrating_count = 0; - node->importing_count = 0; node->redis_config = NULL; return node; } static void freeClusterNode(clusterNode *node) { - int i; if (node->name) sdsfree(node->name); if (node->replicate) sdsfree(node->replicate); - if (node->migrating != NULL) { - for (i = 0; i < node->migrating_count; i++) sdsfree(node->migrating[i]); - zfree(node->migrating); - } - if (node->importing != NULL) { - for (i = 0; i < node->importing_count; i++) sdsfree(node->importing[i]); - zfree(node->importing); - } /* If the node is not the reference node, that uses the address from * config.conn_info.hostip and config.conn_info.hostport, then the node ip has been * allocated by fetchClusterConfiguration, so it must be freed. */ @@ -1056,157 +1071,85 @@ static clusterNode **addClusterNode(clusterNode *node) { return config.cluster_nodes; } -/* TODO: This should be refactored to use CLUSTER SLOTS, the migrating/importing - * information is anyway not used. 
- */ static int fetchClusterConfiguration(void) { int success = 1; redisContext *ctx = NULL; redisReply *reply = NULL; + dict *nodes = NULL; + const char *errmsg = "Failed to fetch cluster configuration"; + size_t i, j; ctx = getRedisContext(config.conn_info.hostip, config.conn_info.hostport, config.hostsocket); if (ctx == NULL) { exit(1); } - clusterNode *firstNode = createClusterNode((char *)config.conn_info.hostip, config.conn_info.hostport); - if (!firstNode) { + + reply = redisCommand(ctx, "CLUSTER SLOTS"); + if (reply == NULL || reply->type == REDIS_REPLY_ERROR) { success = 0; + if (reply) fprintf(stderr, "%s\nCLUSTER SLOTS ERROR: %s\n", errmsg, reply->str); goto cleanup; } - reply = redisCommand(ctx, "CLUSTER NODES"); - success = (reply != NULL); - if (!success) goto cleanup; - success = (reply->type != REDIS_REPLY_ERROR); - if (!success) { - if (config.hostsocket == NULL) { - fprintf(stderr, "Cluster node %s:%d replied with error:\n%s\n", config.conn_info.hostip, - config.conn_info.hostport, reply->str); - } else { - fprintf(stderr, "Cluster node %s replied with error:\n%s\n", config.hostsocket, reply->str); - } - goto cleanup; - } - char *lines = reply->str, *p, *line; - while ((p = strstr(lines, "\n")) != NULL) { - *p = '\0'; - line = lines; - lines = p + 1; - char *name = NULL, *addr = NULL, *flags = NULL, *primary_id = NULL; - int i = 0; - while ((p = strchr(line, ' ')) != NULL) { - *p = '\0'; - char *token = line; - line = p + 1; - switch (i++) { - case 0: name = token; break; - case 1: addr = token; break; - case 2: flags = token; break; - case 3: primary_id = token; break; - } - if (i == 8) break; // Slots - } - if (!flags) { - fprintf(stderr, "Invalid CLUSTER NODES reply: missing flags.\n"); - success = 0; - goto cleanup; - } - int myself = (strstr(flags, "myself") != NULL); - int is_replica = (strstr(flags, "slave") != NULL || (primary_id != NULL && primary_id[0] != '-')); - if (is_replica) continue; - if (addr == NULL) { - fprintf(stderr, "Invalid CLUSTER NODES reply: missing addr.\n"); - success = 0; - goto cleanup; - } - clusterNode *node = NULL; - char *ip = NULL; - int port = 0; - char *paddr = strrchr(addr, ':'); - if (paddr != NULL) { - *paddr = '\0'; - ip = addr; - addr = paddr + 1; - /* If internal bus is specified, then just drop it. 
*/ - if ((paddr = strchr(addr, '@')) != NULL) *paddr = '\0'; - port = atoi(addr); - } - if (myself) { - node = firstNode; - if (ip != NULL && strcmp(node->ip, ip) != 0) { - node->ip = sdsnew(ip); - node->port = port; + assert(reply->type == REDIS_REPLY_ARRAY); + nodes = dictCreate(&dtype); + for (i = 0; i < reply->elements; i++) { + redisReply *r = reply->element[i]; + assert(r->type == REDIS_REPLY_ARRAY); + assert(r->elements >= 3); + int from = r->element[0]->integer; + int to = r->element[1]->integer; + sds primary = NULL; + for (j = 2; j < r->elements; j++) { + redisReply *nr = r->element[j]; + assert(nr->type == REDIS_REPLY_ARRAY && nr->elements >= 3); + assert(nr->element[0]->str != NULL); + assert(nr->element[2]->str != NULL); + + int is_primary = (j == 2); + if (is_primary) primary = sdsnew(nr->element[2]->str); + int is_cluster_option_only = (config.read_from_replica == FROM_PRIMARY_ONLY); + if ((config.read_from_replica == FROM_REPLICA_ONLY && is_primary) || (is_cluster_option_only && !is_primary)) continue; + + sds ip = sdsnew(nr->element[0]->str); + sds name = sdsnew(nr->element[2]->str); + int port = nr->element[1]->integer; + int slot_start = from; + int slot_end = to; + + clusterNode *node = NULL; + dictEntry *entry = dictFind(nodes, name); + if (entry == NULL) { + node = createClusterNode(sdsnew(ip), port); + if (node == NULL) { + success = 0; + goto cleanup; + } else { + node->name = name; + if (!is_primary) node->replicate = sdsdup(primary); + } + } else { + node = dictGetVal(entry); } - } else { - node = createClusterNode(sdsnew(ip), port); - } - if (node == NULL) { - success = 0; - goto cleanup; - } - if (name != NULL) node->name = sdsnew(name); - if (i == 8) { - int remaining = strlen(line); - while (remaining > 0) { - p = strchr(line, ' '); - if (p == NULL) p = line + remaining; - remaining -= (p - line); - - char *slotsdef = line; - *p = '\0'; - if (remaining) { - line = p + 1; - remaining--; - } else - line = p; - char *dash = NULL; - if (slotsdef[0] == '[') { - slotsdef++; - if ((p = strstr(slotsdef, "->-"))) { // Migrating - *p = '\0'; - p += 3; - char *closing_bracket = strchr(p, ']'); - if (closing_bracket) *closing_bracket = '\0'; - sds slot = sdsnew(slotsdef); - sds dst = sdsnew(p); - node->migrating_count += 2; - node->migrating = zrealloc(node->migrating, (node->migrating_count * sizeof(sds))); - node->migrating[node->migrating_count - 2] = slot; - node->migrating[node->migrating_count - 1] = dst; - } else if ((p = strstr(slotsdef, "-<-"))) { // Importing - *p = '\0'; - p += 3; - char *closing_bracket = strchr(p, ']'); - if (closing_bracket) *closing_bracket = '\0'; - sds slot = sdsnew(slotsdef); - sds src = sdsnew(p); - node->importing_count += 2; - node->importing = zrealloc(node->importing, (node->importing_count * sizeof(sds))); - node->importing[node->importing_count - 2] = slot; - node->importing[node->importing_count - 1] = src; - } - } else if ((dash = strchr(slotsdef, '-')) != NULL) { - p = dash; - int start, stop; - *p = '\0'; - start = atoi(slotsdef); - stop = atoi(p + 1); - while (start <= stop) { - int slot = start++; - node->slots[node->slots_count++] = slot; - } - } else if (p > slotsdef) { - int slot = atoi(slotsdef); + if (slot_start == slot_end) { + node->slots[node->slots_count++] = slot_start; + } else { + while (slot_start <= slot_end) { + int slot = slot_start++; node->slots[node->slots_count++] = slot; } } + if (node->slots_count == 0) { + fprintf(stderr, "WARNING: Node %s:%d has no slots, skipping...\n", node->ip, node->port); + 
continue; + } + if (entry == NULL) { + dictReplace(nodes, node->name, node); + if (!addClusterNode(node)) { + success = 0; + goto cleanup; + } + } } - if (node->slots_count == 0) { - fprintf(stderr, "WARNING: Primary node %s:%d has no slots, skipping...\n", node->ip, node->port); - continue; - } - if (!addClusterNode(node)) { - success = 0; - goto cleanup; - } + sdsfree(primary); } cleanup: if (ctx) redisFree(ctx); @@ -1214,6 +1157,7 @@ static int fetchClusterConfiguration(void) { if (config.cluster_nodes) freeClusterNodes(); } if (reply) freeReplyObject(reply); + if (nodes) dictRelease(nodes); return success; } @@ -1222,7 +1166,7 @@ static int fetchClusterConfiguration(void) { static int fetchClusterSlotsConfiguration(client c) { UNUSED(c); int success = 1, is_fetching_slots = 0, last_update = 0; - size_t i; + size_t i, j; last_update = atomic_load_explicit(&config.slots_last_update, memory_order_relaxed); if (c->slots_last_update < last_update) { @@ -1236,16 +1180,9 @@ static int fetchClusterSlotsConfiguration(client c) { atomic_store_explicit(&config.is_fetching_slots, 1, memory_order_relaxed); fprintf(stderr, "WARNING: Cluster slots configuration changed, fetching new one...\n"); const char *errmsg = "Failed to update cluster slots configuration"; - static dictType dtype = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCompare, /* key compare */ - NULL, /* key destructor */ - NULL, /* val destructor */ - NULL /* allow to expand */ - }; + /* printf("[%d] fetchClusterSlotsConfiguration\n", c->thread_id); */ - dict *primaries = dictCreate(&dtype); + dict *nodes = dictCreate(&dtype); redisContext *ctx = NULL; for (i = 0; i < (size_t)config.cluster_node_count; i++) { clusterNode *node = config.cluster_nodes[i]; @@ -1263,7 +1200,7 @@ static int fetchClusterSlotsConfiguration(client c) { if (node->updated_slots != NULL) zfree(node->updated_slots); node->updated_slots = NULL; node->updated_slots_count = 0; - dictReplace(primaries, node->name, node); + dictReplace(nodes, node->name, node); } reply = redisCommand(ctx, "CLUSTER SLOTS"); if (reply == NULL || reply->type == REDIS_REPLY_ERROR) { @@ -1279,30 +1216,44 @@ static int fetchClusterSlotsConfiguration(client c) { int from, to, slot; from = r->element[0]->integer; to = r->element[1]->integer; - redisReply *nr = r->element[2]; - assert(nr->type == REDIS_REPLY_ARRAY && nr->elements >= 3); - assert(nr->element[2]->str != NULL); - sds name = sdsnew(nr->element[2]->str); - dictEntry *entry = dictFind(primaries, name); - if (entry == NULL) { - success = 0; - fprintf(stderr, - "%s: could not find node with ID %s in current " - "configuration.\n", - errmsg, name); - if (name) sdsfree(name); - goto cleanup; + size_t start, end; + if (config.read_from_replica == FROM_ALL) { + start = 2; + end = r->elements; + } else if (config.read_from_replica == FROM_REPLICA_ONLY) { + start = 3; + end = r->elements; + } else { + start = 2; + end = 3; + } + + for (j = start; j < end; j++) { + redisReply *nr = r->element[j]; + assert(nr->type == REDIS_REPLY_ARRAY && nr->elements >= 3); + assert(nr->element[2]->str != NULL); + sds name = sdsnew(nr->element[2]->str); + dictEntry *entry = dictFind(nodes, name); + if (entry == NULL) { + success = 0; + fprintf(stderr, + "%s: could not find node with ID %s in current " + "configuration.\n", + errmsg, name); + if (name) sdsfree(name); + goto cleanup; + } + sdsfree(name); + clusterNode *node = dictGetVal(entry); + if (node->updated_slots == NULL) node->updated_slots = zcalloc(CLUSTER_SLOTS * 
sizeof(int)); + for (slot = from; slot <= to; slot++) node->updated_slots[node->updated_slots_count++] = slot; } - sdsfree(name); - clusterNode *node = dictGetVal(entry); - if (node->updated_slots == NULL) node->updated_slots = zcalloc(CLUSTER_SLOTS * sizeof(int)); - for (slot = from; slot <= to; slot++) node->updated_slots[node->updated_slots_count++] = slot; } updateClusterSlotsConfiguration(); cleanup: freeReplyObject(reply); redisFree(ctx); - dictRelease(primaries); + dictRelease(nodes); atomic_store_explicit(&config.is_fetching_slots, 0, memory_order_relaxed); return success; } @@ -1460,6 +1411,19 @@ int parseOptions(int argc, char **argv) { config.num_threads = 0; } else if (!strcmp(argv[i], "--cluster")) { config.cluster_mode = 1; + } else if (!strcmp(argv[i], "--rfr")) { + if (argv[++i]) { + if (!strcmp(argv[i], "all")) { + config.read_from_replica = FROM_ALL; + } else if (!strcmp(argv[i], "yes")) { + config.read_from_replica = FROM_REPLICA_ONLY; + } else if (!strcmp(argv[i], "no")) { + config.read_from_replica = FROM_PRIMARY_ONLY; + } else { + goto invalid; + } + } else + goto invalid; } else if (!strcmp(argv[i], "--enable-tracking")) { config.enable_tracking = 1; } else if (!strcmp(argv[i], "--help")) { @@ -1557,6 +1521,14 @@ int parseOptions(int argc, char **argv) { " If the command is supplied on the command line in cluster\n" " mode, the key must contain \"{tag}\". Otherwise, the\n" " command will not be sent to the right cluster node.\n" + " --rfr Enable read from replicas in cluster mode.\n" + " This command must be used with the --cluster option.\n" + " There are three modes for reading from replicas:\n" + " 'no' - sends read requests to primaries only (default) \n" + " 'yes' - sends read requests to replicas only.\n" + " 'all' - sends read requests to all nodes.\n" + " Since write commands will not be accepted by replicas,\n" + " it is recommended to enable read from replicas only for read command tests.\n" " --enable-tracking Send CLIENT TRACKING on before starting benchmark.\n" " -k 1=keep alive 0=reconnect (default 1)\n" " -r Use random keys for SET/GET/INCR, random values for SADD,\n" @@ -1698,6 +1670,7 @@ int main(int argc, char **argv) { config.num_threads = 0; config.threads = NULL; config.cluster_mode = 0; + config.read_from_replica = FROM_PRIMARY_ONLY; config.cluster_node_count = 0; config.cluster_nodes = NULL; config.redis_config = NULL; @@ -1742,7 +1715,15 @@ int main(int argc, char **argv) { fprintf(stderr, "Invalid cluster: %d node(s).\n", config.cluster_node_count); exit(1); } - printf("Cluster has %d primary nodes:\n\n", config.cluster_node_count); + const char *node_roles = NULL; + if (config.read_from_replica == FROM_ALL) { + node_roles = "cluster"; + } else if (config.read_from_replica == FROM_REPLICA_ONLY) { + node_roles = "replica"; + } else { + node_roles = "primary"; + } + printf("Cluster has %d %s nodes:\n\n", config.cluster_node_count, node_roles); int i = 0; for (; i < config.cluster_node_count; i++) { clusterNode *node = config.cluster_nodes[i]; @@ -1750,7 +1731,8 @@ int main(int argc, char **argv) { fprintf(stderr, "Invalid cluster node #%d\n", i); exit(1); } - printf("Primary %d: ", i); + const char *node_type = (node->replicate == NULL ? 
"Primary" : "Replica"); + printf("Node %d(%s): ", i, node_type); if (node->name) printf("%s ", node->name); printf("%s:%d\n", node->ip, node->port); node->redis_config = getServerConfig(node->ip, node->port, NULL); From 7f0cc83428ba71e460bd7b01e53673c84e5beb11 Mon Sep 17 00:00:00 2001 From: Roshan Khatri <117414976+roshkhatri@users.noreply.github.com> Date: Thu, 19 Dec 2024 12:32:40 -0800 Subject: [PATCH 027/101] Workflow changes to fix old release binaries (#1461) - Moves `build-config.json` to workflow dir to build old versions with new configs. - Enables contributors to test release Wf on private repo by adding `github.event_name == 'workflow_dispatch' ||` --------- Signed-off-by: Roshan Khatri --- .github/actions/generate-package-build-matrix/action.yml | 4 ++-- .../generate-package-build-matrix}/build-config.json | 0 .github/workflows/build-release-packages.yml | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) rename {utils/releasetools => .github/actions/generate-package-build-matrix}/build-config.json (100%) diff --git a/.github/actions/generate-package-build-matrix/action.yml b/.github/actions/generate-package-build-matrix/action.yml index 7e90f27be5..2494a71118 100644 --- a/.github/actions/generate-package-build-matrix/action.yml +++ b/.github/actions/generate-package-build-matrix/action.yml @@ -24,11 +24,11 @@ runs: - name: Get targets run: | - x86_arch=$(jq -c '[.linux_targets[] | select(.arch=="x86_64")]' utils/releasetools/build-config.json) + x86_arch=$(jq -c '[.linux_targets[] | select(.arch=="x86_64")]' .github/actions/generate-package-build-matrix/build-config.json) x86_matrix=$(echo "{ \"distro\" : $x86_arch }" | jq -c .) echo "X86_MATRIX=$x86_matrix" >> $GITHUB_ENV - arm_arch=$(jq -c '[.linux_targets[] | select(.arch=="arm64")]' utils/releasetools/build-config.json) + arm_arch=$(jq -c '[.linux_targets[] | select(.arch=="arm64")]' .github/actions/generate-package-build-matrix/build-config.json) arm_matrix=$(echo "{ \"distro\" : $arm_arch }" | jq -c .) echo "ARM_MATRIX=$arm_matrix" >> $GITHUB_ENV shell: bash diff --git a/utils/releasetools/build-config.json b/.github/actions/generate-package-build-matrix/build-config.json similarity index 100% rename from utils/releasetools/build-config.json rename to .github/actions/generate-package-build-matrix/build-config.json diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml index 3f1ca2627b..d7ab8e57d6 100644 --- a/.github/workflows/build-release-packages.yml +++ b/.github/workflows/build-release-packages.yml @@ -8,7 +8,7 @@ on: - '.github/workflows/build-release-packages.yml' - '.github/workflows/call-build-linux-arm-packages.yml' - '.github/workflows/call-build-linux-x86-packages.yml' - - 'utils/releasetools/build-config.json' + - '.github/actions/generate-package-build-matrix/build-config.json' workflow_dispatch: inputs: version: @@ -23,7 +23,7 @@ jobs: # This job provides the version metadata from the tag for the other jobs to use. 
   release-build-get-meta:
     name: Get metadata to build
-    if: github.repository == 'valkey-io/valkey'
+    if: github.event_name == 'workflow_dispatch' || github.repository == 'valkey-io/valkey'
     runs-on: ubuntu-latest
     outputs:
       version: ${{ steps.get_version.outputs.VERSION }}
@@ -69,7 +69,7 @@ jobs:

   generate-build-matrix:
     name: Generating build matrix
-    if: github.repository == 'valkey-io/valkey'
+    if: github.event_name == 'workflow_dispatch' || github.repository == 'valkey-io/valkey'
     runs-on: ubuntu-latest
     outputs:
       x86_64-build-matrix: ${{ steps.set-matrix.outputs.x86_64-build-matrix }}

From cfa292afd58a78235a208bf402a7b254d9c878b2 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Fri, 20 Dec 2024 10:14:01 +0800
Subject: [PATCH 028/101] Clear outdated failure reports more accurately (#1184)

There are two changes here:

1. In clusterNodeCleanupFailureReports: only a primary with slots can file a
failure report, so if the reporting primary becomes a replica its failure
report should be cleared. Keeping stale reports may lead to inaccurate node
fail judgments in some network partition cases, and it also affects the
CLUSTER COUNT-FAILURE-REPORTS command.

2. In clusterProcessGossipSection: this one is less important, but it lets us
print a "node is back online" log that helps us troubleshoot problems,
although it may conflict with change 1 in some cases.

Signed-off-by: Binbin
---
 src/cluster_legacy.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index bbf63d46b9..876beef91f 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -1552,9 +1552,14 @@ int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) {
  * older than the global node timeout. Note that anyway for a node to be
  * flagged as FAIL we need to have a local PFAIL state that is at least
  * older than the global node timeout, so we don't just trust the number
- * of failure reports from other nodes. */
+ * of failure reports from other nodes.
+ *
+ * If the reporting node loses its voting right during this time, we will
+ * also clear its report. */
 void clusterNodeCleanupFailureReports(clusterNode *node) {
     list *l = node->fail_reports;
+    if (!listLength(l)) return;
+
     listNode *ln;
     listIter li;
     clusterNodeFailReport *fr;
@@ -1564,7 +1569,11 @@ void clusterNodeCleanupFailureReports(clusterNode *node) {
     listRewind(l, &li);
     while ((ln = listNext(&li)) != NULL) {
         fr = ln->value;
-        if (now - fr->time > maxtime) listDelNode(l, ln);
+        if (now - fr->time > maxtime) {
+            listDelNode(l, ln);
+        } else if (!clusterNodeIsVotingPrimary(fr->node)) {
+            listDelNode(l, ln);
+        }
     }
 }

@@ -1581,6 +1590,8 @@ void clusterNodeCleanupFailureReports(clusterNode *node) {
  * Otherwise 0 is returned. */
 int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) {
     list *l = node->fail_reports;
+    if (!listLength(l)) return 0;
+
     listNode *ln;
     listIter li;
     clusterNodeFailReport *fr;
@@ -2254,10 +2265,11 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
         /* Ignore gossips about self. */
         if (node && node != myself) {
             /* We already know this node.
-               Handle failure reports, only when the sender is a voting primary. */
-            if (sender && clusterNodeIsVotingPrimary(sender)) {
+             * Handle failure reports, the report is added only if the sender is a voting primary,
+             * and deletion of a failure report is not restricted.
 */
+            if (sender) {
                if (flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) {
-                    if (clusterNodeAddFailureReport(node, sender)) {
+                    if (clusterNodeIsVotingPrimary(sender) && clusterNodeAddFailureReport(node, sender)) {
                        serverLog(LL_NOTICE, "Node %.40s (%s) reported node %.40s (%s) as not reachable.",
                                  sender->name, sender->human_nodename, node->name, node->human_nodename);
                    }

From 13419cbda29ea0201dea2496326eb130147e7e0e Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Thu, 19 Dec 2024 18:14:56 -0800
Subject: [PATCH 029/101] Fix storing the wrong PID in active servers (#1464)

In #1459, I missed that the data was also used to keep track of the PID
files, so if the testing framework crashed it would no longer be able to
clean up the extra servers. So now we properly extract the PID and store
it so we can clean up PIDs.

Signed-off-by: Madelyn Olson
---
 tests/test_helper.tcl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl
index 8a4125e48d..54bb923674 100644
--- a/tests/test_helper.tcl
+++ b/tests/test_helper.tcl
@@ -421,7 +421,8 @@ proc read_from_test_client fd {
    } elseif {$status eq {server-spawning}} {
        set ::active_clients_task($fd) "(SPAWNING SERVER) $data"
    } elseif {$status eq {server-spawned}} {
-        lappend ::active_servers $data
+        set pid [string trim [lindex [split $data "-"] 0]]
+        lappend ::active_servers $pid
        set ::active_clients_task($fd) "(SPAWNED SERVER) pid:$data"
    } elseif {$status eq {server-killing}} {
        set ::active_clients_task($fd) "(KILLING SERVER) pid:$data"

From b3daabe14a7b0e7ebd7630bd7caf263a2b4d0bac Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Thu, 19 Dec 2024 18:16:46 -0800
Subject: [PATCH 030/101] Update info.tcl test to revert client output limits
 sooner (#1462)

We set the client output buffer limits to 10 bytes, and then execute
`info stats`, which produces more than 10 bytes of output and can cause
that command to throw an error. I'm not sure why it wasn't consistently
erroring before; it might have been some change related to the ubuntu
upgrade. Issues related to ubuntu-tls are hopefully resolved now.
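For illustration, the failure mode being avoided looks roughly like this (a
sketch, not verbatim output; with a 10-byte hard limit, any reply larger than
10 bytes can get the client disconnected):

```
127.0.0.1:6379> CONFIG SET client-output-buffer-limit "normal 10 0 0"
OK
127.0.0.1:6379> INFO stats
Error: Server closed the connection
```

Reverting the limit before reading `info stats` keeps the test's own
introspection traffic from tripping it.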
Signed-off-by: Madelyn Olson
---
 tests/unit/info.tcl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
index 11dc4e5d40..3295c5e31a 100644
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@@ -406,10 +406,10 @@ start_server {tags {"info" "external:skip" "debug_defrag:skip"}} {
        r config set client-output-buffer-limit "normal 10 0 0"
        r set key [string repeat a 100000] ;# to trigger output buffer limit check this needs to be big
        catch {r get key}
+        r config set client-output-buffer-limit $org_outbuf_limit
        set info [r info stats]
        assert_equal [getInfoProperty $info client_output_buffer_limit_disconnections] {1}
-        r config set client-output-buffer-limit $org_outbuf_limit
-    } {OK} {logreqres:skip} ;# same as obuf-limits.tcl, skip logreqres
+    } {} {logreqres:skip} ;# same as obuf-limits.tcl, skip logreqres

    test {clients: pubsub clients} {
        set info [r info clients]

From beb95d334aaec898aab0df9c8020292b2046e3f1 Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Fri, 20 Dec 2024 12:10:48 -0800
Subject: [PATCH 031/101] Resolve bounds checks on cluster_legacy.c (#1463)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We are getting a number of errors like:
```
array subscript ‘clusterMsg[0]’ is partly outside array bounds of ‘unsigned char[2272]’
```

This is basically GCC telling us that we have an object which is longer
than the underlying storage of the allocation. We actually do this a
lot, but GCC is generally not aware of how big the underlying allocation
is, so it doesn't throw this error. We are specifically getting this
error because the msgBlock can be of variable length depending on the
type of message, but GCC assumes it's the longest one possible.

The solution I went with here was to make the message type optional, so
that it wasn't included in the size. I think this also makes some sense,
since it's really just a helper for us to easily cast the object around.
I considered disabling this error, but it is generally pretty useful
since it can catch real issues. Another solution would be to
over-allocate to the largest possible object, which could hurt
performance as we initialize it to zero.

Results: https://github.com/madolson/valkey/actions/runs/12423414811/job/34686899884

This is a slightly cleaned up version of
https://github.com/valkey-io/valkey/pull/1439. I thought I had another
strategy but alas, it didn't work out.

Signed-off-by: Madelyn Olson
---
 src/cluster_legacy.c | 39 ++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 876beef91f..9a23527b30 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -424,9 +424,19 @@ typedef struct {
    union {
        clusterMsg msg;
        clusterMsgLight msg_light;
-    };
+    } data[];
} clusterMsgSendBlock;

+/* Helper function to extract a light message from a send block. */
+static clusterMsgLight *getLightMessageFromSendBlock(clusterMsgSendBlock *msgblock) {
+    return &msgblock->data[0].msg_light;
+}
+
+/* Helper function to extract a normal message from a send block.
*/ +static clusterMsg *getMessageFromSendBlock(clusterMsgSendBlock *msgblock) { + return &msgblock->data[0].msg; +} + /* ----------------------------------------------------------------------------- * Initialization * -------------------------------------------------------------------------- */ @@ -1288,15 +1298,15 @@ void clusterReset(int hard) { * CLUSTER communication link * -------------------------------------------------------------------------- */ clusterMsgSendBlock *createClusterMsgSendBlock(int type, uint32_t msglen) { - uint32_t blocklen = msglen + offsetof(clusterMsgSendBlock, msg); + uint32_t blocklen = msglen + offsetof(clusterMsgSendBlock, data); clusterMsgSendBlock *msgblock = zcalloc(blocklen); msgblock->refcount = 1; msgblock->totlen = blocklen; server.stat_cluster_links_memory += blocklen; if (IS_LIGHT_MESSAGE(type)) { - clusterBuildMessageHdrLight(&msgblock->msg_light, type, msglen); + clusterBuildMessageHdrLight(getLightMessageFromSendBlock(msgblock), type, msglen); } else { - clusterBuildMessageHdr(&msgblock->msg, type, msglen); + clusterBuildMessageHdr(getMessageFromSendBlock(msgblock), type, msglen); } return msgblock; } @@ -3668,7 +3678,7 @@ void clusterWriteHandler(connection *conn) { while (totwritten < NET_MAX_WRITES_PER_EVENT && listLength(link->send_msg_queue) > 0) { listNode *head = listFirst(link->send_msg_queue); clusterMsgSendBlock *msgblock = (clusterMsgSendBlock *)head->value; - clusterMsg *msg = &msgblock->msg; + clusterMsg *msg = getMessageFromSendBlock(msgblock); size_t msg_offset = link->head_msg_send_offset; size_t msg_len = ntohl(msg->totlen); @@ -3853,7 +3863,7 @@ void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { if (!link) { return; } - if (listLength(link->send_msg_queue) == 0 && msgblock->msg.totlen != 0) + if (listLength(link->send_msg_queue) == 0 && getMessageFromSendBlock(msgblock)->totlen != 0) connSetWriteHandlerWithBarrier(link->conn, clusterWriteHandler, 1); listAddNodeTail(link->send_msg_queue, msgblock); @@ -3864,7 +3874,7 @@ void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { server.stat_cluster_links_memory += sizeof(listNode); /* Populate sent messages stats. */ - uint16_t type = ntohs(msgblock->msg.type); + uint16_t type = ntohs(getMessageFromSendBlock(msgblock)->type); if (type < CLUSTERMSG_TYPE_COUNT) server.cluster->stats_bus_messages_sent[type]++; } @@ -4050,7 +4060,7 @@ void clusterSendPing(clusterLink *link, int type) { * sizeof(clusterMsg) or more. 
*/ if (estlen < (int)sizeof(clusterMsg)) estlen = sizeof(clusterMsg); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, estlen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); if (!link->inbound && type == CLUSTERMSG_TYPE_PING) link->node->ping_sent = mstime(); @@ -4195,10 +4205,10 @@ clusterMsgSendBlock *clusterCreatePublishMsgBlock(robj *channel, robj *message, clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, msglen); clusterMsgDataPublish *hdr_data_msg; if (is_light) { - clusterMsgLight *hdr_light = &msgblock->msg_light; + clusterMsgLight *hdr_light = getLightMessageFromSendBlock(msgblock); hdr_data_msg = &hdr_light->data.publish.msg; } else { - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); hdr_data_msg = &hdr->data.publish.msg; } hdr_data_msg->channel_len = htonl(channel_len); @@ -4221,7 +4231,7 @@ void clusterSendFail(char *nodename) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgDataFail); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAIL, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); memcpy(hdr->data.fail.about.nodename, nodename, CLUSTER_NAMELEN); clusterBroadcastMessage(msgblock); @@ -4237,7 +4247,7 @@ void clusterSendUpdate(clusterLink *link, clusterNode *node) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgDataUpdate); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_UPDATE, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); memcpy(hdr->data.update.nodecfg.nodename, node->name, CLUSTER_NAMELEN); hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch); memcpy(hdr->data.update.nodecfg.slots, node->slots, sizeof(node->slots)); @@ -4259,7 +4269,7 @@ void clusterSendModule(clusterLink *link, uint64_t module_id, uint8_t type, cons msglen += sizeof(clusterMsgModule) - 3 + len; clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MODULE, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); hdr->data.module.msg.module_id = module_id; /* Already endian adjusted. */ hdr->data.module.msg.type = type; hdr->data.module.msg.len = htonl(len); @@ -4348,11 +4358,10 @@ void clusterRequestFailoverAuth(void) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST, msglen); - clusterMsg *hdr = &msgblock->msg; /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit * in the header to communicate the nodes receiving the message that * they should authorized the failover even if the primary is working. */ - if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; + if (server.cluster->mf_end) getMessageFromSendBlock(msgblock)->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; clusterBroadcastMessage(msgblock); clusterMsgSendBlockDecrRefCount(msgblock); } From 32b09c67d378dc348338075a9a282b8a7e0233f9 Mon Sep 17 00:00:00 2001 From: Ricardo Dias Date: Sat, 21 Dec 2024 22:09:35 +0000 Subject: [PATCH 032/101] Adds support for scripting engines as Valkey modules (#1277) This PR extends the module API to support the addition of different scripting engines to execute user defined functions. 
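At a high level, the intended user flow might look like the following sketch
(the module path, engine name, and script are hypothetical, modeled on the
test module described below):

```
127.0.0.1:6379> MODULE LOAD /path/to/helloengine.so
OK
127.0.0.1:6379> FUNCTION LOAD "#!hello name=mylib\nFUNCTION bar\nCONSTI 432\nRETURN"
"mylib"
127.0.0.1:6379> FCALL bar 0
(integer) 432
```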
The scripting engine can be implemented as a Valkey module, and can be
dynamically loaded with the `loadmodule` config directive, or with the
`MODULE LOAD` command.

This PR also adds an example of a dummy scripting engine module, to show
how to use the new module API. The dummy module is implemented in
`tests/modules/helloscripting.c`.

Currently, the module API only supports loading scripting engines to run
functions using the `FCALL` command.

The additions to the module API are the following:

```c
/* This struct represents a scripting engine function that results from the
 * compilation of a script by the engine implementation. */
struct ValkeyModuleScriptingEngineCompiledFunction

typedef ValkeyModuleScriptingEngineCompiledFunction **(*ValkeyModuleScriptingEngineCreateFunctionsLibraryFunc)(
    ValkeyModuleScriptingEngineCtx *engine_ctx,
    const char *code,
    size_t timeout,
    size_t *out_num_compiled_functions,
    char **err);

typedef void (*ValkeyModuleScriptingEngineCallFunctionFunc)(
    ValkeyModuleCtx *module_ctx,
    ValkeyModuleScriptingEngineCtx *engine_ctx,
    ValkeyModuleScriptingEngineFunctionCtx *func_ctx,
    void *compiled_function,
    ValkeyModuleString **keys,
    size_t nkeys,
    ValkeyModuleString **args,
    size_t nargs);

typedef size_t (*ValkeyModuleScriptingEngineGetUsedMemoryFunc)(
    ValkeyModuleScriptingEngineCtx *engine_ctx);

typedef size_t (*ValkeyModuleScriptingEngineGetFunctionMemoryOverheadFunc)(
    void *compiled_function);

typedef size_t (*ValkeyModuleScriptingEngineGetEngineMemoryOverheadFunc)(
    ValkeyModuleScriptingEngineCtx *engine_ctx);

typedef void (*ValkeyModuleScriptingEngineFreeFunctionFunc)(
    ValkeyModuleScriptingEngineCtx *engine_ctx,
    void *compiled_function);

/* This struct stores the callback functions implemented by the scripting
 * engine to provide the functionality for the `FUNCTION *` commands. */
typedef struct ValkeyModuleScriptingEngineMethodsV1 {
    uint64_t version; /* Version of this structure for ABI compat. */

    /* Library create function callback. When a new script is loaded, this
     * callback will be called with the script code, and returns a list of
     * ValkeyModuleScriptingEngineCompiledFunc objects. */
    ValkeyModuleScriptingEngineCreateFunctionsLibraryFunc create_functions_library;

    /* The callback function called when `FCALL` command is called on a function
     * registered in this engine. */
    ValkeyModuleScriptingEngineCallFunctionFunc call_function;

    /* Function callback to get current used memory by the engine. */
    ValkeyModuleScriptingEngineGetUsedMemoryFunc get_used_memory;

    /* Function callback to return memory overhead for a given function. */
    ValkeyModuleScriptingEngineGetFunctionMemoryOverheadFunc get_function_memory_overhead;

    /* Function callback to return memory overhead of the engine. */
    ValkeyModuleScriptingEngineGetEngineMemoryOverheadFunc get_engine_memory_overhead;

    /* Function callback to free the memory of a registered engine function. */
    ValkeyModuleScriptingEngineFreeFunctionFunc free_function;
} ValkeyModuleScriptingEngineMethodsV1;

/* Registers a new scripting engine in the server.
 *
 * - `engine_name`: the name of the scripting engine. This name will match
 *   against the engine name specified in the script header using a shebang.
 *
 * - `engine_ctx`: engine specific context pointer.
 *
 * - `engine_methods`: the struct with the scripting engine callback functions
 *   pointers.
 */
int ValkeyModule_RegisterScriptingEngine(ValkeyModuleCtx *ctx,
                                         const char *engine_name,
                                         void *engine_ctx,
                                         ValkeyModuleScriptingEngineMethods engine_methods);

/* Removes the scripting engine from the server.
 *
 * `engine_name` is the name of the scripting engine.
 *
 */
int ValkeyModule_UnregisterScriptingEngine(ValkeyModuleCtx *ctx, const char *engine_name);
```

---------

Signed-off-by: Ricardo Dias
---
 src/function_lua.c                       | 205 +++++++-----
 src/functions.c                          | 240 ++++++++++++--
 src/functions.h                          |  74 +++--
 src/module.c                             |  76 +++++
 src/module.h                             |  17 +
 src/script.h                             |   2 +
 src/script_lua.c                         |   6 +-
 src/script_lua.h                         |   2 +-
 src/util.c                               |  21 ++
 src/util.h                               |   1 +
 src/valkeymodule.h                       |  99 +++++-
 tests/modules/CMakeLists.txt             |   1 +
 tests/modules/Makefile                   |   3 +-
 tests/modules/helloscripting.c           | 383 +++++++++++++++++++++++
 tests/unit/functions.tcl                 |   4 +-
 tests/unit/moduleapi/scriptingengine.tcl | 126 ++++++++
 16 files changed, 1124 insertions(+), 136 deletions(-)
 create mode 100644 src/module.h
 create mode 100644 tests/modules/helloscripting.c
 create mode 100644 tests/unit/moduleapi/scriptingengine.tcl

diff --git a/src/function_lua.c b/src/function_lua.c
index fa9983bf7e..b535528906 100644
--- a/src/function_lua.c
+++ b/src/function_lua.c
@@ -64,17 +64,14 @@ typedef struct luaFunctionCtx {
} luaFunctionCtx;

typedef struct loadCtx {
-    functionLibInfo *li;
+    list *functions;
    monotime start_time;
    size_t timeout;
} loadCtx;

-typedef struct registerFunctionArgs {
-    sds name;
-    sds desc;
-    luaFunctionCtx *lua_f_ctx;
-    uint64_t f_flags;
-} registerFunctionArgs;
+static void luaEngineFreeFunction(ValkeyModuleCtx *module_ctx,
+                                  engineCtx *engine_ctx,
+                                  void *compiled_function);

/* Hook for FUNCTION LOAD execution.
 * Used to cancel the execution in case of a timeout (500ms).
@@ -93,15 +90,42 @@ static void luaEngineLoadHook(lua_State *lua, lua_Debug *ar) {
    }
}

+static void freeCompiledFunc(ValkeyModuleCtx *module_ctx,
+                             luaEngineCtx *lua_engine_ctx,
+                             void *compiled_func) {
+    /* The lua engine is implemented in the core, and not in a Valkey Module */
+    serverAssert(module_ctx == NULL);
+
+    compiledFunction *func = compiled_func;
+    decrRefCount(func->name);
+    if (func->desc) {
+        decrRefCount(func->desc);
+    }
+    luaEngineFreeFunction(module_ctx, lua_engine_ctx, func->function);
+    zfree(func);
+}
+
/*
- * Compile a given blob and save it on the registry.
- * Return a function ctx with Lua ref that allows to later retrieve the
- * function from the registry.
+ * Compile a given script code by generating a set of compiled functions. These
+ * functions are also saved into the registry of the Lua environment.
+ *
+ * Returns an array of compiled functions. The `compiledFunction` struct stores a
+ * Lua ref that allows to later retrieve the function from the registry.
+ * The size of the array is returned in the `out_num_compiled_functions`
+ * parameter.
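+ * The caller takes ownership of the returned array and of each compiled
+ * function in it (see freeCompiledFunctions() in functions.c).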
* * Return NULL on compilation error and set the error to the err variable */ -static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size_t timeout, sds *err) { - int ret = C_ERR; +static compiledFunction **luaEngineCreate(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + const char *code, + size_t timeout, + size_t *out_num_compiled_functions, + char **err) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + + compiledFunction **compiled_functions = NULL; luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; @@ -114,15 +138,15 @@ static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size lua_pop(lua, 1); /* pop the metatable */ /* compile the code */ - if (luaL_loadbuffer(lua, blob, sdslen(blob), "@user_function")) { - *err = sdscatprintf(sdsempty(), "Error compiling function: %s", lua_tostring(lua, -1)); + if (luaL_loadbuffer(lua, code, strlen(code), "@user_function")) { + *err = valkey_asprintf("Error compiling function: %s", lua_tostring(lua, -1)); lua_pop(lua, 1); /* pops the error */ goto done; } serverAssert(lua_isfunction(lua, -1)); loadCtx load_ctx = { - .li = li, + .functions = listCreate(), .start_time = getMonotonicUs(), .timeout = timeout, }; @@ -133,13 +157,31 @@ static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size if (lua_pcall(lua, 0, 0, 0)) { errorInfo err_info = {0}; luaExtractErrorInformation(lua, &err_info); - *err = sdscatprintf(sdsempty(), "Error registering functions: %s", err_info.msg); + *err = valkey_asprintf("Error registering functions: %s", err_info.msg); lua_pop(lua, 1); /* pops the error */ luaErrorInformationDiscard(&err_info); + listIter *iter = listGetIterator(load_ctx.functions, AL_START_HEAD); + listNode *node = NULL; + while ((node = listNext(iter)) != NULL) { + freeCompiledFunc(module_ctx, lua_engine_ctx, listNodeValue(node)); + } + listReleaseIterator(iter); + listRelease(load_ctx.functions); goto done; } - ret = C_OK; + compiled_functions = + zcalloc(sizeof(compiledFunction *) * listLength(load_ctx.functions)); + listIter *iter = listGetIterator(load_ctx.functions, AL_START_HEAD); + listNode *node = NULL; + *out_num_compiled_functions = 0; + while ((node = listNext(iter)) != NULL) { + compiledFunction *func = listNodeValue(node); + compiled_functions[*out_num_compiled_functions] = func; + (*out_num_compiled_functions)++; + } + listReleaseIterator(iter); + listRelease(load_ctx.functions); done: /* restore original globals */ @@ -152,19 +194,23 @@ static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size lua_sethook(lua, NULL, 0, 0); /* Disable hook */ luaSaveOnRegistry(lua, REGISTRY_LOAD_CTX_NAME, NULL); - return ret; + return compiled_functions; } /* * Invole the give function with the given keys and args */ -static void luaEngineCall(scriptRunCtx *run_ctx, - void *engine_ctx, +static void luaEngineCall(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + functionCtx *func_ctx, void *compiled_function, robj **keys, size_t nkeys, robj **args, size_t nargs) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; luaFunctionCtx *f_ctx = compiled_function; @@ -177,25 +223,38 @@ static void luaEngineCall(scriptRunCtx *run_ctx, serverAssert(lua_isfunction(lua, -1)); + scriptRunCtx *run_ctx = (scriptRunCtx *)func_ctx; 
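+    /* func_ctx is opaque at the module API level; for the built-in Lua engine
+     * it is the scriptRunCtx prepared by the FCALL command path. */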
luaCallFunction(run_ctx, lua, keys, nkeys, args, nargs, 0); lua_pop(lua, 1); /* Pop error handler */ } -static size_t luaEngineGetUsedMemoy(void *engine_ctx) { +static engineMemoryInfo luaEngineGetMemoryInfo(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + luaEngineCtx *lua_engine_ctx = engine_ctx; - return luaMemory(lua_engine_ctx->lua); + + return (engineMemoryInfo){ + .used_memory = luaMemory(lua_engine_ctx->lua), + .engine_memory_overhead = zmalloc_size(lua_engine_ctx), + }; } -static size_t luaEngineFunctionMemoryOverhead(void *compiled_function) { +static size_t luaEngineFunctionMemoryOverhead(ValkeyModuleCtx *module_ctx, + void *compiled_function) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + return zmalloc_size(compiled_function); } -static size_t luaEngineMemoryOverhead(void *engine_ctx) { - luaEngineCtx *lua_engine_ctx = engine_ctx; - return zmalloc_size(lua_engine_ctx); -} +static void luaEngineFreeFunction(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + void *compiled_function) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); -static void luaEngineFreeFunction(void *engine_ctx, void *compiled_function) { luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; luaFunctionCtx *f_ctx = compiled_function; @@ -203,26 +262,19 @@ static void luaEngineFreeFunction(void *engine_ctx, void *compiled_function) { zfree(f_ctx); } -static void luaRegisterFunctionArgsInitialize(registerFunctionArgs *register_f_args, - sds name, - sds desc, +static void luaRegisterFunctionArgsInitialize(compiledFunction *func, + robj *name, + robj *desc, luaFunctionCtx *lua_f_ctx, uint64_t flags) { - *register_f_args = (registerFunctionArgs){ + *func = (compiledFunction){ .name = name, .desc = desc, - .lua_f_ctx = lua_f_ctx, + .function = lua_f_ctx, .f_flags = flags, }; } -static void luaRegisterFunctionArgsDispose(lua_State *lua, registerFunctionArgs *register_f_args) { - sdsfree(register_f_args->name); - if (register_f_args->desc) sdsfree(register_f_args->desc); - lua_unref(lua, register_f_args->lua_f_ctx->lua_function_ref); - zfree(register_f_args->lua_f_ctx); -} - /* Read function flags located on the top of the Lua stack. * On success, return C_OK and set the flags to 'flags' out parameter * Return C_ERR if encounter an unknown flag. 
*/ @@ -267,10 +319,11 @@ static int luaRegisterFunctionReadFlags(lua_State *lua, uint64_t *flags) { return ret; } -static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadNamedArgs(lua_State *lua, + compiledFunction *func) { char *err = NULL; - sds name = NULL; - sds desc = NULL; + robj *name = NULL; + robj *desc = NULL; luaFunctionCtx *lua_f_ctx = NULL; uint64_t flags = 0; if (!lua_istable(lua, 1)) { @@ -287,14 +340,15 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs err = "named argument key given to server.register_function is not a string"; goto error; } + const char *key = lua_tostring(lua, -2); if (!strcasecmp(key, "function_name")) { - if (!(name = luaGetStringSds(lua, -1))) { + if (!(name = luaGetStringObject(lua, -1))) { err = "function_name argument given to server.register_function must be a string"; goto error; } } else if (!strcasecmp(key, "description")) { - if (!(desc = luaGetStringSds(lua, -1))) { + if (!(desc = luaGetStringObject(lua, -1))) { err = "description argument given to server.register_function must be a string"; goto error; } @@ -335,13 +389,17 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs goto error; } - luaRegisterFunctionArgsInitialize(register_f_args, name, desc, lua_f_ctx, flags); + luaRegisterFunctionArgsInitialize(func, + name, + desc, + lua_f_ctx, + flags); return C_OK; error: - if (name) sdsfree(name); - if (desc) sdsfree(desc); + if (name) decrRefCount(name); + if (desc) decrRefCount(desc); if (lua_f_ctx) { lua_unref(lua, lua_f_ctx->lua_function_ref); zfree(lua_f_ctx); @@ -350,11 +408,12 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs return C_ERR; } -static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, + compiledFunction *func) { char *err = NULL; - sds name = NULL; + robj *name = NULL; luaFunctionCtx *lua_f_ctx = NULL; - if (!(name = luaGetStringSds(lua, 1))) { + if (!(name = luaGetStringObject(lua, 1))) { err = "first argument to server.register_function must be a string"; goto error; } @@ -369,17 +428,17 @@ static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, registerFunctio lua_f_ctx = zmalloc(sizeof(*lua_f_ctx)); lua_f_ctx->lua_function_ref = lua_function_ref; - luaRegisterFunctionArgsInitialize(register_f_args, name, NULL, lua_f_ctx, 0); + luaRegisterFunctionArgsInitialize(func, name, NULL, lua_f_ctx, 0); return C_OK; error: - if (name) sdsfree(name); + if (name) decrRefCount(name); luaPushError(lua, err); return C_ERR; } -static int luaRegisterFunctionReadArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadArgs(lua_State *lua, compiledFunction *func) { int argc = lua_gettop(lua); if (argc < 1 || argc > 2) { luaPushError(lua, "wrong number of arguments to server.register_function"); @@ -387,33 +446,28 @@ static int luaRegisterFunctionReadArgs(lua_State *lua, registerFunctionArgs *reg } if (argc == 1) { - return luaRegisterFunctionReadNamedArgs(lua, register_f_args); + return luaRegisterFunctionReadNamedArgs(lua, func); } else { - return luaRegisterFunctionReadPositionalArgs(lua, register_f_args); + return luaRegisterFunctionReadPositionalArgs(lua, func); } } static int luaRegisterFunction(lua_State *lua) { - registerFunctionArgs register_f_args = {0}; + compiledFunction *func = 
zcalloc(sizeof(*func)); loadCtx *load_ctx = luaGetFromRegistry(lua, REGISTRY_LOAD_CTX_NAME); if (!load_ctx) { + zfree(func); luaPushError(lua, "server.register_function can only be called on FUNCTION LOAD command"); return luaError(lua); } - if (luaRegisterFunctionReadArgs(lua, ®ister_f_args) != C_OK) { + if (luaRegisterFunctionReadArgs(lua, func) != C_OK) { + zfree(func); return luaError(lua); } - sds err = NULL; - if (functionLibCreateFunction(register_f_args.name, register_f_args.lua_f_ctx, load_ctx->li, register_f_args.desc, - register_f_args.f_flags, &err) != C_OK) { - luaRegisterFunctionArgsDispose(lua, ®ister_f_args); - luaPushError(lua, err); - sdsfree(err); - return luaError(lua); - } + listAddNodeTail(load_ctx->functions, func); return 0; } @@ -494,16 +548,17 @@ int luaEngineInitEngine(void) { lua_enablereadonlytable(lua_engine_ctx->lua, -1, 1); /* protect the new global table */ lua_replace(lua_engine_ctx->lua, LUA_GLOBALSINDEX); /* set new global table as the new globals */ - - engine *lua_engine = zmalloc(sizeof(*lua_engine)); - *lua_engine = (engine){ - .engine_ctx = lua_engine_ctx, - .create = luaEngineCreate, - .call = luaEngineCall, - .get_used_memory = luaEngineGetUsedMemoy, + engineMethods lua_engine_methods = { + .version = VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION, + .create_functions_library = luaEngineCreate, + .call_function = luaEngineCall, .get_function_memory_overhead = luaEngineFunctionMemoryOverhead, - .get_engine_memory_overhead = luaEngineMemoryOverhead, .free_function = luaEngineFreeFunction, + .get_memory_info = luaEngineGetMemoryInfo, }; - return functionsRegisterEngine(LUA_ENGINE_NAME, lua_engine); + + return functionsRegisterEngine(LUA_ENGINE_NAME, + NULL, + lua_engine_ctx, + &lua_engine_methods); } diff --git a/src/functions.c b/src/functions.c index feb82d4ab7..0d003f7fac 100644 --- a/src/functions.c +++ b/src/functions.c @@ -31,6 +31,7 @@ #include "sds.h" #include "dict.h" #include "adlist.h" +#include "module.h" #define LOAD_TIMEOUT_MS 500 @@ -117,9 +118,28 @@ static dict *engines = NULL; /* Libraries Ctx. */ static functionsLibCtx *curr_functions_lib_ctx = NULL; +static void setupEngineModuleCtx(engineInfo *ei, client *c) { + if (ei->engineModule != NULL) { + serverAssert(ei->module_ctx != NULL); + moduleScriptingEngineInitContext(ei->module_ctx, ei->engineModule, c); + } +} + +static void teardownEngineModuleCtx(engineInfo *ei) { + if (ei->engineModule != NULL) { + serverAssert(ei->module_ctx != NULL); + moduleFreeContext(ei->module_ctx); + } +} + static size_t functionMallocSize(functionInfo *fi) { - return zmalloc_size(fi) + sdsAllocSize(fi->name) + (fi->desc ? sdsAllocSize(fi->desc) : 0) + - fi->li->ei->engine->get_function_memory_overhead(fi->function); + setupEngineModuleCtx(fi->li->ei, NULL); + size_t size = zmalloc_size(fi) + + sdsAllocSize(fi->name) + + (fi->desc ? 
sdsAllocSize(fi->desc) : 0) + + fi->li->ei->engine->get_function_memory_overhead(fi->li->ei->module_ctx, fi->function); + teardownEngineModuleCtx(fi->li->ei); + return size; } static size_t libraryMallocSize(functionLibInfo *li) { @@ -141,8 +161,12 @@ static void engineFunctionDispose(void *obj) { if (fi->desc) { sdsfree(fi->desc); } + setupEngineModuleCtx(fi->li->ei, NULL); engine *engine = fi->li->ei->engine; - engine->free_function(engine->engine_ctx, fi->function); + engine->free_function(fi->li->ei->module_ctx, + engine->engine_ctx, + fi->function); + teardownEngineModuleCtx(fi->li->ei); zfree(fi); } @@ -233,6 +257,15 @@ functionsLibCtx *functionsLibCtxCreate(void) { return ret; } +void functionsAddEngineStats(engineInfo *ei) { + serverAssert(curr_functions_lib_ctx != NULL); + dictEntry *entry = dictFind(curr_functions_lib_ctx->engines_stats, ei->name); + if (entry == NULL) { + functionsLibEngineStats *stats = zcalloc(sizeof(*stats)); + dictAdd(curr_functions_lib_ctx->engines_stats, ei->name, stats); + } +} + /* * Creating a function inside the given library. * On success, return C_OK. @@ -242,24 +275,34 @@ functionsLibCtx *functionsLibCtxCreate(void) { * the function will verify that the given name is following the naming format * and return an error if its not. */ -int functionLibCreateFunction(sds name, void *function, functionLibInfo *li, sds desc, uint64_t f_flags, sds *err) { - if (functionsVerifyName(name) != C_OK) { - *err = sdsnew("Library names can only contain letters, numbers, or underscores(_) and must be at least one " - "character long"); +static int functionLibCreateFunction(robj *name, + void *function, + functionLibInfo *li, + robj *desc, + uint64_t f_flags, + sds *err) { + serverAssert(name->type == OBJ_STRING); + serverAssert(desc == NULL || desc->type == OBJ_STRING); + + if (functionsVerifyName(name->ptr) != C_OK) { + *err = sdsnew("Function names can only contain letters, numbers, or " + "underscores(_) and must be at least one character long"); return C_ERR; } - if (dictFetchValue(li->functions, name)) { + sds name_sds = sdsdup(name->ptr); + if (dictFetchValue(li->functions, name_sds)) { *err = sdsnew("Function already exists in the library"); + sdsfree(name_sds); return C_ERR; } functionInfo *fi = zmalloc(sizeof(*fi)); *fi = (functionInfo){ - .name = name, + .name = name_sds, .function = function, .li = li, - .desc = desc, + .desc = desc ? sdsdup(desc->ptr) : NULL, .f_flags = f_flags, }; @@ -403,11 +446,24 @@ libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx *functions_l return ret; } -/* Register an engine, should be called once by the engine on startup and give the following: +/* Register an engine, should be called once by the engine on startup and give + * the following: * * - engine_name - name of the engine to register - * - engine_ctx - the engine ctx that should be used by the server to interact with the engine */ -int functionsRegisterEngine(const char *engine_name, engine *engine) { + * + * - engine_module - the valkey module that implements this engine + * + * - engine_ctx - the engine ctx that should be used by the server to interact + * with the engine. + * + * - engine_methods - the struct with the scripting engine callback functions + * pointers. 
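+ *
+ * Returns C_OK on success, or C_ERR if an engine with the same name was
+ * already registered.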
+ * + */ +int functionsRegisterEngine(const char *engine_name, + ValkeyModule *engine_module, + engineCtx *engine_ctx, + engineMethods *engine_methods) { sds engine_name_sds = sdsnew(engine_name); if (dictFetchValue(engines, engine_name_sds)) { serverLog(LL_WARNING, "Same engine was registered twice"); @@ -415,6 +471,16 @@ int functionsRegisterEngine(const char *engine_name, engine *engine) { return C_ERR; } + engine *eng = zmalloc(sizeof(engine)); + *eng = (engine){ + .engine_ctx = engine_ctx, + .create = engine_methods->create_functions_library, + .call = engine_methods->call_function, + .get_function_memory_overhead = engine_methods->get_function_memory_overhead, + .free_function = engine_methods->free_function, + .get_memory_info = engine_methods->get_memory_info, + }; + client *c = createClient(NULL); c->flag.deny_blocking = 1; c->flag.script = 1; @@ -422,15 +488,64 @@ int functionsRegisterEngine(const char *engine_name, engine *engine) { engineInfo *ei = zmalloc(sizeof(*ei)); *ei = (engineInfo){ .name = engine_name_sds, - .engine = engine, + .engineModule = engine_module, + .module_ctx = engine_module ? moduleAllocateContext() : NULL, + .engine = eng, .c = c, }; dictAdd(engines, engine_name_sds, ei); - engine_cache_memory += zmalloc_size(ei) + sdsAllocSize(ei->name) + zmalloc_size(engine) + - engine->get_engine_memory_overhead(engine->engine_ctx); + functionsAddEngineStats(ei); + + setupEngineModuleCtx(ei, NULL); + engineMemoryInfo mem_info = eng->get_memory_info(ei->module_ctx, + eng->engine_ctx); + engine_cache_memory += zmalloc_size(ei) + + sdsAllocSize(ei->name) + + zmalloc_size(eng) + + mem_info.engine_memory_overhead; + + teardownEngineModuleCtx(ei); + + return C_OK; +} + +/* Removes a scripting engine from the server. + * + * - engine_name - name of the engine to remove + */ +int functionsUnregisterEngine(const char *engine_name) { + sds engine_name_sds = sdsnew(engine_name); + dictEntry *entry = dictFind(engines, engine_name_sds); + if (entry == NULL) { + serverLog(LL_WARNING, "There's no engine registered with name %s", engine_name); + sdsfree(engine_name_sds); + return C_ERR; + } + + engineInfo *ei = dictGetVal(entry); + + dictIterator *iter = dictGetSafeIterator(curr_functions_lib_ctx->libraries); + while ((entry = dictNext(iter))) { + functionLibInfo *li = dictGetVal(entry); + if (li->ei == ei) { + libraryUnlink(curr_functions_lib_ctx, li); + engineLibraryFree(li); + } + } + dictReleaseIterator(iter); + + zfree(ei->engine); + sdsfree(ei->name); + freeClient(ei->c); + if (ei->engineModule != NULL) { + serverAssert(ei->module_ctx != NULL); + zfree(ei->module_ctx); + } + zfree(ei); + sdsfree(engine_name_sds); return C_OK; } @@ -649,11 +764,19 @@ static void fcallCommandGeneric(client *c, int ro) { } scriptRunCtx run_ctx; - if (scriptPrepareForRun(&run_ctx, fi->li->ei->c, c, fi->name, fi->f_flags, ro) != C_OK) return; - - engine->call(&run_ctx, engine->engine_ctx, fi->function, c->argv + 3, numkeys, c->argv + 3 + numkeys, + setupEngineModuleCtx(fi->li->ei, run_ctx.original_client); + + engine->call(fi->li->ei->module_ctx, + engine->engine_ctx, + &run_ctx, + fi->function, + c->argv + 3, + numkeys, + c->argv + 3 + numkeys, c->argc - 3 - numkeys); + + teardownEngineModuleCtx(fi->li->ei); scriptResetRun(&run_ctx); } @@ -953,14 +1076,40 @@ void functionFreeLibMetaData(functionsLibMetaData *md) { if (md->engine) sdsfree(md->engine); } +static void freeCompiledFunctions(engineInfo *ei, + compiledFunction **compiled_functions, + size_t num_compiled_functions, + size_t 
free_function_from_idx) { + setupEngineModuleCtx(ei, NULL); + + for (size_t i = 0; i < num_compiled_functions; i++) { + compiledFunction *func = compiled_functions[i]; + decrRefCount(func->name); + if (func->desc) { + decrRefCount(func->desc); + } + if (i >= free_function_from_idx) { + ei->engine->free_function(ei->module_ctx, + ei->engine->engine_ctx, + func->function); + } + zfree(func); + } + + zfree(compiled_functions); + + teardownEngineModuleCtx(ei); +} + /* Compile and save the given library, return the loaded library name on success * and NULL on failure. In case on failure the err out param is set with relevant error message */ sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibCtx *lib_ctx, size_t timeout) { dictIterator *iter = NULL; dictEntry *entry = NULL; - functionLibInfo *new_li = NULL; functionLibInfo *old_li = NULL; functionsLibMetaData md = {0}; + functionLibInfo *new_li = NULL; + if (functionExtractLibMetaData(code, &md, err) != C_OK) { return NULL; } @@ -990,10 +1139,47 @@ sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibC } new_li = engineLibraryCreate(md.name, ei, code); - if (engine->create(engine->engine_ctx, new_li, md.code, timeout, err) != C_OK) { + size_t num_compiled_functions = 0; + char *compile_error = NULL; + setupEngineModuleCtx(ei, NULL); + compiledFunction **compiled_functions = + engine->create(ei->module_ctx, + engine->engine_ctx, + md.code, + timeout, + &num_compiled_functions, + &compile_error); + teardownEngineModuleCtx(ei); + if (compiled_functions == NULL) { + serverAssert(num_compiled_functions == 0); + serverAssert(compile_error != NULL); + *err = sdsnew(compile_error); + zfree(compile_error); goto error; } + for (size_t i = 0; i < num_compiled_functions; i++) { + compiledFunction *func = compiled_functions[i]; + int ret = functionLibCreateFunction(func->name, + func->function, + new_li, + func->desc, + func->f_flags, + err); + if (ret == C_ERR) { + freeCompiledFunctions(ei, + compiled_functions, + num_compiled_functions, + i); + goto error; + } + } + + freeCompiledFunctions(ei, + compiled_functions, + num_compiled_functions, + num_compiled_functions); + if (dictSize(new_li->functions) == 0) { *err = sdsnew("No functions registered"); goto error; @@ -1063,6 +1249,7 @@ void functionLoadCommand(client *c) { timeout = 0; } if (!(library_name = functionsCreateWithLibraryCtx(code->ptr, replace, &err, curr_functions_lib_ctx, timeout))) { + serverAssert(err != NULL); addReplyErrorSds(c, err); return; } @@ -1080,7 +1267,11 @@ unsigned long functionsMemory(void) { while ((entry = dictNext(iter))) { engineInfo *ei = dictGetVal(entry); engine *engine = ei->engine; - engines_memory += engine->get_used_memory(engine->engine_ctx); + setupEngineModuleCtx(ei, NULL); + engineMemoryInfo mem_info = engine->get_memory_info(ei->module_ctx, + engine->engine_ctx); + engines_memory += mem_info.used_memory; + teardownEngineModuleCtx(ei); } dictReleaseIterator(iter); @@ -1120,12 +1311,11 @@ size_t functionsLibCtxFunctionsLen(functionsLibCtx *functions_ctx) { int functionsInit(void) { engines = dictCreate(&engineDictType); + curr_functions_lib_ctx = functionsLibCtxCreate(); + if (luaEngineInitEngine() != C_OK) { return C_ERR; } - /* Must be initialized after engines initialization */ - curr_functions_lib_ctx = functionsLibCtxCreate(); - return C_OK; } diff --git a/src/functions.h b/src/functions.h index b199fbd06e..89e39fdc56 100644 --- a/src/functions.h +++ b/src/functions.h @@ -54,53 +54,68 @@ typedef struct 
functionLibInfo functionLibInfo;

+/* ValkeyModule type aliases for scripting engine structs and types. */
+typedef ValkeyModuleScriptingEngineCtx engineCtx;
+typedef ValkeyModuleScriptingEngineFunctionCtx functionCtx;
+typedef ValkeyModuleScriptingEngineCompiledFunction compiledFunction;
+typedef ValkeyModuleScriptingEngineMemoryInfo engineMemoryInfo;
+typedef ValkeyModuleScriptingEngineMethods engineMethods;
+
typedef struct engine {
    /* engine specific context */
-    void *engine_ctx;
-
-    /* Create function callback, get the engine_ctx, and function code
-     * engine_ctx - opaque struct that was created on engine initialization
-     * li - library information that need to be provided and when add functions
-     * code - the library code
-     * timeout - timeout for the library creation (0 for no timeout)
-     * err - description of error (if occurred)
-     * returns C_ERR on error and set err to be the error message */
-    int (*create)(void *engine_ctx, functionLibInfo *li, sds code, size_t timeout, sds *err);
-
-    /* Invoking a function, r_ctx is an opaque object (from engine POV).
-     * The r_ctx should be used by the engine to interaction with the server,
+    engineCtx *engine_ctx;
+
+    /* Compiles the script code and returns an array of compiled functions
+     * registered in the script.
+     *
+     * Returns NULL on error and set err to be the error message */
+    compiledFunction **(*create)(
+        ValkeyModuleCtx *module_ctx,
+        engineCtx *engine_ctx,
+        const char *code,
+        size_t timeout,
+        size_t *out_num_compiled_functions,
+        char **err);
+
+    /* Invoking a function, func_ctx is an opaque object (from engine POV).
+     * The func_ctx should be used by the engine to interact with the server,
      * such interaction could be running commands, set resp, or set
      * replication mode
     */
-    void (*call)(scriptRunCtx *r_ctx,
-                 void *engine_ctx,
+    void (*call)(ValkeyModuleCtx *module_ctx,
+                 engineCtx *engine_ctx,
+                 functionCtx *func_ctx,
                 void *compiled_function,
                 robj **keys,
                 size_t nkeys,
                 robj **args,
                 size_t nargs);

-    /* get current used memory by the engine */
-    size_t (*get_used_memory)(void *engine_ctx);
+    /* free the given function */
+    void (*free_function)(ValkeyModuleCtx *module_ctx,
+                          engineCtx *engine_ctx,
+                          void *compiled_function);

    /* Return memory overhead for a given function,
     * such memory is not counted as engine memory but as general
     * structs memory that hold different information */
-    size_t (*get_function_memory_overhead)(void *compiled_function);
+    size_t (*get_function_memory_overhead)(ValkeyModuleCtx *module_ctx,
+                                           void *compiled_function);

-    /* Return memory overhead for engine (struct size holding the engine)*/
-    size_t (*get_engine_memory_overhead)(void *engine_ctx);
+    /* Get the current used memory by the engine */
+    engineMemoryInfo (*get_memory_info)(ValkeyModuleCtx *module_ctx,
+                                        engineCtx *engine_ctx);

-    /* free the given function */
-    void (*free_function)(void *engine_ctx, void *compiled_function);
} engine;

/* Hold information about an engine.
 * Used on rdb.c so it must be declared here.
*/ typedef struct engineInfo { - sds name; /* Name of the engine */ - engine *engine; /* engine callbacks that allows to interact with the engine */ - client *c; /* Client that is used to run commands */ + sds name; /* Name of the engine */ + ValkeyModule *engineModule; /* the module that implements the scripting engine */ + ValkeyModuleCtx *module_ctx; /* Scripting engine module context */ + engine *engine; /* engine callbacks that allows to interact with the engine */ + client *c; /* Client that is used to run commands */ } engineInfo; /* Hold information about the specific function. @@ -123,7 +138,12 @@ struct functionLibInfo { sds code; /* Library code */ }; -int functionsRegisterEngine(const char *engine_name, engine *engine_ctx); +int functionsRegisterEngine(const char *engine_name, + ValkeyModule *engine_module, + void *engine_ctx, + engineMethods *engine_methods); +int functionsUnregisterEngine(const char *engine_name); + sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibCtx *lib_ctx, size_t timeout); unsigned long functionsMemory(void); unsigned long functionsMemoryOverhead(void); @@ -138,8 +158,6 @@ void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx); void functionsLibCtxClear(functionsLibCtx *lib_ctx, void(callback)(dict *)); void functionsLibCtxSwapWithCurrent(functionsLibCtx *new_lib_ctx, int async); -int functionLibCreateFunction(sds name, void *function, functionLibInfo *li, sds desc, uint64_t f_flags, sds *err); - int luaEngineInitEngine(void); int functionsInit(void); diff --git a/src/module.c b/src/module.c index 541ae490ab..db493dd8bc 100644 --- a/src/module.c +++ b/src/module.c @@ -62,6 +62,7 @@ #include "crc16_slottable.h" #include "valkeymodule.h" #include "io_threads.h" +#include "functions.h" #include #include #include @@ -879,6 +880,15 @@ void moduleCallCommandUnblockedHandler(client *c) { moduleReleaseTempClient(c); } +/* Allocates the memory necessary to hold the ValkeyModuleCtx structure, and + * returns the pointer to the allocated memory. + * + * Used by the scripting engines implementation to cache the context structure. + */ +ValkeyModuleCtx *moduleAllocateContext(void) { + return (ValkeyModuleCtx *)zcalloc(sizeof(ValkeyModuleCtx)); +} + /* Create a module ctx and keep track of the nesting level. * * Note: When creating ctx for threads (VM_GetThreadSafeContext and @@ -921,6 +931,16 @@ void moduleCreateContext(ValkeyModuleCtx *out_ctx, ValkeyModule *module, int ctx } } +/* Initialize a module context to be used by scripting engines callback + * functions. + */ +void moduleScriptingEngineInitContext(ValkeyModuleCtx *out_ctx, + ValkeyModule *module, + client *client) { + moduleCreateContext(out_ctx, module, VALKEYMODULE_CTX_NONE); + out_ctx->client = client; +} + /* This command binds the normal command invocation with commands * exported by modules. */ void ValkeyModuleCommandDispatcher(client *c) { @@ -13074,6 +13094,60 @@ int VM_RdbSave(ValkeyModuleCtx *ctx, ValkeyModuleRdbStream *stream, int flags) { return VALKEYMODULE_OK; } +/* Registers a new scripting engine in the server. + * + * - `module_ctx`: the module context object. + * + * - `engine_name`: the name of the scripting engine. This name will match + * against the engine name specified in the script header using a shebang. + * + * - `engine_ctx`: engine specific context pointer. + * + * - `engine_methods`: the struct with the scripting engine callback functions + * pointers. 
+ * + * Returns VALKEYMODULE_OK if the engine is successfully registered, and + * VALKEYMODULE_ERR in case some failure occurs. In case of a failure, an error + * message is logged. + */ +int VM_RegisterScriptingEngine(ValkeyModuleCtx *module_ctx, + const char *engine_name, + ValkeyModuleScriptingEngineCtx *engine_ctx, + ValkeyModuleScriptingEngineMethods *engine_methods) { + serverLog(LL_DEBUG, "Registering a new scripting engine: %s", engine_name); + + if (engine_methods->version > VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION) { + serverLog(LL_WARNING, "The engine implementation version is greater " + "than what this server supports. Server ABI " + "Version: %lu, Engine ABI version: %lu", + VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION, + (unsigned long)engine_methods->version); + return VALKEYMODULE_ERR; + } + + if (functionsRegisterEngine(engine_name, + module_ctx->module, + engine_ctx, + engine_methods) != C_OK) { + return VALKEYMODULE_ERR; + } + + return VALKEYMODULE_OK; +} + +/* Removes the scripting engine from the server. + * + * `engine_name` is the name of the scripting engine. + * + * Returns VALKEYMODULE_OK. + * + */ +int VM_UnregisterScriptingEngine(ValkeyModuleCtx *ctx, const char *engine_name) { + UNUSED(ctx); + functionsUnregisterEngine(engine_name); + return VALKEYMODULE_OK; +} + /* MODULE command. * * MODULE LIST @@ -13944,4 +14018,6 @@ void moduleRegisterCoreAPI(void) { REGISTER_API(RdbStreamFree); REGISTER_API(RdbLoad); REGISTER_API(RdbSave); + REGISTER_API(RegisterScriptingEngine); + REGISTER_API(UnregisterScriptingEngine); } diff --git a/src/module.h b/src/module.h new file mode 100644 index 0000000000..f61ef1e3cb --- /dev/null +++ b/src/module.h @@ -0,0 +1,17 @@ +#ifndef _MODULE_H_ +#define _MODULE_H_ + +/* This header file exposes a set of functions defined in module.c that are + * not part of the module API, but are used by the core to interact with modules + */ + +typedef struct ValkeyModuleCtx ValkeyModuleCtx; +typedef struct ValkeyModule ValkeyModule; + +ValkeyModuleCtx *moduleAllocateContext(void); +void moduleScriptingEngineInitContext(ValkeyModuleCtx *out_ctx, + ValkeyModule *module, + client *client); +void moduleFreeContext(ValkeyModuleCtx *ctx); + +#endif /* _MODULE_H_ */ diff --git a/src/script.h b/src/script.h index 7fff34a40b..194cc8bd05 100644 --- a/src/script.h +++ b/src/script.h @@ -67,6 +67,8 @@ #define SCRIPT_ALLOW_CROSS_SLOT (1ULL << 8) /* Indicate that the current script may access keys from multiple slots */ typedef struct scriptRunCtx scriptRunCtx; +/* This struct stores the necessary information to manage the execution of + * scripts using EVAL and FCALL. */ struct scriptRunCtx { const char *funcname; client *c; diff --git a/src/script_lua.c b/src/script_lua.c index 5093fa944f..29d352d44b 100644 --- a/src/script_lua.c +++ b/src/script_lua.c @@ -1258,15 +1258,15 @@ static void luaLoadLibraries(lua_State *lua) { /* Return sds of the string value located on stack at the given index. * Return NULL if the value is not a string. 
*/ -sds luaGetStringSds(lua_State *lua, int index) { +robj *luaGetStringObject(lua_State *lua, int index) { if (!lua_isstring(lua, index)) { return NULL; } size_t len; const char *str = lua_tolstring(lua, index, &len); - sds str_sds = sdsnewlen(str, len); - return str_sds; + robj *str_obj = createStringObject(str, len); + return str_obj; } static int luaProtectedTableError(lua_State *lua) { diff --git a/src/script_lua.h b/src/script_lua.h index 35edf46af6..6c60754bbc 100644 --- a/src/script_lua.h +++ b/src/script_lua.h @@ -67,7 +67,7 @@ typedef struct errorInfo { } errorInfo; void luaRegisterServerAPI(lua_State *lua); -sds luaGetStringSds(lua_State *lua, int index); +robj *luaGetStringObject(lua_State *lua, int index); void luaRegisterGlobalProtectionFunction(lua_State *lua); void luaSetErrorMetatable(lua_State *lua); void luaSetAllowListProtection(lua_State *lua); diff --git a/src/util.c b/src/util.c index 6d99d47e5a..6e44392ce1 100644 --- a/src/util.c +++ b/src/util.c @@ -50,6 +50,7 @@ #include "util.h" #include "sha256.h" #include "config.h" +#include "zmalloc.h" #include "valkey_strtod.h" @@ -1380,3 +1381,23 @@ int snprintf_async_signal_safe(char *to, size_t n, const char *fmt, ...) { va_end(args); return result; } + +/* A printf-like function that returns a freshly allocated string. + * + * This function is similar to asprintf function, but it uses zmalloc for + * allocating the string buffer. */ +char *valkey_asprintf(char const *fmt, ...) { + va_list args; + + va_start(args, fmt); + size_t str_len = vsnprintf(NULL, 0, fmt, args) + 1; + va_end(args); + + char *str = zmalloc(str_len); + + va_start(args, fmt); + vsnprintf(str, str_len, fmt, args); + va_end(args); + + return str; +} diff --git a/src/util.h b/src/util.h index 51eb38f0b4..61095ddb65 100644 --- a/src/util.h +++ b/src/util.h @@ -99,5 +99,6 @@ int snprintf_async_signal_safe(char *to, size_t n, const char *fmt, ...); #endif size_t valkey_strlcpy(char *dst, const char *src, size_t dsize); size_t valkey_strlcat(char *dst, const char *src, size_t dsize); +char *valkey_asprintf(char const *fmt, ...); #endif diff --git a/src/valkeymodule.h b/src/valkeymodule.h index 7c3adfd477..1d99d2ff7a 100644 --- a/src/valkeymodule.h +++ b/src/valkeymodule.h @@ -783,6 +783,7 @@ typedef enum { } ValkeyModuleACLLogEntryReason; /* Incomplete structures needed by both the core and modules. */ +typedef struct ValkeyModuleCtx ValkeyModuleCtx; typedef struct ValkeyModuleIO ValkeyModuleIO; typedef struct ValkeyModuleDigest ValkeyModuleDigest; typedef struct ValkeyModuleInfoCtx ValkeyModuleInfoCtx; @@ -794,6 +795,93 @@ typedef void (*ValkeyModuleInfoFunc)(ValkeyModuleInfoCtx *ctx, int for_crash_rep typedef void (*ValkeyModuleDefragFunc)(ValkeyModuleDefragCtx *ctx); typedef void (*ValkeyModuleUserChangedFunc)(uint64_t client_id, void *privdata); +/* Current ABI version for scripting engine modules. */ +#define VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION 1UL + +/* Type definitions for implementing scripting engines modules. */ +typedef void ValkeyModuleScriptingEngineCtx; +typedef void ValkeyModuleScriptingEngineFunctionCtx; + +/* This struct represents a scripting engine function that results from the + * compilation of a script by the engine implementation. + * + * IMPORTANT: If we ever need to add/remove fields from this struct, we need + * to bump the version number defined in the + * `VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION` constant. 
+ */
+typedef struct ValkeyModuleScriptingEngineCompiledFunction {
+    ValkeyModuleString *name; /* Function name */
+    void *function;           /* Opaque object representing a function, usually it's
+                                 the function compiled code. */
+    ValkeyModuleString *desc; /* Function description */
+    uint64_t f_flags;         /* Function flags */
+} ValkeyModuleScriptingEngineCompiledFunction;
+
+/* This struct is used to return the memory information of the scripting
+ * engine. */
+typedef struct ValkeyModuleScriptingEngineMemoryInfo {
+    /* The memory used by the scripting engine runtime. */
+    size_t used_memory;
+    /* The memory used by the scripting engine data structures. */
+    size_t engine_memory_overhead;
+} ValkeyModuleScriptingEngineMemoryInfo;
+
+typedef ValkeyModuleScriptingEngineCompiledFunction **(*ValkeyModuleScriptingEngineCreateFunctionsLibraryFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx,
+    const char *code,
+    size_t timeout,
+    size_t *out_num_compiled_functions,
+    char **err);
+
+typedef void (*ValkeyModuleScriptingEngineCallFunctionFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx,
+    ValkeyModuleScriptingEngineFunctionCtx *func_ctx,
+    void *compiled_function,
+    ValkeyModuleString **keys,
+    size_t nkeys,
+    ValkeyModuleString **args,
+    size_t nargs);
+
+typedef size_t (*ValkeyModuleScriptingEngineGetFunctionMemoryOverheadFunc)(
+    ValkeyModuleCtx *module_ctx,
+    void *compiled_function);
+
+typedef void (*ValkeyModuleScriptingEngineFreeFunctionFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx,
+    void *compiled_function);
+
+typedef ValkeyModuleScriptingEngineMemoryInfo (*ValkeyModuleScriptingEngineGetMemoryInfoFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx);
+
+typedef struct ValkeyModuleScriptingEngineMethodsV1 {
+    uint64_t version; /* Version of this structure for ABI compat. */
+
+    /* Library create function callback. When a new script is loaded, this
+     * callback will be called with the script code, and returns a list of
+     * ValkeyModuleScriptingEngineCompiledFunc objects. */
+    ValkeyModuleScriptingEngineCreateFunctionsLibraryFunc create_functions_library;
+
+    /* Function callback to free the memory of a registered engine function. */
+    ValkeyModuleScriptingEngineFreeFunctionFunc free_function;
+
+    /* The callback function called when `FCALL` command is called on a function
+     * registered in this engine. */
+    ValkeyModuleScriptingEngineCallFunctionFunc call_function;
+
+    /* Function callback to return memory overhead for a given function. */
+    ValkeyModuleScriptingEngineGetFunctionMemoryOverheadFunc get_function_memory_overhead;
+
+    /* Function callback to get the used memory by the engine. */
+    ValkeyModuleScriptingEngineGetMemoryInfoFunc get_memory_info;
+
+} ValkeyModuleScriptingEngineMethodsV1;
+
+#define ValkeyModuleScriptingEngineMethods ValkeyModuleScriptingEngineMethodsV1
+
/* ------------------------- End of common defines ------------------------ */

/* ----------- The rest of the defines are only for modules ----------------- */
@@ -826,7 +914,6 @@ typedef void (*ValkeyModuleUserChangedFunc)(uint64_t client_id, void *privdata);
#endif

/* Incomplete structures for compiler checks but opaque access.
 */
-typedef struct ValkeyModuleCtx ValkeyModuleCtx;
typedef struct ValkeyModuleCommand ValkeyModuleCommand;
typedef struct ValkeyModuleCallReply ValkeyModuleCallReply;
typedef struct ValkeyModuleType ValkeyModuleType;
@@ -1650,6 +1737,14 @@ VALKEYMODULE_API int (*ValkeyModule_RdbSave)(ValkeyModuleCtx *ctx,
                                             ValkeyModuleRdbStream *stream,
                                             int flags) VALKEYMODULE_ATTR;

+VALKEYMODULE_API int (*ValkeyModule_RegisterScriptingEngine)(ValkeyModuleCtx *module_ctx,
+                                                             const char *engine_name,
+                                                             ValkeyModuleScriptingEngineCtx *engine_ctx,
+                                                             ValkeyModuleScriptingEngineMethods *engine_methods) VALKEYMODULE_ATTR;
+
+VALKEYMODULE_API int (*ValkeyModule_UnregisterScriptingEngine)(ValkeyModuleCtx *module_ctx,
+                                                               const char *engine_name) VALKEYMODULE_ATTR;
+
#define ValkeyModule_IsAOFClient(id) ((id) == UINT64_MAX)

/* This is included inline inside each Valkey module. */
@@ -2017,6 +2112,8 @@ static int ValkeyModule_Init(ValkeyModuleCtx *ctx, const char *name, int ver, in
    VALKEYMODULE_GET_API(RdbStreamFree);
    VALKEYMODULE_GET_API(RdbLoad);
    VALKEYMODULE_GET_API(RdbSave);
+    VALKEYMODULE_GET_API(RegisterScriptingEngine);
+    VALKEYMODULE_GET_API(UnregisterScriptingEngine);

    if (ValkeyModule_IsModuleNameBusy && ValkeyModule_IsModuleNameBusy(name)) return VALKEYMODULE_ERR;
    ValkeyModule_SetModuleAttribs(ctx, name, ver, apiver);
diff --git a/tests/modules/CMakeLists.txt b/tests/modules/CMakeLists.txt
index 0cac0c4cb6..e98a878c9d 100644
--- a/tests/modules/CMakeLists.txt
+++ b/tests/modules/CMakeLists.txt
@@ -40,6 +40,7 @@ list(APPEND MODULES_LIST "moduleauthtwo")
list(APPEND MODULES_LIST "rdbloadsave")
list(APPEND MODULES_LIST "crash")
list(APPEND MODULES_LIST "cluster")
+list(APPEND MODULES_LIST "helloscripting")

foreach (MODULE_NAME ${MODULES_LIST})
    message(STATUS "Building test module: ${MODULE_NAME}")
diff --git a/tests/modules/Makefile b/tests/modules/Makefile
index 82813bb6f7..963546a9ff 100644
--- a/tests/modules/Makefile
+++ b/tests/modules/Makefile
@@ -65,7 +65,8 @@ TEST_MODULES = \
    moduleauthtwo.so \
    rdbloadsave.so \
    crash.so \
-    cluster.so
+    cluster.so \
+    helloscripting.so

.PHONY: all

diff --git a/tests/modules/helloscripting.c b/tests/modules/helloscripting.c
new file mode 100644
index 0000000000..fdca6c8e91
--- /dev/null
+++ b/tests/modules/helloscripting.c
@@ -0,0 +1,383 @@
+#include "valkeymodule.h"
+
+#include
+#include
+#include
+
+/*
+ * This module implements a very simple stack based scripting language.
+ * Its purpose is only to test the valkey module API to implement scripting
+ * engines.
+ *
+ * The language is called HELLO, and a program in this language is formed by
+ * a list of function definitions.
+ * The language only supports 32-bit integers, and it only allows returning an
+ * integer constant, or returning the value passed as the first argument to the
+ * function.
+ *
+ * Example of a program:
+ *
+ * ```
+ * FUNCTION foo # declaration of function 'foo'
+ * ARGS 0       # pushes the value in the first argument to the top of the
+ *              # stack
+ * RETURN       # returns the current value on the top of the stack and marks
+ *              # the end of the function declaration
+ *
+ * FUNCTION bar # declaration of function 'bar'
+ * CONSTI 432   # pushes the value 432 to the top of the stack
+ * RETURN       # returns the current value on the top of the stack and marks
+ *              # the end of the function declaration.
+ * ```
+ */
+
+/*
+ * List of instructions of the HELLO language.
+ */
+typedef enum HelloInstKind {
+    FUNCTION = 0,
+    CONSTI,
+    ARGS,
+    RETURN,
+    _NUM_INSTRUCTIONS, // Not a real instruction.
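+                       // Also used as the "unknown instruction" sentinel
+                       // returned by helloLangParseInstruction().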
+} HelloInstKind;
+
+/*
+ * String representations of the instructions above.
+ */
+const char *HelloInstKindStr[] = {
+ "FUNCTION",
+ "CONSTI",
+ "ARGS",
+ "RETURN",
+};
+
+/*
+ * Struct that represents an instance of an instruction.
+ * Instructions may have at most one parameter.
+ */
+typedef struct HelloInst {
+ HelloInstKind kind;
+ union {
+ uint32_t integer;
+ const char *string;
+ } param;
+} HelloInst;
+
+/*
+ * Struct that represents an instance of a function.
+ * A function is just a list of instruction instances.
+ */
+typedef struct HelloFunc {
+ char *name;
+ HelloInst instructions[256];
+ uint32_t num_instructions;
+} HelloFunc;
+
+/*
+ * Struct that represents an instance of a HELLO program.
+ * A program is just a list of function instances.
+ */
+typedef struct HelloProgram {
+ HelloFunc *functions[16];
+ uint32_t num_functions;
+} HelloProgram;
+
+/*
+ * Struct that represents the runtime context of a HELLO program.
+ */
+typedef struct HelloLangCtx {
+ HelloProgram *program;
+} HelloLangCtx;
+
+
+static HelloLangCtx *hello_ctx = NULL;
+
+
+static uint32_t str2int(const char *str) {
+ char *end;
+ errno = 0;
+ uint32_t val = (uint32_t)strtoul(str, &end, 10);
+ ValkeyModule_Assert(errno == 0);
+ return val;
+}
+
+/*
+ * Parses the kind of instruction that the current token points to.
+ */
+static HelloInstKind helloLangParseInstruction(const char *token) {
+ for (HelloInstKind i = 0; i < _NUM_INSTRUCTIONS; i++) {
+ if (strcmp(HelloInstKindStr[i], token) == 0) {
+ return i;
+ }
+ }
+ return _NUM_INSTRUCTIONS;
+}
+
+/*
+ * Parses the function param.
+ */
+static void helloLangParseFunction(HelloFunc *func) {
+ char *token = strtok(NULL, " \n");
+ ValkeyModule_Assert(token != NULL);
+ func->name = ValkeyModule_Alloc(sizeof(char) * strlen(token) + 1);
+ strcpy(func->name, token);
+}
+
+/*
+ * Parses an integer parameter.
+ */
+static void helloLangParseIntegerParam(HelloFunc *func) {
+ char *token = strtok(NULL, " \n");
+ func->instructions[func->num_instructions].param.integer = str2int(token);
+}
+
+/*
+ * Parses the CONSTI instruction parameter.
+ */
+static void helloLangParseConstI(HelloFunc *func) {
+ helloLangParseIntegerParam(func);
+ func->num_instructions++;
+}
+
+/*
+ * Parses the ARGS instruction parameter.
+ */
+static void helloLangParseArgs(HelloFunc *func) {
+ helloLangParseIntegerParam(func);
+ func->num_instructions++;
+}
+
+/*
+ * Parses a HELLO program's source code.
+ */
+static HelloProgram *helloLangParseCode(const char *code,
+ HelloProgram *program) {
+ char *_code = ValkeyModule_Alloc(sizeof(char) * strlen(code) + 1);
+ strcpy(_code, code);
+
+ HelloFunc *currentFunc = NULL;
+
+ char *token = strtok(_code, " \n");
+ while (token != NULL) {
+ HelloInstKind kind = helloLangParseInstruction(token);
+
+ if (currentFunc != NULL) {
+ currentFunc->instructions[currentFunc->num_instructions].kind = kind;
+ }
+
+ switch (kind) {
+ case FUNCTION:
+ ValkeyModule_Assert(currentFunc == NULL);
+ currentFunc = ValkeyModule_Alloc(sizeof(HelloFunc));
+ memset(currentFunc, 0, sizeof(HelloFunc));
+ program->functions[program->num_functions++] = currentFunc;
+ helloLangParseFunction(currentFunc);
+ break;
+ case CONSTI:
+ ValkeyModule_Assert(currentFunc != NULL);
+ helloLangParseConstI(currentFunc);
+ break;
+ case ARGS:
+ ValkeyModule_Assert(currentFunc != NULL);
+ helloLangParseArgs(currentFunc);
+ break;
+ case RETURN:
+ ValkeyModule_Assert(currentFunc != NULL);
+ currentFunc->num_instructions++;
+ currentFunc = NULL;
+ break;
+ default:
+ ValkeyModule_Assert(0);
+ }
+
+ token = strtok(NULL, " \n");
+ }
+
+ ValkeyModule_Free(_code);
+
+ return program;
+}
+
+/*
+ * Executes a HELLO function.
+ */
+static uint32_t executeHelloLangFunction(HelloFunc *func,
+ ValkeyModuleString **args, int nargs) {
+ uint32_t stack[64];
+ int sp = 0;
+
+ for (uint32_t pc = 0; pc < func->num_instructions; pc++) {
+ HelloInst instr = func->instructions[pc];
+ switch (instr.kind) {
+ case CONSTI:
+ stack[sp++] = instr.param.integer;
+ break;
+ case ARGS:
+ uint32_t idx = instr.param.integer;
+ ValkeyModule_Assert(idx < (uint32_t)nargs);
+ size_t len;
+ const char *argStr = ValkeyModule_StringPtrLen(args[idx], &len);
+ uint32_t arg = str2int(argStr);
+ stack[sp++] = arg;
+ break;
+ case RETURN:
+ uint32_t val = stack[--sp];
+ ValkeyModule_Assert(sp == 0);
+ return val;
+ case FUNCTION:
+ default:
+ ValkeyModule_Assert(0);
+ }
+ }
+
+ ValkeyModule_Assert(0);
+ return 0;
+}
+
+static ValkeyModuleScriptingEngineMemoryInfo engineGetMemoryInfo(ValkeyModuleCtx *module_ctx,
+ ValkeyModuleScriptingEngineCtx *engine_ctx) {
+ VALKEYMODULE_NOT_USED(module_ctx);
+ HelloLangCtx *ctx = (HelloLangCtx *)engine_ctx;
+ ValkeyModuleScriptingEngineMemoryInfo mem_info = {0};
+
+ if (ctx->program != NULL) {
+ mem_info.used_memory += ValkeyModule_MallocSize(ctx->program);
+
+ for (uint32_t i = 0; i < ctx->program->num_functions; i++) {
+ HelloFunc *func = ctx->program->functions[i];
+ mem_info.used_memory += ValkeyModule_MallocSize(func);
+ mem_info.used_memory += ValkeyModule_MallocSize(func->name);
+ }
+ }
+
+ mem_info.engine_memory_overhead = ValkeyModule_MallocSize(ctx);
+ if (ctx->program != NULL) {
+ mem_info.engine_memory_overhead += ValkeyModule_MallocSize(ctx->program);
+ }
+
+ return mem_info;
+}
+
+static size_t engineFunctionMemoryOverhead(ValkeyModuleCtx *module_ctx,
+ void *compiled_function) {
+ VALKEYMODULE_NOT_USED(module_ctx);
+ HelloFunc *func = (HelloFunc *)compiled_function;
+ return ValkeyModule_MallocSize(func->name);
+}
+
+static void engineFreeFunction(ValkeyModuleCtx *module_ctx,
+ ValkeyModuleScriptingEngineCtx *engine_ctx,
+ void *compiled_function) {
+ VALKEYMODULE_NOT_USED(module_ctx);
+ VALKEYMODULE_NOT_USED(engine_ctx);
+ HelloFunc *func = (HelloFunc *)compiled_function;
+ ValkeyModule_Free(func->name);
+ func->name = NULL;
+ ValkeyModule_Free(func);
+}
+
+static ValkeyModuleScriptingEngineCompiledFunction **createHelloLangEngine(ValkeyModuleCtx *module_ctx,
+
ValkeyModuleScriptingEngineCtx *engine_ctx, + const char *code, + size_t timeout, + size_t *out_num_compiled_functions, + char **err) { + VALKEYMODULE_NOT_USED(module_ctx); + VALKEYMODULE_NOT_USED(timeout); + VALKEYMODULE_NOT_USED(err); + + HelloLangCtx *ctx = (HelloLangCtx *)engine_ctx; + + if (ctx->program == NULL) { + ctx->program = ValkeyModule_Alloc(sizeof(HelloProgram)); + memset(ctx->program, 0, sizeof(HelloProgram)); + } else { + ctx->program->num_functions = 0; + } + + ctx->program = helloLangParseCode(code, ctx->program); + + ValkeyModuleScriptingEngineCompiledFunction **compiled_functions = + ValkeyModule_Alloc(sizeof(ValkeyModuleScriptingEngineCompiledFunction *) * ctx->program->num_functions); + + for (uint32_t i = 0; i < ctx->program->num_functions; i++) { + HelloFunc *func = ctx->program->functions[i]; + + ValkeyModuleScriptingEngineCompiledFunction *cfunc = + ValkeyModule_Alloc(sizeof(ValkeyModuleScriptingEngineCompiledFunction)); + *cfunc = (ValkeyModuleScriptingEngineCompiledFunction) { + .name = ValkeyModule_CreateString(NULL, func->name, strlen(func->name)), + .function = func, + .desc = NULL, + .f_flags = 0, + }; + + compiled_functions[i] = cfunc; + } + + *out_num_compiled_functions = ctx->program->num_functions; + + return compiled_functions; +} + +static void +callHelloLangFunction(ValkeyModuleCtx *module_ctx, + ValkeyModuleScriptingEngineCtx *engine_ctx, + ValkeyModuleScriptingEngineFunctionCtx *func_ctx, + void *compiled_function, + ValkeyModuleString **keys, size_t nkeys, + ValkeyModuleString **args, size_t nargs) { + VALKEYMODULE_NOT_USED(engine_ctx); + VALKEYMODULE_NOT_USED(func_ctx); + VALKEYMODULE_NOT_USED(keys); + VALKEYMODULE_NOT_USED(nkeys); + + HelloFunc *func = (HelloFunc *)compiled_function; + uint32_t result = executeHelloLangFunction(func, args, nargs); + + ValkeyModule_ReplyWithLongLong(module_ctx, result); +} + +int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, + int argc) { + VALKEYMODULE_NOT_USED(argv); + VALKEYMODULE_NOT_USED(argc); + + if (ValkeyModule_Init(ctx, "helloengine", 1, VALKEYMODULE_APIVER_1) == + VALKEYMODULE_ERR) + return VALKEYMODULE_ERR; + + hello_ctx = ValkeyModule_Alloc(sizeof(HelloLangCtx)); + hello_ctx->program = NULL; + + ValkeyModuleScriptingEngineMethods methods = { + .version = VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION, + .create_functions_library = createHelloLangEngine, + .call_function = callHelloLangFunction, + .get_function_memory_overhead = engineFunctionMemoryOverhead, + .free_function = engineFreeFunction, + .get_memory_info = engineGetMemoryInfo, + }; + + ValkeyModule_RegisterScriptingEngine(ctx, + "HELLO", + hello_ctx, + &methods); + + return VALKEYMODULE_OK; +} + +int ValkeyModule_OnUnload(ValkeyModuleCtx *ctx) { + if (ValkeyModule_UnregisterScriptingEngine(ctx, "HELLO") != VALKEYMODULE_OK) { + ValkeyModule_Log(ctx, "error", "Failed to unregister engine"); + return VALKEYMODULE_ERR; + } + + ValkeyModule_Free(hello_ctx->program); + hello_ctx->program = NULL; + ValkeyModule_Free(hello_ctx); + hello_ctx = NULL; + + return VALKEYMODULE_OK; +} diff --git a/tests/unit/functions.tcl b/tests/unit/functions.tcl index 7ddd36dd7d..1636baaf6d 100644 --- a/tests/unit/functions.tcl +++ b/tests/unit/functions.tcl @@ -604,7 +604,7 @@ start_server {tags {"scripting"}} { } } e set _ $e - } {*Library names can only contain letters, numbers, or underscores(_) and must be at least one character long*} + } {*Function names can only contain letters, numbers, or underscores(_) and must be at least one character 
long*} test {LIBRARIES - test registration with empty name} { catch { @@ -613,7 +613,7 @@ start_server {tags {"scripting"}} { } } e set _ $e - } {*Library names can only contain letters, numbers, or underscores(_) and must be at least one character long*} + } {*Function names can only contain letters, numbers, or underscores(_) and must be at least one character long*} test {LIBRARIES - math.random from function load} { catch { diff --git a/tests/unit/moduleapi/scriptingengine.tcl b/tests/unit/moduleapi/scriptingengine.tcl new file mode 100644 index 0000000000..c350633dd8 --- /dev/null +++ b/tests/unit/moduleapi/scriptingengine.tcl @@ -0,0 +1,126 @@ +set testmodule [file normalize tests/modules/helloscripting.so] + +set HELLO_PROGRAM "#!hello name=mylib\nFUNCTION foo\nARGS 0\nRETURN\nFUNCTION bar\nCONSTI 432\nRETURN" + +start_server {tags {"modules"}} { + r module load $testmodule + + r function load $HELLO_PROGRAM + + test {Load script with invalid library name} { + assert_error {ERR Library names can only contain letters, numbers, or underscores(_) and must be at least one character long} {r function load "#!hello name=my-lib\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with existing library} { + assert_error {ERR Library 'mylib' already exists} {r function load $HELLO_PROGRAM} + } + + test {Load script with invalid engine} { + assert_error {ERR Engine 'wasm' not found} {r function load "#!wasm name=mylib2\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with no functions} { + assert_error {ERR No functions registered} {r function load "#!hello name=mylib2\n"} + } + + test {Load script with duplicate function} { + assert_error {ERR Function foo already exists} {r function load "#!hello name=mylib2\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with no metadata header} { + assert_error {ERR Missing library metadata} {r function load "FUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with header without lib name} { + assert_error {ERR Library name was not given} {r function load "#!hello \n"} + } + + test {Load script with header with unknown param} { + assert_error {ERR Invalid metadata value given: nme=mylib} {r function load "#!hello nme=mylib\n"} + } + + test {Load script with header with lib name passed twice} { + assert_error {ERR Invalid metadata value, name argument was given multiple times} {r function load "#!hello name=mylib2 name=mylib3\n"} + } + + test {Load script with invalid function name} { + assert_error {ERR Function names can only contain letters, numbers, or underscores(_) and must be at least one character long} {r function load "#!hello name=mylib2\nFUNCTION foo-bar\nARGS 0\nRETURN"} + } + + test {Load script with duplicate function} { + assert_error {ERR Function already exists in the library} {r function load "#!hello name=mylib2\nFUNCTION foo\nARGS 0\nRETURN\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Call scripting engine function: calling foo works} { + r fcall foo 0 134 + } {134} + + test {Call scripting engine function: calling bar works} { + r fcall bar 0 + } {432} + + test {Replace function library and call functions} { + set result [r function load replace "#!hello name=mylib\nFUNCTION foo\nARGS 0\nRETURN\nFUNCTION bar\nCONSTI 500\nRETURN"] + assert_equal $result "mylib" + + set result [r fcall foo 0 132] + assert_equal $result 132 + + set result [r fcall bar 0] + assert_equal $result 500 + } + + test {List scripting engine functions} { + r function load replace "#!hello name=mylib\nFUNCTION foobar\nARGS 0\nRETURN" 
+ r function list
+ } {{library_name mylib engine HELLO functions {{name foobar description {} flags {}}}}}
+
+ test {Load a second library and call a function} {
+ r function load "#!hello name=mylib2\nFUNCTION getarg\nARGS 0\nRETURN"
+ set result [r fcall getarg 0 456]
+ assert_equal $result 456
+ }
+
+ test {Delete all libraries and functions} {
+ set result [r function flush]
+ assert_equal $result {OK}
+ r function list
+ } {}
+
+ test {Test the deletion of a single library} {
+ r function load $HELLO_PROGRAM
+ r function load "#!hello name=mylib2\nFUNCTION getarg\nARGS 0\nRETURN"
+
+ set result [r function delete mylib]
+ assert_equal $result {OK}
+
+ set result [r fcall getarg 0 446]
+ assert_equal $result 446
+ }
+
+ test {Test dump and restore function library} {
+ r function load $HELLO_PROGRAM
+
+ set result [r fcall bar 0]
+ assert_equal $result 432
+
+ set dump [r function dump]
+
+ set result [r function flush]
+ assert_equal $result {OK}
+
+ set result [r function restore $dump]
+ assert_equal $result {OK}
+
+ set result [r fcall getarg 0 436]
+ assert_equal $result 436
+
+ set result [r fcall bar 0]
+ assert_equal $result 432
+ }
+
+ test {Unload scripting engine module} {
+ set result [r module unload helloengine]
+ assert_equal $result "OK"
+ }
+}

From 65d054b3caf96fd5c041f74dd0e004a2c71cd1fa Mon Sep 17 00:00:00 2001
From: Binbin
Date: Mon, 23 Dec 2024 05:57:56 +0800
Subject: [PATCH 033/101] Fix switch case compilation error in the new
 helloscripting (#1472)

The curly braces were missing for the variable declarations after the
case labels.

Signed-off-by: Binbin
---
 tests/modules/helloscripting.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/modules/helloscripting.c b/tests/modules/helloscripting.c
index fdca6c8e91..c912164bda 100644
--- a/tests/modules/helloscripting.c
+++ b/tests/modules/helloscripting.c
@@ -213,7 +213,7 @@ static uint32_t executeHelloLangFunction(HelloFunc *func,
 case CONSTI:
 stack[sp++] = instr.param.integer;
 break;
- case ARGS:
+ case ARGS: {
 uint32_t idx = instr.param.integer;
 ValkeyModule_Assert(idx < (uint32_t)nargs);
 size_t len;
@@ -221,10 +221,12 @@ static uint32_t executeHelloLangFunction(HelloFunc *func,
 uint32_t arg = str2int(argStr);
 stack[sp++] = arg;
 break;
- case RETURN:
+ }
+ case RETURN: {
 uint32_t val = stack[--sp];
 ValkeyModule_Assert(sp == 0);
 return val;
+ }
 case FUNCTION:
 default:
 ValkeyModule_Assert(0);

From 070ad88b158560625ccf90204928f54f3607fac2 Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Mon, 23 Dec 2024 21:07:15 -0800
Subject: [PATCH 034/101] Remove readability refactor for failover auth to fix
 clang warning (#1481)

As part of #1463, I made a small refactor between the PR and the daily
test I submitted to try to improve readability by adding a function to
abstract the extraction of the message types. However, that change
apparently caused GCC to throw another warning, so this reverts the
abstraction on just one line.

Signed-off-by: Madelyn Olson
---
 src/cluster_legacy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 9a23527b30..3d838dfe06 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -4361,7 +4361,7 @@ void clusterRequestFailoverAuth(void) {
 /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit
 * in the header to communicate the nodes receiving the message that
 * they should authorized the failover even if the primary is working.
 */
- if (server.cluster->mf_end) getMessageFromSendBlock(msgblock)->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK;
+ if (server.cluster->mf_end) msgblock->data[0].msg.mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK;
 clusterBroadcastMessage(msgblock);
 clusterMsgSendBlockDecrRefCount(msgblock);
 }

From c8e5fc94f7ffdfb522c0904725968494abc109a9 Mon Sep 17 00:00:00 2001
From: Amit Nagler <58042354+naglera@users.noreply.github.com>
Date: Tue, 24 Dec 2024 08:13:25 +0200
Subject: [PATCH 035/101] Reduce dual channel testing time (#1477)

- By not waiting `repl-diskless-sync-delay` when we don't have to, we
can reduce dual channel test execution time by ~30%.
- This commit also drops one test which is not required for regular
sync (`Sync should continue if not all slaves dropped`).
- Skip dual channel test with master diskless disabled because it will
initiate the same synchronization process as the non-dual channel test,
making it redundant.

Before:
```
Execution time of different units:
 171 seconds - integration/dual-channel-replication
 305 seconds - integration/replication-psync

\o/ All tests passed without errors!
```
After:
```
Execution time of different units:
 120 seconds - integration/dual-channel-replication
 236 seconds - integration/replication-psync

\o/ All tests passed without errors!
```
Discussed on https://github.com/valkey-io/valkey/pull/1173

---------

Signed-off-by: naglera
---
 .../integration/dual-channel-replication.tcl | 60 ++++++++++---------
 tests/integration/replication-psync.tcl | 4 ++
 2 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl
index 8191b9f699..b4b9286d68 100644
--- a/tests/integration/dual-channel-replication.tcl
+++ b/tests/integration/dual-channel-replication.tcl
@@ -110,6 +110,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 $primary config set rdb-key-save-delay 200
 $primary config set dual-channel-replication-enabled yes
+ $primary config set repl-diskless-sync-delay 0

 $replica config set dual-channel-replication-enabled yes
 $replica config set repl-diskless-sync no
@@ -201,6 +202,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 # a replication buffer block.
$primary config set client-output-buffer-limit "replica 1100k 0 0" $primary config set dual-channel-replication-enabled $enable + $primary config set repl-diskless-sync-delay 0 $replica config set dual-channel-replication-enabled $enable test "Toggle dual-channel-replication-enabled: $enable start" { @@ -506,6 +508,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set dual-channel-replication-enabled yes $primary config set repl-backlog-size $backlog_size $primary config set loglevel debug + $primary config set repl-diskless-sync-delay 0 if {$::valgrind} { $primary config set repl-timeout 100 } else { @@ -877,7 +880,6 @@ start_server {tags {"dual-channel-replication external:skip"}} { } } -foreach dualchannel {yes no} { start_server {tags {"dual-channel-replication external:skip"}} { set primary [srv 0 client] set primary_host [srv 0 host] @@ -893,20 +895,20 @@ start_server {tags {"dual-channel-replication external:skip"}} { # Generating RDB will cost 5s(10000 * 0.0005s) $primary debug populate 10000 primary 1 $primary config set rdb-key-save-delay 500 - $primary config set dual-channel-replication-enabled $dualchannel + $primary config set dual-channel-replication-enabled yes start_server {} { set replica1 [srv 0 client] - $replica1 config set dual-channel-replication-enabled $dualchannel + $replica1 config set dual-channel-replication-enabled yes $replica1 config set loglevel debug start_server {} { set replica2 [srv 0 client] - $replica2 config set dual-channel-replication-enabled $dualchannel + $replica2 config set dual-channel-replication-enabled yes $replica2 config set loglevel debug $replica2 config set repl-timeout 60 set load_handle [start_one_key_write_load $primary_host $primary_port 100 "mykey1"] - test "Sync should continue if not all slaves dropped dual-channel-replication $dualchannel" { + test "Sync should continue if not all slaves dropped" { $replica1 replicaof $primary_host $primary_port $replica2 replicaof $primary_host $primary_port @@ -915,20 +917,17 @@ start_server {tags {"dual-channel-replication external:skip"}} { } else { fail "Sync did not start" } - if {$dualchannel == "yes"} { - # Wait for both replicas main conns to establish psync - wait_for_condition 50 1000 { - [status $primary sync_partial_ok] == 2 - } else { - fail "Replicas main conns didn't establish psync [status $primary sync_partial_ok]" - } + # Wait for both replicas main conns to establish psync + wait_for_condition 50 1000 { + [status $primary sync_partial_ok] == 2 + } else { + fail "Replicas main conns didn't establish psync [status $primary sync_partial_ok]" } - catch {$replica1 shutdown nosave} wait_for_condition 50 2000 { [status $replica2 master_link_status] == "up" && [status $primary sync_full] == 2 && - (($dualchannel == "yes" && [status $primary sync_partial_ok] == 2) || $dualchannel == "no") + ([status $primary sync_partial_ok] == 2) } else { fail "Sync session interapted\n sync_full:[status $primary sync_full]\n @@ -942,7 +941,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary debug populate 1000000 primary 1 $primary config set rdb-key-save-delay 100 - test "Primary abort sync if all slaves dropped dual-channel-replication $dualchannel" { + test "Primary abort sync if all slaves dropped dual-channel-replication" { set cur_psync [status $primary sync_partial_ok] $replica2 replicaof $primary_host $primary_port @@ -951,13 +950,11 @@ start_server {tags {"dual-channel-replication external:skip"}} { } else { fail "Sync did not 
start" } - if {$dualchannel == "yes"} { - # Wait for both replicas main conns to establish psync - wait_for_condition 50 1000 { - [status $primary sync_partial_ok] == $cur_psync + 1 - } else { - fail "Replicas main conns didn't establish psync [status $primary sync_partial_ok]" - } + # Wait for both replicas main conns to establish psync + wait_for_condition 50 1000 { + [status $primary sync_partial_ok] == $cur_psync + 1 + } else { + fail "Replicas main conns didn't establish psync [status $primary sync_partial_ok]" } catch {$replica2 shutdown nosave} @@ -971,7 +968,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { } } } -} + start_server {tags {"dual-channel-replication external:skip"}} { set primary [srv 0 client] @@ -982,8 +979,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set repl-diskless-sync yes $primary config set dual-channel-replication-enabled yes $primary config set loglevel debug - $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry - + $primary config set repl-diskless-sync-delay 0 # Generating RDB will cost 500s(1000000 * 0.0001s) $primary debug populate 1000000 primary 1 $primary config set rdb-key-save-delay 100 @@ -1014,6 +1010,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"] assert {$replica_main_conn_id != ""} set loglines [count_log_lines -1] + $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry $primary client kill id $replica_main_conn_id # Wait for primary to abort the sync wait_for_condition 50 1000 { @@ -1034,6 +1031,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { } test "Test dual-channel-replication replica rdb connection disconnected" { + $primary config set repl-diskless-sync-delay 0 $replica replicaof $primary_host $primary_port # Wait for sync session to start wait_for_condition 500 1000 { @@ -1048,6 +1046,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary debug log "killing replica rdb connection $replica_rdb_channel_id" assert {$replica_rdb_channel_id != ""} set loglines [count_log_lines -1] + $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry $primary client kill id $replica_rdb_channel_id # Wait for primary to abort the sync wait_for_log_messages -1 {"*Background RDB transfer error*"} $loglines 1000 10 @@ -1063,6 +1062,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { } test "Test dual-channel-replication primary reject set-rdb-client after client killed" { + $primary config set repl-diskless-sync-delay 0 # Ensure replica main channel will not handshake before rdb client is killed $replica debug pause-after-fork 1 $replica replicaof $primary_host $primary_port @@ -1077,6 +1077,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { set replica_rdb_channel_id [get_client_id_by_last_cmd $primary "sync"] assert {$replica_rdb_channel_id != ""} $primary debug log "killing replica rdb connection $replica_rdb_channel_id" + $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry $primary client kill id $replica_rdb_channel_id # Wait for primary to abort the sync wait_and_resume_process 0 @@ -1154,7 +1155,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set repl-diskless-sync yes $primary config set dual-channel-replication-enabled yes $primary config set 
loglevel debug
- $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry
+ $primary config set repl-diskless-sync-delay 0

 # Generating RDB will cost 100 sec to generate
 $primary debug populate 10000 primary 1
@@ -1185,6 +1186,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 set replica_rdb_channel_id [get_client_id_by_last_cmd $primary "sync"]
 assert {$replica_rdb_channel_id != ""}
 set loglines [count_log_lines -1]
+ $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry
 $primary client kill id $replica_rdb_channel_id
 # Wait for primary to abort the sync
@@ -1192,6 +1194,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 } else {
 fail "Primary did not free repl buf block after sync failure"
 }
+ $primary config set repl-diskless-sync-delay 0
 wait_for_log_messages -1 {"*Background RDB transfer error*"} $loglines 1000 10
 # Replica should retry
 wait_for_condition 500 1000 {
@@ -1200,7 +1203,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 [s -1 rdb_bgsave_in_progress] eq 1
 } else {
 fail "replica didn't retry after connection close"
- }
+ }
 }
 $replica replicaof no one
 wait_for_condition 500 1000 {
@@ -1218,11 +1221,11 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 } else {
 fail "replica didn't start sync session in time"
 }
-
 $primary debug log "killing replica main connection"
 set replica_main_conn_id [get_client_id_by_last_cmd $primary "sync"]
 assert {$replica_main_conn_id != ""}
 set loglines [count_log_lines -1]
+ $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry
 $primary client kill id $replica_main_conn_id
 # Wait for primary to abort the sync
@@ -1230,6 +1233,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 } else {
 fail "Primary did not free repl buf block after sync failure"
 }
+ $primary config set repl-diskless-sync-delay 0
 wait_for_log_messages -1 {"*Background RDB transfer error*"} $loglines 1000 10
 # Replica should retry
 wait_for_condition 500 1000 {
diff --git a/tests/integration/replication-psync.tcl b/tests/integration/replication-psync.tcl
index 4c305ebff4..88a33045f0 100644
--- a/tests/integration/replication-psync.tcl
+++ b/tests/integration/replication-psync.tcl
@@ -115,6 +115,10 @@ tags {"external:skip"} {
 foreach mdl {no yes} {
 foreach sdl {disabled swapdb} {
 foreach dualchannel {yes no} {
+ # Skip dual channel test with master diskless disabled
+ if {$dualchannel == "yes" && $mdl == "no"} {
+ continue
+ }
 test_psync {no reconnection, just sync} 6 1000000 3600 0 {
 } $mdl $sdl $dualchannel 0

From da15cee7591e8fb678dbc5c8cbf3e28e2a2e2380 Mon Sep 17 00:00:00 2001
From: Amit Nagler <58042354+naglera@users.noreply.github.com>
Date: Tue, 24 Dec 2024 08:14:32 +0200
Subject: [PATCH 036/101] Add scoped RDB loading context and immediate abort
 flag (#1173)

This PR introduces a new mechanism for temporarily changing the
server's loading_rio context during RDB loading operations. The new
`rdbLoadRioWithLoadingCtxScopedRdb` wrapper allows for a scoped change
of the `server.loading_rio` value, ensuring that it's automatically
restored to its original value when the scope ends.

Introduces a dedicated flag to `rio` to signal immediate abort,
preventing potential use-after-free scenarios during replication
disconnection in dual-channel load.
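As a rough illustration of both mechanisms (a minimal standalone sketch
under assumed names -- `loading_rio` stands in for `server.loading_rio`,
and `rio_close_asap`, `load_payload`, and `load_with_scoped_rio` are
hypothetical stand-ins, not the server code itself):

```
#include <stdio.h>

/* Simplified stand-in for the real rio type; illustration only. */
typedef struct rio { int close_asap; } rio;

static rio *loading_rio = NULL; /* stands in for server.loading_rio */

/* Analogous to rioCloseASAP(): flag the stream so in-flight reads stop. */
static void rio_close_asap(rio *r) { r->close_asap = 1; }

/* Stand-in for the load loop: checks the abort flag before each step. */
static int load_payload(rio *r) {
    return r->close_asap ? -1 : 0;
}

/* Scoped wrapper mirroring rdbLoadRioWithLoadingCtxScopedRdb():
 * publish the rio for the duration of the load, then restore the
 * previous one. Restoring the previous pointer, rather than resetting
 * to NULL, keeps the value correct even if loads ever nest. */
static int load_with_scoped_rio(rio *r) {
    rio *prev = loading_rio;
    loading_rio = r;
    int retval = load_payload(r);
    loading_rio = prev;
    return retval;
}

int main(void) {
    rio r = {0};
    printf("clean load: %d\n", load_with_scoped_rio(&r)); /* 0 */
    rio_close_asap(&r);
    printf("aborted: %d\n", load_with_scoped_rio(&r)); /* -1 */
    return 0;
}
```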
This ensures proper termination of `rdbLoadRioWithLoadingCtx` when replication is cancelled due to connection loss on main connection. Fixes https://github.com/valkey-io/valkey/issues/1152 --------- Signed-off-by: naglera Signed-off-by: Madelyn Olson Signed-off-by: Amit Nagler <58042354+naglera@users.noreply.github.com> Co-authored-by: Madelyn Olson Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com> --- src/rdb.c | 15 ++++- src/rdb.h | 2 +- src/replication.c | 15 ++--- src/rio.h | 16 ++++- src/server.c | 1 + src/server.h | 1 + .../integration/dual-channel-replication.tcl | 62 ++++++++++++++++++- 7 files changed, 95 insertions(+), 17 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index 5fb77a2897..a4eb2823fb 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -64,6 +64,7 @@ char *rdbFileBeingLoaded = NULL; /* used for rdb checking on read error */ extern int rdbCheckMode; void rdbCheckError(const char *fmt, ...); void rdbCheckSetError(const char *fmt, ...); +int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx); #ifdef __GNUC__ void rdbReportError(int corruption_error, int linenum, char *reason, ...) __attribute__((format(printf, 3, 4))); @@ -2991,7 +2992,19 @@ int rdbFunctionLoad(rio *rdb, int ver, functionsLibCtx *lib_ctx, int rdbflags, s int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) { functionsLibCtx *functions_lib_ctx = functionsLibCtxGetCurrent(); rdbLoadingCtx loading_ctx = {.dbarray = server.db, .functions_lib_ctx = functions_lib_ctx}; - int retval = rdbLoadRioWithLoadingCtx(rdb, rdbflags, rsi, &loading_ctx); + int retval = rdbLoadRioWithLoadingCtxScopedRdb(rdb, rdbflags, rsi, &loading_ctx); + return retval; +} + +/* Wrapper for rdbLoadRioWithLoadingCtx that manages a scoped RDB context. + * This method wraps the rdbLoadRioWithLoadingCtx function, providing temporary + * RDB context management. It sets a new current loading RDB, calls the wrapped + * function, and then restores the previous loading RDB context. 
*/ +int rdbLoadRioWithLoadingCtxScopedRdb(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx) { + rio *prev_rio = server.loading_rio; + server.loading_rio = rdb; + int retval = rdbLoadRioWithLoadingCtx(rdb, rdbflags, rsi, rdb_loading_ctx); + server.loading_rio = prev_rio; return retval; } diff --git a/src/rdb.h b/src/rdb.h index e9d53fa398..7342a926b5 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -172,7 +172,7 @@ int rdbLoadBinaryDoubleValue(rio *rdb, double *val); int rdbSaveBinaryFloatValue(rio *rdb, float val); int rdbLoadBinaryFloatValue(rio *rdb, float *val); int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi); -int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx); +int rdbLoadRioWithLoadingCtxScopedRdb(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx); int rdbFunctionLoad(rio *rdb, int ver, functionsLibCtx *lib_ctx, int rdbflags, sds *err); int rdbSaveRio(int req, rio *rdb, int *error, int rdbflags, rdbSaveInfo *rsi); ssize_t rdbSaveFunctions(rio *rdb); diff --git a/src/replication.c b/src/replication.c index 3a207a1d0f..f907771e71 100644 --- a/src/replication.c +++ b/src/replication.c @@ -2254,7 +2254,7 @@ void readSyncBulkPayload(connection *conn) { int loadingFailed = 0; rdbLoadingCtx loadingCtx = {.dbarray = dbarray, .functions_lib_ctx = functions_lib_ctx}; - if (rdbLoadRioWithLoadingCtx(&rdb, RDBFLAGS_REPLICATION, &rsi, &loadingCtx) != C_OK) { + if (rdbLoadRioWithLoadingCtxScopedRdb(&rdb, RDBFLAGS_REPLICATION, &rsi, &loadingCtx) != C_OK) { /* RDB loading failed. */ serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization DB " "from socket, check server logs."); @@ -2831,18 +2831,15 @@ typedef struct replDataBufBlock { * Reads replication data from primary into specified repl buffer block */ int readIntoReplDataBlock(connection *conn, replDataBufBlock *data_block, size_t read) { int nread = connRead(conn, data_block->buf + data_block->used, read); - if (nread == -1) { - if (connGetState(conn) != CONN_STATE_CONNECTED) { - dualChannelServerLog(LL_NOTICE, "Error reading from primary: %s", connGetLastError(conn)); + if (nread <= 0) { + if (nread == 0 || connGetState(conn) != CONN_STATE_CONNECTED) { + dualChannelServerLog(LL_WARNING, "Provisional primary closed connection"); + /* Signal ongoing RDB load to terminate gracefully */ + if (server.loading_rio) rioCloseASAP(server.loading_rio); cancelReplicationHandshake(1); } return C_ERR; } - if (nread == 0) { - dualChannelServerLog(LL_VERBOSE, "Provisional primary closed connection"); - cancelReplicationHandshake(1); - return C_ERR; - } data_block->used += nread; server.stat_total_reads_processed++; return read - nread; diff --git a/src/rio.h b/src/rio.h index ee0f27aa7e..d5c3263e79 100644 --- a/src/rio.h +++ b/src/rio.h @@ -39,6 +39,7 @@ #define RIO_FLAG_READ_ERROR (1 << 0) #define RIO_FLAG_WRITE_ERROR (1 << 1) +#define RIO_FLAG_CLOSE_ASAP (1 << 2) /* Rio was closed asynchronously during the current rio operation. */ #define RIO_TYPE_FILE (1 << 0) #define RIO_TYPE_BUFFER (1 << 1) @@ -115,7 +116,7 @@ typedef struct _rio rio; * if needed. */ static inline size_t rioWrite(rio *r, const void *buf, size_t len) { - if (r->flags & RIO_FLAG_WRITE_ERROR) return 0; + if (r->flags & RIO_FLAG_WRITE_ERROR || r->flags & RIO_FLAG_CLOSE_ASAP) return 0; while (len) { size_t bytes_to_write = (r->max_processing_chunk && r->max_processing_chunk < len) ? 
r->max_processing_chunk : len; @@ -132,7 +133,7 @@ static inline size_t rioWrite(rio *r, const void *buf, size_t len) { } static inline size_t rioRead(rio *r, void *buf, size_t len) { - if (r->flags & RIO_FLAG_READ_ERROR) return 0; + if (r->flags & RIO_FLAG_READ_ERROR || r->flags & RIO_FLAG_CLOSE_ASAP) return 0; while (len) { size_t bytes_to_read = (r->max_processing_chunk && r->max_processing_chunk < len) ? r->max_processing_chunk : len; @@ -156,6 +157,10 @@ static inline int rioFlush(rio *r) { return r->flush(r); } +static inline void rioCloseASAP(rio *r) { + r->flags |= RIO_FLAG_CLOSE_ASAP; +} + /* This function allows to know if there was a read error in any past * operation, since the rio stream was created or since the last call * to rioClearError(). */ @@ -168,8 +173,13 @@ static inline int rioGetWriteError(rio *r) { return (r->flags & RIO_FLAG_WRITE_ERROR) != 0; } +/* Like rioGetReadError() but for async close errors. */ +static inline int rioGetAsyncCloseError(rio *r) { + return (r->flags & RIO_FLAG_CLOSE_ASAP) != 0; +} + static inline void rioClearErrors(rio *r) { - r->flags &= ~(RIO_FLAG_READ_ERROR | RIO_FLAG_WRITE_ERROR); + r->flags &= ~(RIO_FLAG_READ_ERROR | RIO_FLAG_WRITE_ERROR | RIO_FLAG_CLOSE_ASAP); } void rioInitWithFile(rio *r, FILE *fp); diff --git a/src/server.c b/src/server.c index a0c642b541..b997a9aec6 100644 --- a/src/server.c +++ b/src/server.c @@ -2218,6 +2218,7 @@ void initServerConfig(void) { server.fsynced_reploff_pending = 0; server.rdb_client_id = -1; server.loading_process_events_interval_ms = LOADING_PROCESS_EVENTS_INTERVAL_DEFAULT; + server.loading_rio = NULL; /* Replication partial resync backlog */ server.repl_backlog = NULL; diff --git a/src/server.h b/src/server.h index d8497ccff5..61fa2c3c5a 100644 --- a/src/server.h +++ b/src/server.h @@ -2089,6 +2089,7 @@ struct valkeyServer { int dbid; } repl_provisional_primary; client *cached_primary; /* Cached primary to be reused for PSYNC. */ + rio *loading_rio; /* Pointer to the rio object currently used for loading data. 
*/ int repl_syncio_timeout; /* Timeout for synchronous I/O calls */ int repl_state; /* Replication status if the instance is a replica */ int repl_rdb_channel_state; /* State of the replica's rdb channel during dual-channel-replication */ diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index b4b9286d68..3adf9ce9fd 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -1158,8 +1158,8 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set repl-diskless-sync-delay 0 # Generating RDB will cost 100 sec to generate - $primary debug populate 10000 primary 1 - $primary config set rdb-key-save-delay 10000 + $primary debug populate 100000 primary 1 + $primary config set rdb-key-save-delay 1000 start_server {} { set replica [srv 0 client] @@ -1222,7 +1222,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { fail "replica didn't start sync session in time" } $primary debug log "killing replica main connection" - set replica_main_conn_id [get_client_id_by_last_cmd $primary "sync"] + set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"] assert {$replica_main_conn_id != ""} set loglines [count_log_lines -1] $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry @@ -1247,3 +1247,59 @@ start_server {tags {"dual-channel-replication external:skip"}} { stop_write_load $load_handle } } + + +start_server {tags {"dual-channel-replication external:skip"}} { + set primary [srv 0 client] + set primary_host [srv 0 host] + set primary_port [srv 0 port] + + $primary config set repl-diskless-sync yes + $primary config set dual-channel-replication-enabled yes + $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry + + # Generating RDB will take 100 sec to generate + $primary debug populate 1000000 primary 1 + $primary config set rdb-key-save-delay -10 + + start_server {} { + set replica [srv 0 client] + set replica_host [srv 0 host] + set replica_port [srv 0 port] + set replica_log [srv 0 stdout] + + $replica config set dual-channel-replication-enabled yes + $replica config set loglevel debug + $replica config set repl-timeout 10 + $replica config set repl-diskless-load flush-before-load + + test "Replica notice main-connection killed during rdb load callback" {; # https://github.com/valkey-io/valkey/issues/1152 + set loglines [count_log_lines 0] + $replica replicaof $primary_host $primary_port + # Wait for sync session to start + wait_for_condition 500 1000 { + [string match "*slave*,state=wait_bgsave*,type=rdb-channel*" [$primary info replication]] && + [string match "*slave*,state=bg_transfer*,type=main-channel*" [$primary info replication]] && + [s -1 rdb_bgsave_in_progress] eq 1 + } else { + fail "replica didn't start sync session in time" + } + wait_for_log_messages 0 {"*Loading RDB produced by Valkey version*"} $loglines 1000 10 + $primary set key val + set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"] + $primary debug log "killing replica main connection $replica_main_conn_id" + assert {$replica_main_conn_id != ""} + set loglines [count_log_lines 0] + $primary config set rdb-key-save-delay 0; # disable delay to allow next sync to succeed + $primary client kill id $replica_main_conn_id + # Wait for primary to abort the sync + wait_for_condition 50 1000 { + [string match {*replicas_waiting_psync:0*} [$primary info replication]] + } else { + fail 
"Primary did not free repl buf block after sync failure" + } + wait_for_log_messages 0 {"*Failed trying to load the PRIMARY synchronization DB from socket*"} $loglines 1000 10 + verify_replica_online $primary 0 500 + } + } +} From ff394270af692806c30b70c514f8a7a3851ccdc6 Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 25 Dec 2024 10:57:42 +0800 Subject: [PATCH 037/101] Document all command flags near serverCommand (#1474) These flags are not documented here. Signed-off-by: Binbin --- src/server.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/server.h b/src/server.h index 61fa2c3c5a..424569f76f 100644 --- a/src/server.h +++ b/src/server.h @@ -248,6 +248,8 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define CMD_ALLOW_BUSY ((1ULL << 26)) #define CMD_MODULE_GETCHANNELS (1ULL << 27) /* Use the modules getchannels interface. */ #define CMD_TOUCHES_ARBITRARY_KEYS (1ULL << 28) +/* Command flags. Please don't forget to add command flag documentation in struct + * serverCommand in this file. */ /* Command flags that describe ACLs categories. */ #define ACL_CATEGORY_KEYSPACE (1ULL << 0) @@ -2472,6 +2474,8 @@ typedef int serverGetKeysProc(struct serverCommand *cmd, robj **argv, int argc, * CMD_DENYOOM: May increase memory usage once called. Don't allow if out * of memory. * + * CMD_MODULE: Command exported by module. + * * CMD_ADMIN: Administrative command, like SAVE or SHUTDOWN. * * CMD_PUBSUB: Pub/Sub related command. @@ -2518,11 +2522,22 @@ typedef int serverGetKeysProc(struct serverCommand *cmd, robj **argv, int argc, * * CMD_NO_MANDATORY_KEYS: This key arguments for this command are optional. * + * CMD_PROTECTED: The command is a protected command, see enable-debug-command for more details. + * + * CMD_MODULE_GETKEYS: Use the modules getkeys interface. + * + * CMD_MODULE_NO_CLUSTER: Deny on cluster. + * * CMD_NO_MULTI: The command is not allowed inside a transaction * + * CMD_MOVABLE_KEYS: The legacy range spec doesn't cover all keys. Populated by + * populateCommandLegacyRangeSpec. + * * CMD_ALLOW_BUSY: The command can run while another command is running for * a long time (timedout script, module command that yields) * + * CMD_MODULE_GETCHANNELS: Use the modules getchannels interface. + * * CMD_TOUCHES_ARBITRARY_KEYS: The command may touch (and cause lazy-expire) * arbitrary key (i.e not provided in argv) * From 3d5acdd41d322abd99babd6abd6ca9dc3ce3d5c2 Mon Sep 17 00:00:00 2001 From: uriyage <78144248+uriyage@users.noreply.github.com> Date: Wed, 25 Dec 2024 04:58:49 +0200 Subject: [PATCH 038/101] Fix restore replica output bytes stat update (#1486) This PR fixes the missing stat update for `total_net_repl_output_bytes` that was removed during the refactoring in PR #758. The metric was not being updated when writing to replica connections. Changes: - Restored the stat update in postWriteToClient for replica connections - Added integration test to verify the metric is properly updated Signed-off-by: Uri Yagelnik Co-authored-by: Binbin --- src/networking.c | 2 ++ tests/integration/replication.tcl | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/networking.c b/src/networking.c index 9f36f24275..d93046a603 100644 --- a/src/networking.c +++ b/src/networking.c @@ -2231,6 +2231,8 @@ int postWriteToClient(client *c) { server.stat_total_writes_processed++; if (getClientType(c) != CLIENT_TYPE_REPLICA) { _postWriteToClient(c); + } else { + server.stat_net_repl_output_bytes += c->nwritten > 0 ? 
c->nwritten : 0; } if (c->write_flags & WRITE_FLAGS_WRITE_ERROR) { diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index 1b5b0c030a..6d3c4e934f 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -194,6 +194,30 @@ start_server {tags {"repl external:skip"}} { } assert_match {*calls=1,*,rejected_calls=0,failed_calls=1*} [cmdrstat blpop $B] } + + test {Replica output bytes metric} { + # reset stats + $A config resetstat + + set info [$A info stats] + set replica_bytes_output [getInfoProperty $info "total_net_repl_output_bytes"] + assert_equal $replica_bytes_output 0 + + # sent set command to primary + $A set key value + + # wait for command propagation + wait_for_condition 50 100 { + [$B get key] eq {value} + } else { + fail "Replica did not receive the command" + } + + # get the new stats + set info [$A info stats] + set replica_bytes_output [getInfoProperty $info "total_net_repl_output_bytes"] + assert_morethan $replica_bytes_output 0 + } } } From 2f07b663bc6075f685f9140f0a7b58759b90c0c4 Mon Sep 17 00:00:00 2001 From: gmbnomis Date: Fri, 27 Dec 2024 00:55:20 +0100 Subject: [PATCH 039/101] Fix JSON description of SET command (#1473) In the `arguments` section, the `arguments` key is only used for arguments of type `block` or `oneof`. Consequently, the `arguments` given for `IFEQ` are ignored by the server. However, they lead to strange results when rendering the command's page for the web documentation. Fix this by removing `arguments` for `IFEQ`. Signed-off-by: Simon Baatz --- src/commands/set.json | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/commands/set.json b/src/commands/set.json index 3d3800f11d..601bd676a2 100644 --- a/src/commands/set.json +++ b/src/commands/set.json @@ -111,14 +111,7 @@ "type": "string", "token": "IFEQ", "since": "8.1.0", - "summary": "Sets the key's value only if the current value matches the specified comparison value.", - "arguments": [ - { - "name": "comparison-value", - "type": "string", - "summary": "The value to compare with the current key's value before setting." - } - ] + "summary": "Sets the key's value only if the current value matches the specified comparison value." } ] }, From 810a437da066a3479f17084e9a3f4b8910b3b7ed Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Sun, 29 Dec 2024 08:22:49 -0800 Subject: [PATCH 040/101] Immediately restart the defrag cycle if we still need to defrag (#1492) --- src/defrag.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/defrag.c b/src/defrag.c index e9f40d4fab..a5d6c69c1c 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -1112,6 +1112,9 @@ static void endDefragCycle(bool normal_termination) { server.stat_total_active_defrag_time += elapsedUs(server.stat_last_active_defrag_time); server.stat_last_active_defrag_time = 0; server.active_defrag_cpu_percent = 0; + + /* Immediately check to see if we should start another defrag cycle. */ + monitorActiveDefrag(); } From 4f59458502100e8f971242222d841a97ad26b312 Mon Sep 17 00:00:00 2001 From: Pierre <105686771+pieturin@users.noreply.github.com> Date: Mon, 30 Dec 2024 12:56:39 -0800 Subject: [PATCH 041/101] Only (re-)send MEET packet once every handshake timeout period (#1441) Add `meet_sent` field in `clusterNode` indicating the last time we sent a MEET packet. Use this field to only (re-)send a MEET packet once every handshake timeout period when detecting a node without an inbound link. 
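Schematically, the new re-send rule amounts to a check like the
following (a condensed sketch, not the literal server code;
`should_resend_meet` is a hypothetical name, the field names follow
the patch, and `handshake_timeout` corresponds to
max(cluster-node-timeout, 1000) milliseconds as in getHandshakeTimeout()):

```
typedef long long mstime_t;

/* Assumes the caller already verified the node is in a normal state
 * with an outbound link but no inbound link. */
static int should_resend_meet(mstime_t now,
                              mstime_t inbound_link_freed_time,
                              mstime_t meet_sent,
                              mstime_t handshake_timeout) {
    /* MEET only a node whose inbound link has been missing for a full
     * handshake timeout, and at most once per timeout period, leaving
     * the other side time to complete the handshake. */
    return now - inbound_link_freed_time > handshake_timeout &&
           now - meet_sent > handshake_timeout;
}
```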
When receiving multiple MEET packets on the same link while the node
is in handshake state, instead of dropping the packet, we now simply
prevent the creation of a new node. This way we still process the MEET
packet's gossip and reply with a PONG, as with any other packet.

Improve some logging messages to include `human_nodename`. Add
`nodeExceedsHandshakeTimeout()` function.

This is a follow-up to this previous PR:
https://github.com/valkey-io/valkey/pull/1307

It is also a partial fix for the crash described in:
https://github.com/valkey-io/valkey/pull/1436

---------

Signed-off-by: Pierre Turin
---
 src/cluster_legacy.c | 139 ++++++++++---------
 src/cluster_legacy.h | 1 +
 tests/unit/cluster/cluster-reliable-meet.tcl | 7 +-
 3 files changed, 79 insertions(+), 68 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 3d838dfe06..80889a79d8 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -121,6 +121,7 @@ void freeClusterLink(clusterLink *link);
 int verifyClusterNodeId(const char *name, int length);
 sds clusterEncodeOpenSlotsAuxField(int rdbflags);
 int clusterDecodeOpenSlotsAuxField(int rdbflags, sds s);
+static int nodeExceedsHandshakeTimeout(clusterNode *node, mstime_t now);

 /* Only primaries that own slots have voting rights.
 * Returns 1 if the node has voting rights, otherwise returns 0. */
@@ -1346,9 +1347,10 @@ clusterLink *createClusterLink(clusterNode *node) {
 * with this link will have the 'link' field set to NULL. */
 void freeClusterLink(clusterLink *link) {
 serverAssert(link != NULL);
- serverLog(LL_DEBUG, "Freeing cluster link for node: %.40s:%s",
+ serverLog(LL_DEBUG, "Freeing cluster link for node: %.40s:%s (%s)",
 link->node ? link->node->name : "",
- link->inbound ? "inbound" : "outbound");
+ link->inbound ? "inbound" : "outbound",
+ link->node ? link->node->human_nodename : "");

 if (link->conn) {
 connClose(link->conn);
@@ -1502,6 +1504,7 @@ clusterNode *createClusterNode(char *nodename, int flags) {
 node->last_in_ping_gossip = 0;
 node->ping_sent = node->pong_received = 0;
 node->data_received = 0;
+ node->meet_sent = 0;
 node->fail_time = 0;
 node->link = NULL;
 node->inbound_link = NULL;
@@ -1723,7 +1726,7 @@ void clusterAddNode(clusterNode *node) {
 */
 void clusterDelNode(clusterNode *delnode) {
 serverAssert(delnode != NULL);
- serverLog(LL_DEBUG, "Deleting node %.40s from cluster view", delnode->name);
+ serverLog(LL_DEBUG, "Deleting node %.40s (%s) from cluster view", delnode->name, delnode->human_nodename);

 int j;
 dictIterator *di;
@@ -3143,27 +3146,6 @@ int clusterProcessPacket(clusterLink *link) {
 return 1;
 }

- if (type == CLUSTERMSG_TYPE_MEET && link->node && nodeInHandshake(link->node)) {
- /* If the link is bound to a node and the node is in the handshake state, and we receive
- * a MEET packet, it may be that the sender sent multiple MEET packets so in here we are
We improved the re-send logic of MEET in - * #1441, now we will only re-send MEET packet once every handshake timeout period. - * - * Note that in getNodeFromLinkAndMsg, the node in the handshake state has a random name - * and not truly "known", so we don't know the sender. Dropping the MEET packet can prevent - * us from creating a random node, avoid incorrect link binding, and avoid duplicate MEET - * packet eliminate the handshake state. */ - serverLog(LL_NOTICE, "Dropping MEET packet from node %.40s because the node is already in handshake state", - link->node->name); - return 1; - } - uint16_t flags = ntohs(hdr->flags); uint64_t sender_claimed_current_epoch = 0, sender_claimed_config_epoch = 0; clusterNode *sender = getNodeFromLinkAndMsg(link, hdr); @@ -3261,42 +3243,59 @@ int clusterProcessPacket(clusterLink *link) { if (type == CLUSTERMSG_TYPE_MEET) { if (!sender) { - /* Add this node if it is new for us and the msg type is MEET. - * In this stage we don't try to add the node with the right - * flags, replicaof pointer, and so forth, as this details will be - * resolved when we'll receive PONGs from the node. The exception - * to this is the flag that indicates extensions are supported, as - * we want to send extensions right away in the return PONG in order - * to reduce the amount of time needed to stabilize the shard ID. */ - clusterNode *node; - - node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); - serverAssert(nodeIp2String(node->ip, link, hdr->myip) == C_OK); - getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); - node->cport = ntohs(hdr->cport); - if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { - node->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED; + if (!link->node) { + /* Add this node if it is new for us and the msg type is MEET. + * In this stage we don't try to add the node with the right + * flags, replicaof pointer, and so forth, as this details will be + * resolved when we'll receive PONGs from the node. The exception + * to this is the flag that indicates extensions are supported, as + * we want to send extensions right away in the return PONG in order + * to reduce the amount of time needed to stabilize the shard ID. */ + clusterNode *node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); + if (nodeIp2String(node->ip, link, hdr->myip) != C_OK) { + /* We cannot get the IP info from the link, it probably means the connection is closed. */ + serverLog(LL_NOTICE, "Closing link even though we received a MEET packet on it, " + "because the connection has an error"); + freeClusterLink(link); + freeClusterNode(node); + return 0; + } + getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); + node->cport = ntohs(hdr->cport); + if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { + node->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED; + } + setClusterNodeToInboundClusterLink(node, link); + clusterAddNode(node); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + } else { + /* A second MEET packet was received on an existing link during the handshake process. + * This happens when the other node detects no inbound link, and re-sends a MEET packet + * before this node can respond with a PING. This MEET is a no-op. + * + * Note: Nodes in HANDSHAKE state are not fully "known" (random names), so the sender + * remains unidentified at this point. The MEET packet might be re-sent if the inbound + * connection is still unestablished by the next cron cycle. 
+ */ + debugServerAssert(link->inbound && nodeInHandshake(link->node)); } - setClusterNodeToInboundClusterLink(node, link); - clusterAddNode(node); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); /* If this is a MEET packet from an unknown node, we still process * the gossip section here since we have to trust the sender because * of the message type. */ clusterProcessGossipSection(hdr, link); - } else if (sender->link && now - sender->ctime > server.cluster_node_timeout) { + } else if (sender->link && nodeExceedsHandshakeTimeout(sender, now)) { /* The MEET packet is from a known node, after the handshake timeout, so the sender thinks that I do not * know it. - * Freeing my outbound link to that node, to force a reconnect and sending a PING. + * Free my outbound link to that node, triggering a reconnect and a PING over the new link. * Once that node receives our PING, it should recognize the new connection as an inbound link from me. * We should only free the outbound link if the node is known for more time than the handshake timeout, * since during this time, the other side might still be trying to complete the handshake. */ /* We should always receive a MEET packet on an inbound link. */ serverAssert(link != sender->link); - serverLog(LL_NOTICE, "Freeing outbound link to node %.40s after receiving a MEET packet from this known node", - sender->name); + serverLog(LL_NOTICE, "Freeing outbound link to node %.40s (%s) after receiving a MEET packet from this known node", + sender->name, sender->human_nodename); freeClusterLink(sender->link); } } @@ -4062,7 +4061,12 @@ void clusterSendPing(clusterLink *link, int type) { clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, estlen); clusterMsg *hdr = getMessageFromSendBlock(msgblock); - if (!link->inbound && type == CLUSTERMSG_TYPE_PING) link->node->ping_sent = mstime(); + if (!link->inbound) { + if (type == CLUSTERMSG_TYPE_PING) + link->node->ping_sent = mstime(); + else if (type == CLUSTERMSG_TYPE_MEET) + link->node->meet_sent = mstime(); + } /* Populate the gossip fields */ int maxiterations = wanted * 3; @@ -4981,10 +4985,22 @@ void clusterHandleManualFailover(void) { * CLUSTER cron job * -------------------------------------------------------------------------- */ +static mstime_t getHandshakeTimeout(void) { + /* The handshake timeout is the time after which a handshake node that was + * not turned into a normal node is removed from the nodes. Usually it is + * just the cluster_node_timeout value, but when cluster_node_timeout is + * too small we use the value of 1 second. */ + return max(server.cluster_node_timeout, 1000); +} + +static int nodeExceedsHandshakeTimeout(clusterNode *node, mstime_t now) { + return now - node->ctime > getHandshakeTimeout() ? 1 : 0; +} + /* Check if the node is disconnected and re-establish the connection. * Also update a few stats while we are here, that can be used to make * better decisions in other part of the code. */ -static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_timeout, mstime_t now) { +static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t now) { /* Not interested in reconnecting the link with myself or nodes * for which we have no address. */ if (node->flags & (CLUSTER_NODE_MYSELF | CLUSTER_NODE_NOADDR)) return 1; @@ -4993,19 +5009,22 @@ static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_ /* A Node in HANDSHAKE state has a limited lifespan equal to the * configured node timeout. 
*/ - if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) { - serverLog(LL_WARNING, "Clusterbus handshake timeout %s:%d after %lldms", node->ip, - node->cport, handshake_timeout); + if (nodeInHandshake(node) && nodeExceedsHandshakeTimeout(node, now)) { + serverLog(LL_WARNING, "Clusterbus handshake timeout %s:%d", node->ip, node->cport); clusterDelNode(node); return 1; } - if (node->link != NULL && node->inbound_link == NULL && nodeInNormalState(node) && - now - node->inbound_link_freed_time > handshake_timeout) { + if (nodeInNormalState(node) && node->link != NULL && node->inbound_link == NULL && + now - node->inbound_link_freed_time > getHandshakeTimeout() && + now - node->meet_sent > getHandshakeTimeout()) { /* Node has an outbound link, but no inbound link for more than the handshake timeout. * This probably means this node does not know us yet, whereas we know it. - * So we send it a MEET packet to do a handshake with it and correct the inconsistent cluster view. */ + * So we send it a MEET packet to do a handshake with it and correct the inconsistent cluster view. + * We make sure to not re-send a MEET packet more than once every handshake timeout period, so as to + * leave the other node time to complete the handshake. */ node->flags |= CLUSTER_NODE_MEET; - serverLog(LL_NOTICE, "Sending MEET packet to node %.40s because there is no inbound link for it", node->name); + serverLog(LL_NOTICE, "Sending MEET packet to node %.40s (%s) because there is no inbound link for it", + node->name, node->human_nodename); clusterSendPing(node->link, CLUSTERMSG_TYPE_MEET); } @@ -5066,19 +5085,11 @@ void clusterCron(void) { mstime_t min_pong = 0, now = mstime(); clusterNode *min_pong_node = NULL; static unsigned long long iteration = 0; - mstime_t handshake_timeout; iteration++; /* Number of times this function was called so far. */ clusterUpdateMyselfHostname(); - /* The handshake timeout is the time after which a handshake node that was - * not turned into a normal node is removed from the nodes. Usually it is - * just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small we use - * the value of 1 second. */ - handshake_timeout = server.cluster_node_timeout; - if (handshake_timeout < 1000) handshake_timeout = 1000; - /* Clear so clusterNodeCronHandleReconnect can count the number of nodes in PFAIL. */ server.cluster->stats_pfail_nodes = 0; /* Run through some of the operations we want to do on each cluster node. */ @@ -5091,7 +5102,7 @@ void clusterCron(void) { /* The protocol is that function(s) below return non-zero if the node was * terminated. 
*/ - if (clusterNodeCronHandleReconnect(node, handshake_timeout, now)) continue; + if (clusterNodeCronHandleReconnect(node, now)) continue; } dictReleaseIterator(di); diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index d3e1c3459e..ac14bd583c 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -340,6 +340,7 @@ struct _clusterNode { mstime_t ping_sent; /* Unix time we sent latest ping */ mstime_t pong_received; /* Unix time we received the pong */ mstime_t data_received; /* Unix time we received any data */ + mstime_t meet_sent; /* Unix time we sent latest meet packet */ mstime_t fail_time; /* Unix time when FAIL flag was set */ mstime_t repl_offset_time; /* Unix time we received offset for this node */ mstime_t orphaned_time; /* Starting time of orphaned primary condition */ diff --git a/tests/unit/cluster/cluster-reliable-meet.tcl b/tests/unit/cluster/cluster-reliable-meet.tcl index f189e96d5b..e32bbdab11 100644 --- a/tests/unit/cluster/cluster-reliable-meet.tcl +++ b/tests/unit/cluster/cluster-reliable-meet.tcl @@ -70,7 +70,7 @@ tags {tls:skip external:skip cluster} { [CI 0 cluster_stats_messages_meet_received] >= 4 && [CI 1 cluster_stats_messages_meet_sent] == [CI 0 cluster_stats_messages_meet_received] } else { - fail "1 cluster_state:[CI 1 cluster_state], 0 cluster_state: [CI 0 cluster_state]" + fail "Unexpected cluster state: node 1 cluster_state:[CI 1 cluster_state], node 0 cluster_state: [CI 0 cluster_state]" } } } ;# stop servers @@ -178,14 +178,13 @@ start_cluster 2 0 {tags {external:skip cluster} overrides {cluster-node-timeout # Wait for Node 0's handshake to timeout wait_for_condition 50 100 { - [cluster_get_first_node_in_handshake 1] eq {} + [cluster_get_first_node_in_handshake 0] eq {} } else { fail "Node 0 never exited handshake state" } - # At this point Node 0 knows Node 1 & 2 through the gossip, but they don't know Node 0. + # At this point Node 0 knows Node 2 through the gossip, but Node 1 & 2 don't know Node 0. wait_for_condition 50 100 { - [cluster_get_node_by_id 0 $node1_id] != {} && [cluster_get_node_by_id 0 $node2_id] != {} && [cluster_get_node_by_id 1 $node0_id] eq {} && [cluster_get_node_by_id 2 $node0_id] eq {} From 68f50585129b69153f2cdceee955791373a237ab Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Tue, 31 Dec 2024 04:58:06 +0800 Subject: [PATCH 042/101] Make global configs as static (#1159) Don't expose the static configs symbol, and make configEnumGetValue a static function. Signed-off-by: zhenwei pi --- src/config.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/config.c b/src/config.c index f08b79ebbd..59cf0d9400 100644 --- a/src/config.c +++ b/src/config.c @@ -283,7 +283,7 @@ struct standardConfig { void *privdata; /* privdata for this config, for module configs this is a ModuleConfig struct */ }; -dict *configs = NULL; /* Runtime config values */ +static dict *configs = NULL; /* Runtime config values */ /* Lookup a config by the provided sds string name, or return NULL * if the config does not exist */ @@ -297,7 +297,7 @@ static standardConfig *lookupConfig(sds name) { *----------------------------------------------------------------------------*/ /* Get enum value from name. If there is no match INT_MIN is returned.
*/ -int configEnumGetValue(configEnum *ce, sds *argv, int argc, int bitflags) { +static int configEnumGetValue(configEnum *ce, sds *argv, int argc, int bitflags) { if (argc == 0 || (!bitflags && argc != 1)) return INT_MIN; int values = 0; for (int i = 0; i < argc; i++) { From 399fb0881dd7f90293431b3a102825372256e291 Mon Sep 17 00:00:00 2001 From: ranshid <88133677+ranshid@users.noreply.github.com> Date: Wed, 1 Jan 2025 16:33:09 +0200 Subject: [PATCH 043/101] Align rejected unblocked commands to update the correct error statistic (#577) Currently, in case a blocked command is unblocked externally (e.g. due to the relevant slot being migrated or the CLIENT UNBLOCK command being issued), the command statistics will always update the failed_calls error statistic. This leads to misalignment with https://github.com/valkey-io/valkey/commit/90b9f08e5d1657e7bfffe43f31f6663bf469ee75 as well as some inconsistencies. For example, when a key is migrated during cluster slot migration, clients blocked on XREADGROUP will be unblocked and update the rejected_calls stat, while clients blocked on BLPOP will be unblocked and update the failed_calls stat. In this PR we add an explicit indication in updateStatsOnUnblock of whether the command was rejected or failed. --------- Signed-off-by: ranshid Signed-off-by: Ran Shidlansik --- src/blocked.c | 23 ++++++++++++++++++----- src/module.c | 5 +++-- src/server.h | 2 +- tests/integration/replication.tcl | 2 +- tests/unit/info.tcl | 2 +- 5 files changed, 24 insertions(+), 10 deletions(-) diff --git a/src/blocked.c b/src/blocked.c index aeec560b3f..d356ea5c07 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -105,15 +105,27 @@ void blockClient(client *c, int btype) { * he will attempt to reprocess the command which will update the statistics. * However in case the client was timed out or in case of module blocked client is being unblocked * the command will not be reprocessed and we need to make stats update. - * This function will make updates to the commandstats, slot-stats, slowlog and monitors.*/ -void updateStatsOnUnblock(client *c, long blocked_us, long reply_us, int had_errors) { + * This function will make updates to the commandstats, slot-stats, slowlog and monitors. + * The failed_or_rejected parameter indicates whether the blocked command failed internally or was + * rejected/aborted externally. In case the command was rejected the value ERROR_COMMAND_REJECTED should be passed. + * In case the command failed internally, ERROR_COMMAND_FAILED should be passed.
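+ * For example, unblockClientOnError() (a client aborted externally, e.g. by CLIENT UNBLOCK ERROR or a slot + * migration) passes ERROR_COMMAND_REJECTED, while the module unblock paths pass ERROR_COMMAND_FAILED when the + * unblocked command produced an error reply.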
+ * A value of zero indicates that no error was reported after the command was unblocked */ +void updateStatsOnUnblock(client *c, long blocked_us, long reply_us, int failed_or_rejected) { const ustime_t total_cmd_duration = c->duration + blocked_us + reply_us; c->lastcmd->microseconds += total_cmd_duration; clusterSlotStatsAddCpuDuration(c, total_cmd_duration); c->lastcmd->calls++; c->commands_processed++; server.stat_numcommands++; - if (had_errors) c->lastcmd->failed_calls++; + debugServerAssertWithInfo(c, NULL, failed_or_rejected >= 0 && failed_or_rejected <= ERROR_COMMAND_FAILED); + if (failed_or_rejected) { + if (failed_or_rejected & ERROR_COMMAND_FAILED) + c->lastcmd->failed_calls++; + else if (failed_or_rejected & ERROR_COMMAND_REJECTED) + c->lastcmd->rejected_calls++; + else + debugServerAssertWithInfo(c, NULL, 0); + } if (server.latency_tracking_enabled) updateCommandLatencyHistogram(&(c->lastcmd->latency_histogram), total_cmd_duration * 1000); /* Log the command into the Slow log if needed. */ @@ -680,7 +692,8 @@ static void moduleUnblockClientOnKey(client *c, robj *key) { elapsedStart(&replyTimer); if (moduleTryServeClientBlockedOnKey(c, key)) { - updateStatsOnUnblock(c, 0, elapsedUs(replyTimer), server.stat_total_error_replies != prev_error_replies); + updateStatsOnUnblock(c, 0, elapsedUs(replyTimer), + ((server.stat_total_error_replies != prev_error_replies) ? ERROR_COMMAND_FAILED : 0)); moduleUnblockClient(c); } /* We need to call afterCommand even if the client was not unblocked @@ -709,7 +722,7 @@ void unblockClientOnTimeout(client *c) { * If err_str is provided it will be used to reply to the blocked client */ void unblockClientOnError(client *c, const char *err_str) { if (err_str) addReplyError(c, err_str); - updateStatsOnUnblock(c, 0, 0, 1); + updateStatsOnUnblock(c, 0, 0, ERROR_COMMAND_REJECTED); if (c->flag.pending_command) c->flag.pending_command = 0; unblockClient(c, 1); } diff --git a/src/module.c b/src/module.c index db493dd8bc..a8676cb727 100644 --- a/src/module.c +++ b/src/module.c @@ -8325,7 +8325,7 @@ void moduleHandleBlockedClients(void) { if (c && !clientHasModuleAuthInProgress(c)) { int had_errors = c->deferred_reply_errors ? !!listLength(c->deferred_reply_errors) : (server.stat_total_error_replies != prev_error_replies); - updateStatsOnUnblock(c, bc->background_duration, reply_us, had_errors); + updateStatsOnUnblock(c, bc->background_duration, reply_us, (had_errors ? ERROR_COMMAND_FAILED : 0)); } if (c != NULL) { @@ -8411,7 +8411,8 @@ void moduleBlockedClientTimedOut(client *c, int from_module) { moduleFreeContext(&ctx); if (!from_module) - updateStatsOnUnblock(c, bc->background_duration, 0, server.stat_total_error_replies != prev_error_replies); + updateStatsOnUnblock(c, bc->background_duration, 0, + ((server.stat_total_error_replies != prev_error_replies) ?
ERROR_COMMAND_FAILED : 0)); /* For timeout events, we do not want to call the disconnect callback, * because the blocked client will be automatically disconnected in diff --git a/src/server.h b/src/server.h index 424569f76f..f4fb663851 100644 --- a/src/server.h +++ b/src/server.h @@ -3735,7 +3735,7 @@ void blockPostponeClient(client *c); void blockClientForReplicaAck(client *c, mstime_t timeout, long long offset, long numreplicas, int numlocal); void replicationRequestAckFromReplicas(void); void signalDeletedKeyAsReady(serverDb *db, robj *key, int type); -void updateStatsOnUnblock(client *c, long blocked_us, long reply_us, int had_errors); +void updateStatsOnUnblock(client *c, long blocked_us, long reply_us, int failed_or_rejected); void scanDatabaseForDeletedKeys(serverDb *emptied, serverDb *replaced_with); void totalNumberOfStatefulKeys(unsigned long *blocking_keys, unsigned long *blocking_keys_on_nokey, diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index 6d3c4e934f..de7837a9a8 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -192,7 +192,7 @@ start_server {tags {"repl external:skip"}} { } else { fail "Master and replica have different digest: [$A debug digest] VS [$B debug digest]" } - assert_match {*calls=1,*,rejected_calls=0,failed_calls=1*} [cmdrstat blpop $B] + assert_match {*calls=1,*,rejected_calls=1*,failed_calls=0} [cmdrstat blpop $B] } test {Replica output bytes metric} { diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index 3295c5e31a..4a638cac80 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -269,7 +269,7 @@ start_server {tags {"info" "external:skip" "debug_defrag:skip"}} { r client unblock $rd_id error assert_error {UNBLOCKED*} {$rd read} assert_match {*count=1*} [errorstat UNBLOCKED] - assert_match {*calls=1,*,rejected_calls=0,failed_calls=1} [cmdstat blpop] + assert_match {*calls=1,*,rejected_calls=1,failed_calls=0} [cmdstat blpop] assert_equal [s total_error_replies] 1 $rd close } From 471ecf4accdbade989ca17e65496db4a6c907830 Mon Sep 17 00:00:00 2001 From: Amit Nagler <58042354+naglera@users.noreply.github.com> Date: Thu, 2 Jan 2025 04:00:29 +0200 Subject: [PATCH 044/101] Fix unreliable dual channel Valgrind tests (#1500) Used same approach as PR #1165 to solve random failures. 
Resolves #1491 Signed-off-by: naglera --- tests/integration/dual-channel-replication.tcl | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index 3adf9ce9fd..4ca70651a1 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -1256,7 +1256,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set repl-diskless-sync yes $primary config set dual-channel-replication-enabled yes - $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry + $primary config set repl-diskless-sync-delay 0 # Generating RDB will take 100 sec to generate $primary debug populate 1000000 primary 1 @@ -1270,9 +1270,18 @@ start_server {tags {"dual-channel-replication external:skip"}} { $replica config set dual-channel-replication-enabled yes $replica config set loglevel debug - $replica config set repl-timeout 10 $replica config set repl-diskless-load flush-before-load + if {$::valgrind} { + $primary config set repl-timeout 100 + $replica config set repl-timeout 100 + set max_tries 5000 + } else { + $primary config set repl-timeout 10 + $replica config set repl-timeout 10 + set max_tries 500 + } + test "Replica notice main-connection killed during rdb load callback" {; # https://github.com/valkey-io/valkey/issues/1152 set loglines [count_log_lines 0] $replica replicaof $primary_host $primary_port @@ -1287,6 +1296,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { wait_for_log_messages 0 {"*Loading RDB produced by Valkey version*"} $loglines 1000 10 $primary set key val set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"] + $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry $primary debug log "killing replica main connection $replica_main_conn_id" assert {$replica_main_conn_id != ""} set loglines [count_log_lines 0] @@ -1298,8 +1308,8 @@ start_server {tags {"dual-channel-replication external:skip"}} { } else { fail "Primary did not free repl buf block after sync failure" } - wait_for_log_messages 0 {"*Failed trying to load the PRIMARY synchronization DB from socket*"} $loglines 1000 10 - verify_replica_online $primary 0 500 + wait_for_log_messages 0 {"*Failed trying to load the PRIMARY synchronization DB from socket*"} $loglines $max_tries 10 + verify_replica_online $primary 0 $max_tries } } } From 15189c9b7eb56a537f848cabc872a645d41016fc Mon Sep 17 00:00:00 2001 From: uriyage <78144248+uriyage@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:01:55 +0200 Subject: [PATCH 045/101] replication: fix io-threads possible race by moving waitForClientIO (#1422) ### Fix race with pending writes in replica state transition #### The Problem In #60 (Dual channel replication) a new `connWrite` call was added before the `waitForClientIO` check. This created a race condition where the main thread may attempt to write to a client that could have pending writes in IO threads. #### The Fix Moved the `waitForClientIO()` call earlier in `syncCommand`, before any `connWrite` call. This ensures all pending IO operations are completed before attempting to write to the client. 
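To make the ordering concrete, here is a minimal sketch of the invariant this change restores in `syncCommand()` (simplified: the failover check, PSYNC negotiation and the actual replica state transition are elided):

```c
void syncCommand(client *c) {
    /* Ignore SYNC if this client is already a replica. */
    if (c->flag.replica) return;

    /* Must run before any connWrite(): an IO thread may still own a
     * pending read or write for this client, and a concurrent write
     * from the main thread would race with it. */
    waitForClientIO(c);

    /* From here on the main thread may safely reply on c->conn, e.g.
     * failover validation errors or the +FULLRESYNC response, and flip
     * the client into replica state. */
    /* ... failover check, PSYNC handling, replica state setup ... */
}
```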
--------- Signed-off-by: Uri Yagelnik --- src/replication.c | 5 +++-- src/socket.c | 10 ++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/replication.c b/src/replication.c index f907771e71..160b0c4d5e 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1036,6 +1036,9 @@ void syncCommand(client *c) { /* ignore SYNC if already replica or in monitor mode */ if (c->flag.replica) return; + /* Wait for any IO pending operation to finish before changing the client state to replica */ + waitForClientIO(c); + /* Check if this is a failover request to a replica with the same replid and * become a primary if so. */ if (c->argc > 3 && !strcasecmp(c->argv[0]->ptr, "psync") && !strcasecmp(c->argv[3]->ptr, "failover")) { @@ -1148,8 +1151,6 @@ void syncCommand(client *c) { c->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; if (server.repl_disable_tcp_nodelay) connDisableTcpNoDelay(c->conn); /* Non critical if it fails. */ c->repldbfd = -1; - /* Wait for any IO pending operation to finish before changing the client state */ - waitForClientIO(c); c->flag.replica = 1; listAddNodeTail(server.replicas, c); diff --git a/src/socket.c b/src/socket.c index d89e6c8767..94869f3f25 100644 --- a/src/socket.c +++ b/src/socket.c @@ -29,6 +29,7 @@ #include "server.h" #include "connhelpers.h" +#include "io_threads.h" /* The connections module provides a lean abstraction of network connections * to avoid direct socket and async event management across the server code base. @@ -154,6 +155,10 @@ static void connSocketClose(connection *conn) { } static int connSocketWrite(connection *conn, const void *data, size_t data_len) { + /* Assert the main thread is not writing to a connection that is currently offloaded. */ + debugServerAssert(!(conn->flags & CONN_FLAG_ALLOW_ACCEPT_OFFLOAD) || !inMainThread() || + ((client *)connGetPrivateData(conn))->io_write_state != CLIENT_PENDING_IO); + int ret = write(conn->fd, data, data_len); if (ret < 0 && errno != EAGAIN) { conn->last_errno = errno; @@ -182,6 +187,11 @@ static int connSocketWritev(connection *conn, const struct iovec *iov, int iovcn } static int connSocketRead(connection *conn, void *buf, size_t buf_len) { + /* Assert the main thread is not reading from a connection that is currently offloaded. */ + debugServerAssert(!(conn->flags & CONN_FLAG_ALLOW_ACCEPT_OFFLOAD) || !inMainThread() || + ((client *)connGetPrivateData(conn))->io_read_state != CLIENT_PENDING_IO); + + int ret = read(conn->fd, buf, buf_len); if (!ret) { conn->state = CONN_STATE_CLOSED; From f7ac777e52c75670db8d6638b6f8dbbcaf96b714 Mon Sep 17 00:00:00 2001 From: uriyage <78144248+uriyage@users.noreply.github.com> Date: Thu, 2 Jan 2025 11:42:39 +0200 Subject: [PATCH 046/101] Offload reading the replication stream to IO threads (#1449) Support Primary client IO offload. Related issue: https://github.com/valkey-io/valkey/issues/761 --------- Signed-off-by: Uri Yagelnik --- src/io_threads.c | 9 +++++---- src/networking.c | 16 +++++++++++++++- src/replication.c | 3 +++ src/server.h | 1 + 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/io_threads.c b/src/io_threads.c index 90f5b88700..66ef4948b6 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -321,8 +321,8 @@ int trySendReadToIOThreads(client *c) { if (server.active_io_threads_num <= 1) return C_ERR; /* If IO thread is already reading, return C_OK to make sure the main thread will not handle it. 
*/ if (c->io_read_state != CLIENT_IDLE) return C_OK; - /* Currently, replica/master writes are not offloaded and are processed synchronously. */ - if (c->flag.primary || getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; + /* Currently, replica reads are not offloaded to IO threads. */ + if (getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; /* With Lua debug client we may call connWrite directly in the main thread */ if (c->flag.lua_debug) return C_ERR; /* For simplicity let the main-thread handle the blocked clients */ @@ -345,6 +345,7 @@ int trySendReadToIOThreads(client *c) { c->cur_tid = tid; c->read_flags = canParseCommand(c) ? 0 : READ_FLAGS_DONT_PARSE; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; + c->read_flags |= c->flag.primary ? READ_FLAGS_PRIMARY : 0; c->io_read_state = CLIENT_PENDING_IO; connSetPostponeUpdateState(c->conn, 1); @@ -363,8 +364,8 @@ int trySendWriteToIOThreads(client *c) { if (c->io_write_state != CLIENT_IDLE) return C_OK; /* Nothing to write */ if (!clientHasPendingReplies(c)) return C_ERR; - /* Currently, replica/master writes are not offloaded and are processed synchronously. */ - if (c->flag.primary || getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; + /* Currently, replica writes are not offloaded to IO threads. */ + if (getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; /* We can't offload debugged clients as the main-thread may read at the same time */ if (c->flag.lua_debug) return C_ERR; diff --git a/src/networking.c b/src/networking.c index d93046a603..2190fca5bf 100644 --- a/src/networking.c +++ b/src/networking.c @@ -2592,6 +2592,16 @@ void resetClient(client *c) { } } +void resetClientIOState(client *c) { + c->nwritten = 0; + c->nread = 0; + c->io_read_state = c->io_write_state = CLIENT_IDLE; + c->io_parsed_cmd = NULL; + c->flag.pending_command = 0; + c->io_last_bufpos = 0; + c->io_last_reply_block = NULL; +} + /* Initializes the shared query buffer to a new sds with the default capacity. * Need to ensure the initlen is not less than readlen in readToQueryBuf. */ void initSharedQueryBuf(void) { @@ -4962,7 +4972,11 @@ void ioThreadReadQueryFromClient(void *data) { } done: - trimClientQueryBuffer(c); + /* Only trim query buffer for non-primary clients + * Primary client's buffer is handled by main thread using repl_applied position */ + if (!(c->read_flags & READ_FLAGS_PRIMARY)) { + trimClientQueryBuffer(c); + } atomic_thread_fence(memory_order_release); c->io_read_state = CLIENT_COMPLETED_IO; } diff --git a/src/replication.c b/src/replication.c index 160b0c4d5e..bec52a84d0 100644 --- a/src/replication.c +++ b/src/replication.c @@ -4134,6 +4134,8 @@ void replicationCachePrimary(client *c) { serverAssert(server.primary != NULL && server.cached_primary == NULL); serverLog(LL_NOTICE, "Caching the disconnected primary state."); + /* Wait for IO operations to be done before proceeding */ + waitForClientIO(c); /* Unlink the client from the server structures. */ unlinkClient(c); @@ -4151,6 +4153,7 @@ void replicationCachePrimary(client *c) { c->reply_bytes = 0; c->bufpos = 0; resetClient(c); + resetClientIOState(c); /* Save the primary. Server.primary will be set to null later by * replicationHandlePrimaryDisconnection(). 
*/ diff --git a/src/server.h b/src/server.h index f4fb663851..582392bca0 100644 --- a/src/server.h +++ b/src/server.h @@ -2831,6 +2831,7 @@ void logInvalidUseAndFreeClientAsync(client *c, const char *fmt, ...); void beforeNextClient(client *c); void clearClientConnectionState(client *c); void resetClient(client *c); +void resetClientIOState(client *c); void freeClientOriginalArgv(client *c); void freeClientArgv(client *c); void sendReplyToClient(connection *conn); From 245f51df9b27719f1e3ea69ed9f060bd10d98998 Mon Sep 17 00:00:00 2001 From: Wen Hui Date: Thu, 2 Jan 2025 10:12:09 -0500 Subject: [PATCH 047/101] Remove releasetools folder (#1496) The release tool utils/releasetools/ no longer works in Valkey, so in this PR we remove it. Signed-off-by: hwware --- utils/releasetools/01_create_tarball.sh | 14 ---------- utils/releasetools/02_upload_tarball.sh | 23 ---------------- utils/releasetools/03_test_release.sh | 28 -------------------- utils/releasetools/04_release_hash.sh | 13 --------- utils/releasetools/changelog.tcl | 35 ------------------------- 5 files changed, 113 deletions(-) delete mode 100755 utils/releasetools/01_create_tarball.sh delete mode 100755 utils/releasetools/02_upload_tarball.sh delete mode 100755 utils/releasetools/03_test_release.sh delete mode 100755 utils/releasetools/04_release_hash.sh delete mode 100755 utils/releasetools/changelog.tcl diff --git a/utils/releasetools/01_create_tarball.sh b/utils/releasetools/01_create_tarball.sh deleted file mode 100755 index 08fdcb6d16..0000000000 --- a/utils/releasetools/01_create_tarball.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh -if [ $# != "1" ] -then - echo "Usage: ./utils/releasetools/01_create_tarball.sh " - exit 1 -fi - -TAG=$1 -TARNAME="valkey-${TAG}.tar" -echo "Generating /tmp/${TARNAME}" -git archive $TAG --prefix valkey-${TAG}/ > /tmp/$TARNAME || exit 1 -echo "Gizipping the archive" -rm -f /tmp/$TARNAME.gz -gzip -9 /tmp/$TARNAME diff --git a/utils/releasetools/02_upload_tarball.sh b/utils/releasetools/02_upload_tarball.sh deleted file mode 100755 index dcd94ef383..0000000000 --- a/utils/releasetools/02_upload_tarball.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -if [ $# != "1" ] -then - echo "Usage: ./utils/releasetools/02_upload_tarball.sh " - exit 1 -fi - -echo "Uploading..." -scp /tmp/valkey-${1}.tar.gz ubuntu@host.redis.io:/var/www/download/releases/ -echo "Updating web site... " -echo "Please check the github action tests for the release."
-echo "Press any key if it is a stable release, or Ctrl+C to abort" -read x -ssh ubuntu@host.redis.io "cd /var/www/download; - rm -rf valkey-${1}.tar.gz; - wget http://download.redis.io/releases/redis-${1}.tar.gz; - tar xvzf redis-${1}.tar.gz; - rm -rf valkey-stable; - mv valkey-${1} valkey-stable; - tar cvzf valkey-stable.tar.gz valkey-stable; - rm -rf valkey-${1}.tar.gz; - shasum -a 256 valkey-stable.tar.gz > valkey-stable.tar.gz.SHA256SUM; - " diff --git a/utils/releasetools/03_test_release.sh b/utils/releasetools/03_test_release.sh deleted file mode 100755 index 2480d8cfd6..0000000000 --- a/utils/releasetools/03_test_release.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/sh -set -e -if [ $# != "1" ] -then - echo "Usage: ./utils/releasetools/03_test_release.sh " - exit 1 -fi - -TAG=$1 -TARNAME="valkey-${TAG}.tar.gz" -DOWNLOADURL="http://download.redis.io/releases/${TARNAME}" - -echo "Doing sanity test on the actual tarball" - -cd /tmp -rm -rf test_release_tmp_dir -mkdir test_release_tmp_dir -cd test_release_tmp_dir -rm -f $TARNAME -rm -rf valkey-${TAG} -wget $DOWNLOADURL -tar xvzf $TARNAME -cd valkey-${TAG} -make -./runtest -./runtest-sentinel -./runtest-cluster -./runtest-moduleapi diff --git a/utils/releasetools/04_release_hash.sh b/utils/releasetools/04_release_hash.sh deleted file mode 100755 index 8be286fabc..0000000000 --- a/utils/releasetools/04_release_hash.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -if [ $# != "1" ] -then - echo "Usage: ./utils/releasetools/04_release_hash.sh " - exit 1 -fi - -SHA=$(curl -s http://download.redis.io/releases/redis-${1}.tar.gz | shasum -a 256 | cut -f 1 -d' ') -ENTRY="hash valkey-${1}.tar.gz sha256 $SHA http://download.redis.io/releases/redis-${1}.tar.gz" -echo $ENTRY >> ../valkey-hashes/README -echo "Press any key to commit, Ctrl-C to abort)." -read yes -(cd ../valkey-hashes; git commit -a -m "${1} hash."; git push) diff --git a/utils/releasetools/changelog.tcl b/utils/releasetools/changelog.tcl deleted file mode 100755 index e2f8d4364a..0000000000 --- a/utils/releasetools/changelog.tcl +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env tclsh - -if {[llength $::argv] != 2 && [llength $::argv] != 3} { - puts "Usage: $::argv0 \[\]" - exit 1 -} - -set branch [lindex $::argv 0] -set ver [lindex $::argv 1] -if {[llength $::argv] == 3} { - set count [lindex ::$argv 2] -} else { - set count 100 -} - -set template { -================================================================================ -Valkey %ver% Released %date% -================================================================================ - -Upgrade urgency : -} - -set template [string trim $template] -append template "\n\n" -set date [clock format [clock seconds]] -set template [string map [list %ver% $ver %date% $date] $template] - -append template [exec git log $branch~$count..$branch "--format=format:%an in commit %h:%n %s" --shortstat] - -#Older, more verbose version. 
- -#append template [exec git log $branch~30..$branch "--format=format:+-------------------------------------------------------------------------------%n| %s%n| By %an, %ai%n+--------------------------------------------------------------------------------%nhttps://github.com/redis/redis/commit/%H%n%n%b" --stat] - -puts $template From cb7f2759a638cad3a579a8290ad445fbbd83c7b8 Mon Sep 17 00:00:00 2001 From: Ricardo Dias Date: Thu, 2 Jan 2025 17:35:10 +0000 Subject: [PATCH 048/101] Refactor: move all valkey modules related declarations to `module.h` (#1489) In this commit we move all structure and function declarations related to Valkey modules from `server.h` to the recently added `module.h` file. This re-organization makes it easier for new contributors to find the Valkey modules-related code, as well as reducing compilation times when changes are made to the modules code. --------- Signed-off-by: Ricardo Dias --- src/acl.c | 1 + src/aof.c | 3 +- src/blocked.c | 1 + src/cluster.c | 1 + src/cluster_legacy.c | 1 + src/config.c | 24 +---- src/db.c | 1 + src/debug.c | 3 +- src/defrag.c | 1 + src/functions.h | 1 + src/lazyfree.c | 1 + src/module.c | 40 +++++++- src/module.h | 219 +++++++++++++++++++++++++++++++++++++++- src/networking.c | 1 + src/notify.c | 1 + src/object.c | 1 + src/rdb.c | 9 +- src/replication.c | 1 + src/script.c | 1 + src/server.c | 1 + src/server.h | 222 +---------------------------------------- src/valkey-check-rdb.c | 1 + 22 files changed, 283 insertions(+), 252 deletions(-) diff --git a/src/acl.c b/src/acl.c index d1f970a805..725419dcf2 100644 --- a/src/acl.c +++ b/src/acl.c @@ -29,6 +29,7 @@ #include "server.h" #include "sha256.h" +#include "module.h" #include #include diff --git a/src/aof.c b/src/aof.c index 8af3a9928f..8ac44f64c2 100644 --- a/src/aof.c +++ b/src/aof.c @@ -31,6 +31,7 @@ #include "bio.h" #include "rio.h" #include "functions.h" +#include "module.h" #include #include @@ -2167,7 +2168,7 @@ int rewriteModuleObject(rio *r, robj *key, robj *o, int dbid) { ValkeyModuleIO io; moduleValue *mv = o->ptr; moduleType *mt = mv->type; - moduleInitIOContext(io, mt, r, key, dbid); + moduleInitIOContext(&io, mt, r, key, dbid); mt->aof_rewrite(&io, key, mv->value); if (io.ctx) { moduleFreeContext(io.ctx); diff --git a/src/blocked.c b/src/blocked.c index d356ea5c07..39050932d9 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -65,6 +65,7 @@ #include "latency.h" #include "monotonic.h" #include "cluster_slot_stats.h" +#include "module.h" /* forward declarations */ static void unblockClientWaitingData(client *c); diff --git a/src/cluster.c b/src/cluster.c index df6bb86454..39d9161b9c 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -36,6 +36,7 @@ #include "server.h" #include "cluster.h" #include "cluster_slot_stats.h" +#include "module.h" #include diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 80889a79d8..a1b1d0e986 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -38,6 +38,7 @@ #include "cluster_slot_stats.h" #include "endianconv.h" #include "connection.h" +#include "module.h" #include #include diff --git a/src/config.c b/src/config.c index 59cf0d9400..0b459bb6e5 100644 --- a/src/config.c +++ b/src/config.c @@ -32,6 +32,7 @@ #include "cluster.h" #include "connection.h" #include "bio.h" +#include "module.h" #include #include @@ -371,20 +372,6 @@ void resetServerSaveParams(void) { server.saveparamslen = 0; } -void queueLoadModule(sds path, sds *argv, int argc) { - int i; - struct moduleLoadQueueEntry *loadmod; - - loadmod =
zmalloc(sizeof(struct moduleLoadQueueEntry)); - loadmod->argv = argc ? zmalloc(sizeof(robj *) * argc) : NULL; - loadmod->path = sdsnew(path); - loadmod->argc = argc; - for (i = 0; i < argc; i++) { - loadmod->argv[i] = createRawStringObject(argv[i], sdslen(argv[i])); - } - listAddNodeTail(server.loadmodule_queue, loadmod); -} - /* Parse an array of `arg_len` sds strings, validate and populate * server.client_obuf_limits if valid. * Used in CONFIG SET and configuration file parsing. */ @@ -567,7 +554,7 @@ void loadServerConfigFromString(char *config) { goto loaderr; } } else if (!strcasecmp(argv[0], "loadmodule") && argc >= 2) { - queueLoadModule(argv[1], &argv[2], argc - 2); + moduleEnqueueLoadModule(argv[1], &argv[2], argc - 2); } else if (strchr(argv[0], '.')) { if (argc < 2) { err = "Module config specified without value"; @@ -1583,12 +1570,7 @@ void rewriteConfigLoadmoduleOption(struct rewriteConfigState *state) { dictEntry *de; while ((de = dictNext(di)) != NULL) { struct ValkeyModule *module = dictGetVal(de); - line = sdsnew("loadmodule "); - line = sdscatsds(line, module->loadmod->path); - for (int i = 0; i < module->loadmod->argc; i++) { - line = sdscatlen(line, " ", 1); - line = sdscatsds(line, module->loadmod->argv[i]->ptr); - } + line = moduleLoadQueueEntryToLoadmoduleOptionStr(module, "loadmodule"); rewriteConfigRewriteLine(state, "loadmodule", line, 1); } dictReleaseIterator(di); diff --git a/src/db.c b/src/db.c index e31d7e7f7f..1362b5f9dd 100644 --- a/src/db.c +++ b/src/db.c @@ -33,6 +33,7 @@ #include "script.h" #include "functions.h" #include "io_threads.h" +#include "module.h" #include #include diff --git a/src/debug.c b/src/debug.c index 4efe12e237..7e52874e30 100644 --- a/src/debug.c +++ b/src/debug.c @@ -38,6 +38,7 @@ #include "threads_mngr.h" #include "io_threads.h" #include "sds.h" +#include "module.h" #include #include @@ -263,7 +264,7 @@ void xorObjectDigest(serverDb *db, robj *keyobj, unsigned char *digest, robj *o) ValkeyModuleDigest md = {{0}, {0}, keyobj, db->id}; moduleValue *mv = o->ptr; moduleType *mt = mv->type; - moduleInitDigestContext(md); + moduleInitDigestContext(&md); if (mt->digest) { mt->digest(&md, mv->value); xorDigest(digest, md.x, sizeof(md.x)); diff --git a/src/defrag.c b/src/defrag.c index a5d6c69c1c..a755db559a 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -36,6 +36,7 @@ #include "server.h" #include "hashtable.h" #include "script.h" +#include "module.h" #include #ifdef HAVE_DEFRAG diff --git a/src/functions.h b/src/functions.h index 89e39fdc56..a48ff1b8db 100644 --- a/src/functions.h +++ b/src/functions.h @@ -55,6 +55,7 @@ typedef struct functionLibInfo functionLibInfo; /* ValkeyModule type aliases for scripting engine structs and types. 
*/ +typedef struct ValkeyModule ValkeyModule; typedef ValkeyModuleScriptingEngineCtx engineCtx; typedef ValkeyModuleScriptingEngineFunctionCtx functionCtx; typedef ValkeyModuleScriptingEngineCompiledFunction compiledFunction; diff --git a/src/lazyfree.c b/src/lazyfree.c index 4b4c7f06ad..c22d3da964 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -2,6 +2,7 @@ #include "bio.h" #include "functions.h" #include "cluster.h" +#include "module.h" #include diff --git a/src/module.c b/src/module.c index a8676cb727..dabea59d49 100644 --- a/src/module.c +++ b/src/module.c @@ -63,6 +63,7 @@ #include "valkeymodule.h" #include "io_threads.h" #include "functions.h" +#include "module.h" #include #include #include @@ -75,6 +76,12 @@ * pointers that have an API the module can call with them) * -------------------------------------------------------------------------- */ +struct moduleLoadQueueEntry { + sds path; + int argc; + robj **argv; +}; + struct ValkeyModuleInfoCtx { struct ValkeyModule *module; dict *requested_sections; @@ -644,6 +651,35 @@ void *VM_PoolAlloc(ValkeyModuleCtx *ctx, size_t bytes) { * Helpers for modules API implementation * -------------------------------------------------------------------------- */ +void moduleEnqueueLoadModule(sds path, sds *argv, int argc) { + int i; + struct moduleLoadQueueEntry *loadmod; + + loadmod = zmalloc(sizeof(struct moduleLoadQueueEntry)); + loadmod->argv = argc ? zmalloc(sizeof(robj *) * argc) : NULL; + loadmod->path = sdsnew(path); + loadmod->argc = argc; + for (i = 0; i < argc; i++) { + loadmod->argv[i] = createRawStringObject(argv[i], sdslen(argv[i])); + } + listAddNodeTail(server.loadmodule_queue, loadmod); +} + +sds moduleLoadQueueEntryToLoadmoduleOptionStr(ValkeyModule *module, + const char *config_option_str) { + sds line; + + line = sdsnew(config_option_str); + line = sdscatlen(line, " ", 1); + line = sdscatsds(line, module->loadmod->path); + for (int i = 0; i < module->loadmod->argc; i++) { + line = sdscatlen(line, " ", 1); + line = sdscatsds(line, module->loadmod->argv[i]->ptr); + } + + return line; +} + client *moduleAllocTempClient(void) { client *c = NULL; @@ -7401,7 +7437,7 @@ void *VM_LoadDataTypeFromStringEncver(const ValkeyModuleString *str, const modul void *ret; rioInitWithBuffer(&payload, str->ptr); - moduleInitIOContext(io, (moduleType *)mt, &payload, NULL, -1); + moduleInitIOContext(&io, (moduleType *)mt, &payload, NULL, -1); /* All VM_Save*() calls always write a version 2 compatible format, so we * need to make sure we read the same. @@ -7433,7 +7469,7 @@ ValkeyModuleString *VM_SaveDataTypeToString(ValkeyModuleCtx *ctx, void *data, co ValkeyModuleIO io; rioInitWithBuffer(&payload, sdsempty()); - moduleInitIOContext(io, (moduleType *)mt, &payload, NULL, -1); + moduleInitIOContext(&io, (moduleType *)mt, &payload, NULL, -1); mt->rdb_save(&io, data); if (io.ctx) { moduleFreeContext(io.ctx); diff --git a/src/module.h b/src/module.h index f61ef1e3cb..78d9341ca9 100644 --- a/src/module.h +++ b/src/module.h @@ -5,13 +5,228 @@ * not part of the module API, but are used by the core to interact with modules */ -typedef struct ValkeyModuleCtx ValkeyModuleCtx; -typedef struct ValkeyModule ValkeyModule; +/* Extract encver / signature from a module type ID. 
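+ * The 64-bit ID keeps the type signature in the upper 54 bits and the encoding version in the lower + * VALKEYMODULE_TYPE_ENCVER_BITS (10) bits, so id == (signature << 10) | encver.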
*/ +#define VALKEYMODULE_TYPE_ENCVER_BITS 10 +#define VALKEYMODULE_TYPE_ENCVER_MASK ((1 << VALKEYMODULE_TYPE_ENCVER_BITS) - 1) +#define VALKEYMODULE_TYPE_ENCVER(id) ((id) & VALKEYMODULE_TYPE_ENCVER_MASK) +#define VALKEYMODULE_TYPE_SIGN(id) \ + (((id) & ~((uint64_t)VALKEYMODULE_TYPE_ENCVER_MASK)) >> VALKEYMODULE_TYPE_ENCVER_BITS) +/* Bit flags for moduleTypeAuxSaveFunc */ +#define VALKEYMODULE_AUX_BEFORE_RDB (1 << 0) +#define VALKEYMODULE_AUX_AFTER_RDB (1 << 1) + +struct ValkeyModule; +struct ValkeyModuleIO; +struct ValkeyModuleDigest; +struct ValkeyModuleCtx; +struct moduleLoadQueueEntry; +struct ValkeyModuleKeyOptCtx; +struct ValkeyModuleCommand; +struct clusterState; + +/* Each module type implementation should export a set of methods in order + * to serialize and deserialize the value in the RDB file, rewrite the AOF + * log, create the digest for "DEBUG DIGEST", and free the value when a key + * is deleted. */ +typedef void *(*moduleTypeLoadFunc)(struct ValkeyModuleIO *io, int encver); +typedef void (*moduleTypeSaveFunc)(struct ValkeyModuleIO *io, void *value); +typedef int (*moduleTypeAuxLoadFunc)(struct ValkeyModuleIO *rdb, int encver, int when); +typedef void (*moduleTypeAuxSaveFunc)(struct ValkeyModuleIO *rdb, int when); +typedef void (*moduleTypeRewriteFunc)(struct ValkeyModuleIO *io, struct serverObject *key, void *value); +typedef void (*moduleTypeDigestFunc)(struct ValkeyModuleDigest *digest, void *value); +typedef size_t (*moduleTypeMemUsageFunc)(const void *value); +typedef void (*moduleTypeFreeFunc)(void *value); +typedef size_t (*moduleTypeFreeEffortFunc)(struct serverObject *key, const void *value); +typedef void (*moduleTypeUnlinkFunc)(struct serverObject *key, void *value); +typedef void *(*moduleTypeCopyFunc)(struct serverObject *fromkey, struct serverObject *tokey, const void *value); +typedef int (*moduleTypeDefragFunc)(struct ValkeyModuleDefragCtx *ctx, struct serverObject *key, void **value); +typedef size_t (*moduleTypeMemUsageFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value, size_t sample_size); +typedef void (*moduleTypeFreeFunc2)(struct ValkeyModuleKeyOptCtx *ctx, void *value); +typedef size_t (*moduleTypeFreeEffortFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value); +typedef void (*moduleTypeUnlinkFunc2)(struct ValkeyModuleKeyOptCtx *ctx, void *value); +typedef void *(*moduleTypeCopyFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value); +typedef int (*moduleTypeAuthCallback)(struct ValkeyModuleCtx *ctx, void *username, void *password, const char **err); + + +/* The module type, which is referenced in each value of a given type, defines + * the methods and links to the module exporting the type. */ +typedef struct ValkeyModuleType { + uint64_t id; /* Higher 54 bits of type ID + 10 lower bits of encoding ver. */ + struct ValkeyModule *module; + moduleTypeLoadFunc rdb_load; + moduleTypeSaveFunc rdb_save; + moduleTypeRewriteFunc aof_rewrite; + moduleTypeMemUsageFunc mem_usage; + moduleTypeDigestFunc digest; + moduleTypeFreeFunc free; + moduleTypeFreeEffortFunc free_effort; + moduleTypeUnlinkFunc unlink; + moduleTypeCopyFunc copy; + moduleTypeDefragFunc defrag; + moduleTypeAuxLoadFunc aux_load; + moduleTypeAuxSaveFunc aux_save; + moduleTypeMemUsageFunc2 mem_usage2; + moduleTypeFreeEffortFunc2 free_effort2; + moduleTypeUnlinkFunc2 unlink2; + moduleTypeCopyFunc2 copy2; + moduleTypeAuxSaveFunc aux_save2; + int aux_save_triggers; + char name[10]; /* 9 bytes name + null term. 
Charset: A-Z a-z 0-9 _- */ +} moduleType; + +/* In Object 'robj' structures of type OBJ_MODULE, the value pointer + * is set to the following structure, referencing the moduleType structure + * in order to work with the value, and at the same time providing a raw + * pointer to the value, as created by the module commands operating with + * the module type. + * + * So for example in order to free such a value, it is possible to use + * the following code: + * + * if (robj->type == OBJ_MODULE) { + * moduleValue *mt = robj->ptr; + * mt->type->free(mt->value); + * zfree(mt); // We need to release this in-the-middle struct as well. + * } + */ +typedef struct moduleValue { + moduleType *type; + void *value; +} moduleValue; + +/* This structure represents a module inside the system. */ +typedef struct ValkeyModule { + void *handle; /* Module dlopen() handle. */ + char *name; /* Module name. */ + int ver; /* Module version. We use just progressive integers. */ + int apiver; /* Module API version as requested during initialization.*/ + list *types; /* Module data types. */ + list *usedby; /* List of modules using APIs from this one. */ + list *using; /* List of modules we use some APIs of. */ + list *filters; /* List of filters the module has registered. */ + list *module_configs; /* List of configurations the module has registered */ + int configs_initialized; /* Have the module configurations been initialized? */ + int in_call; /* RM_Call() nesting level */ + int in_hook; /* Hooks callback nesting level for this module (0 or 1). */ + int options; /* Module options and capabilities. */ + int blocked_clients; /* Count of ValkeyModuleBlockedClient in this module. */ + ValkeyModuleInfoFunc info_cb; /* Callback for module to add INFO fields. */ + ValkeyModuleDefragFunc defrag_cb; /* Callback for global data defrag. */ + struct moduleLoadQueueEntry *loadmod; /* Module load arguments for config rewrite. */ + int num_commands_with_acl_categories; /* Number of commands in this module included in acl categories */ + int onload; /* Flag to identify if the call is being made from Onload (0 or 1) */ + size_t num_acl_categories_added; /* Number of acl categories added by this module. */ +} ValkeyModule; + +/* This is a wrapper for the 'rio' streams used inside rdb.c in the server, so that + * the user does not have to take the total count of the written bytes nor + * to care about error conditions. */ +typedef struct ValkeyModuleIO { + size_t bytes; /* Bytes read / written so far. */ + rio *rio; /* Rio stream. */ + moduleType *type; /* Module type doing the operation. */ + int error; /* True if error condition happened. */ + ValkeyModuleCtx *ctx; /* Optional context, see RM_GetContextFromIO()*/ + robj *key; /* Optional name of key processed */ + int dbid; /* The dbid of the key being processed, -1 when unknown. */ + sds pre_flush_buffer; /* A buffer that should be flushed before next write operation + * See rdbSaveSingleModuleAux for more details */ +} ValkeyModuleIO; + +/* Macro to initialize an IO context. Note that the 'ver' field is populated + * inside rdb.c according to the version of the value to load. */ +static inline void moduleInitIOContext(ValkeyModuleIO *iovar, + moduleType *mtype, + rio *rioptr, + robj *keyptr, + int db) { + iovar->rio = rioptr; + iovar->type = mtype; + iovar->bytes = 0; + iovar->error = 0; + iovar->key = keyptr; + iovar->dbid = db; + iovar->ctx = NULL; + iovar->pre_flush_buffer = NULL; +} + +/* This is a structure used to export DEBUG DIGEST capabilities to + * modules. 
We want to capture both the ordered and unordered elements of + * a data structure, so that a digest can be created in a way that correctly + * reflects the values. See the DEBUG DIGEST command implementation for more + * background. */ +typedef struct ValkeyModuleDigest { + unsigned char o[20]; /* Ordered elements. */ + unsigned char x[20]; /* Xored elements. */ + robj *key; /* Optional name of key processed */ + int dbid; /* The dbid of the key being processed */ +} ValkeyModuleDigest; + +/* Just start with a digest composed of all zero bytes. */ +static inline void moduleInitDigestContext(ValkeyModuleDigest *mdvar) { + memset(mdvar->o, 0, sizeof(mdvar->o)); + memset(mdvar->x, 0, sizeof(mdvar->x)); +} + +void moduleEnqueueLoadModule(sds path, sds *argv, int argc); +sds moduleLoadQueueEntryToLoadmoduleOptionStr(ValkeyModule *module, + const char *config_option_str); ValkeyModuleCtx *moduleAllocateContext(void); void moduleScriptingEngineInitContext(ValkeyModuleCtx *out_ctx, ValkeyModule *module, client *client); void moduleFreeContext(ValkeyModuleCtx *ctx); +void moduleInitModulesSystem(void); +void moduleInitModulesSystemLast(void); +void modulesCron(void); +int moduleLoad(const char *path, void **argv, int argc, int is_loadex); +int moduleUnload(sds name, const char **errmsg); +void moduleLoadFromQueue(void); +int moduleGetCommandKeysViaAPI(struct serverCommand *cmd, robj **argv, int argc, getKeysResult *result); +int moduleGetCommandChannelsViaAPI(struct serverCommand *cmd, robj **argv, int argc, getKeysResult *result); +moduleType *moduleTypeLookupModuleByID(uint64_t id); +moduleType *moduleTypeLookupModuleByName(const char *name); +moduleType *moduleTypeLookupModuleByNameIgnoreCase(const char *name); +void moduleTypeNameByID(char *name, uint64_t moduleid); +const char *moduleTypeModuleName(moduleType *mt); +const char *moduleNameFromCommand(struct serverCommand *cmd); +void moduleFreeContext(ValkeyModuleCtx *ctx); +void moduleCallCommandUnblockedHandler(client *c); +int isModuleClientUnblocked(client *c); +void unblockClientFromModule(client *c); +void moduleHandleBlockedClients(void); +void moduleBlockedClientTimedOut(client *c, int from_module); +void modulePipeReadable(aeEventLoop *el, int fd, void *privdata, int mask); +size_t moduleCount(void); +void moduleAcquireGIL(void); +int moduleTryAcquireGIL(void); +void moduleReleaseGIL(void); +void moduleNotifyKeyspaceEvent(int type, const char *event, robj *key, int dbid); +void firePostExecutionUnitJobs(void); +void moduleCallCommandFilters(client *c); +void modulePostExecutionUnitOperations(void); +void ModuleForkDoneHandler(int exitcode, int bysignal); +int TerminateModuleForkChild(int child_pid, int wait); +ssize_t rdbSaveModulesAux(rio *rdb, int when); +int moduleAllDatatypesHandleErrors(void); +int moduleAllModulesHandleReplAsyncLoad(void); +sds modulesCollectInfo(sds info, dict *sections_dict, int for_crash_report, int sections); +void moduleFireServerEvent(uint64_t eid, int subid, void *data); +void processModuleLoadingProgressEvent(int is_aof); +int moduleTryServeClientBlockedOnKey(client *c, robj *key); +void moduleUnblockClient(client *c); +int moduleBlockedClientMayTimeout(client *c); +int moduleClientIsBlockedOnKeys(client *c); +void moduleNotifyUserChanged(client *c); +void moduleNotifyKeyUnlink(robj *key, robj *val, int dbid, int flags); +size_t moduleGetFreeEffort(robj *key, robj *val, int dbid); +size_t moduleGetMemUsage(robj *key, robj *val, size_t sample_size, int dbid); +robj *moduleTypeDupOrReply(client *c, robj 
*fromkey, robj *tokey, int todb, robj *value); +int moduleDefragValue(robj *key, robj *obj, int dbid); +int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid); +void moduleDefragGlobals(void); +void *moduleGetHandleByName(char *modulename); +int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd); #endif /* _MODULE_H_ */ diff --git a/src/networking.c b/src/networking.c index 2190fca5bf..08e9a56313 100644 --- a/src/networking.c +++ b/src/networking.c @@ -35,6 +35,7 @@ #include "fpconv_dtoa.h" #include "fmtargs.h" #include "io_threads.h" +#include "module.h" #include #include #include diff --git a/src/notify.c b/src/notify.c index c655457e8b..d10d7dd9b9 100644 --- a/src/notify.c +++ b/src/notify.c @@ -28,6 +28,7 @@ */ #include "server.h" +#include "module.h" /* This file implements keyspace events notification via Pub/Sub and * described at https://valkey.io/topics/notifications */ diff --git a/src/object.c b/src/object.c index 15363f31b8..637b25e30c 100644 --- a/src/object.c +++ b/src/object.c @@ -34,6 +34,7 @@ #include "intset.h" /* Compact integer set structure */ #include "zmalloc.h" #include "sds.h" +#include "module.h" #include #include diff --git a/src/rdb.c b/src/rdb.c index a4eb2823fb..958eac5d4f 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -37,6 +37,7 @@ #include "intset.h" /* Compact integer set structure */ #include "bio.h" #include "zmalloc.h" +#include "module.h" #include #include @@ -1098,7 +1099,7 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { * to call the right module during loading. */ int retval = rdbSaveLen(rdb, mt->id); if (retval == -1) return -1; - moduleInitIOContext(io, mt, rdb, key, dbid); + moduleInitIOContext(&io, mt, rdb, key, dbid); io.bytes += retval; /* Then write the module-specific representation + EOF marker. */ @@ -1242,7 +1243,7 @@ ssize_t rdbSaveSingleModuleAux(rio *rdb, int when, moduleType *mt) { /* Save a module-specific aux value. */ ValkeyModuleIO io; int retval = 0; - moduleInitIOContext(io, mt, rdb, NULL, -1); + moduleInitIOContext(&io, mt, rdb, NULL, -1); /* We save the AUX field header in a temporary buffer so we can support aux_save2 API. * If aux_save2 is used the buffer will be flushed at the first time the module will perform @@ -2795,7 +2796,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { ValkeyModuleIO io; robj keyobj; initStaticStringObject(keyobj, key); - moduleInitIOContext(io, mt, rdb, &keyobj, dbid); + moduleInitIOContext(&io, mt, rdb, &keyobj, dbid); /* Call the rdb_load method of the module providing the 10 bit * encoding version in the lower 10 bits of the module ID. */ void *ptr = mt->rdb_load(&io, moduleid & 1023); @@ -3221,7 +3222,7 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin } ValkeyModuleIO io; - moduleInitIOContext(io, mt, rdb, NULL, -1); + moduleInitIOContext(&io, mt, rdb, NULL, -1); /* Call the rdb_load method of the module providing the 10 bit * encoding version in the lower 10 bits of the module ID. 
*/ int rc = mt->aux_load(&io, moduleid & 1023, when); diff --git a/src/replication.c b/src/replication.c index bec52a84d0..c5611d5a5a 100644 --- a/src/replication.c +++ b/src/replication.c @@ -35,6 +35,7 @@ #include "bio.h" #include "functions.h" #include "connection.h" +#include "module.h" #include #include diff --git a/src/script.c b/src/script.c index f1d0a8fb79..f342d496fc 100644 --- a/src/script.c +++ b/src/script.c @@ -31,6 +31,7 @@ #include "script.h" #include "cluster.h" #include "cluster_slot_stats.h" +#include "module.h" scriptFlag scripts_flags_def[] = { {.flag = SCRIPT_FLAG_NO_WRITES, .str = "no-writes"}, diff --git a/src/server.c b/src/server.c index b997a9aec6..e53e7ff552 100644 --- a/src/server.c +++ b/src/server.c @@ -42,6 +42,7 @@ #include "fmtargs.h" #include "io_threads.h" #include "sds.h" +#include "module.h" #include #include diff --git a/src/server.h b/src/server.h index 582392bca0..b65488aab8 100644 --- a/src/server.h +++ b/src/server.h @@ -701,168 +701,7 @@ typedef enum { #define OBJ_STREAM 6 /* Stream object. */ #define OBJ_TYPE_MAX 7 /* Maximum number of object types */ -/* Extract encver / signature from a module type ID. */ -#define VALKEYMODULE_TYPE_ENCVER_BITS 10 -#define VALKEYMODULE_TYPE_ENCVER_MASK ((1 << VALKEYMODULE_TYPE_ENCVER_BITS) - 1) -#define VALKEYMODULE_TYPE_ENCVER(id) ((id) & VALKEYMODULE_TYPE_ENCVER_MASK) -#define VALKEYMODULE_TYPE_SIGN(id) \ - (((id) & ~((uint64_t)VALKEYMODULE_TYPE_ENCVER_MASK)) >> VALKEYMODULE_TYPE_ENCVER_BITS) - -/* Bit flags for moduleTypeAuxSaveFunc */ -#define VALKEYMODULE_AUX_BEFORE_RDB (1 << 0) -#define VALKEYMODULE_AUX_AFTER_RDB (1 << 1) - -struct ValkeyModule; -struct ValkeyModuleIO; -struct ValkeyModuleDigest; -struct ValkeyModuleCtx; -struct moduleLoadQueueEntry; -struct ValkeyModuleKeyOptCtx; -struct ValkeyModuleCommand; -struct clusterState; - -/* Each module type implementation should export a set of methods in order - * to serialize and deserialize the value in the RDB file, rewrite the AOF - * log, create the digest for "DEBUG DIGEST", and free the value when a key - * is deleted. 
*/ -typedef void *(*moduleTypeLoadFunc)(struct ValkeyModuleIO *io, int encver); -typedef void (*moduleTypeSaveFunc)(struct ValkeyModuleIO *io, void *value); -typedef int (*moduleTypeAuxLoadFunc)(struct ValkeyModuleIO *rdb, int encver, int when); -typedef void (*moduleTypeAuxSaveFunc)(struct ValkeyModuleIO *rdb, int when); -typedef void (*moduleTypeRewriteFunc)(struct ValkeyModuleIO *io, struct serverObject *key, void *value); -typedef void (*moduleTypeDigestFunc)(struct ValkeyModuleDigest *digest, void *value); -typedef size_t (*moduleTypeMemUsageFunc)(const void *value); -typedef void (*moduleTypeFreeFunc)(void *value); -typedef size_t (*moduleTypeFreeEffortFunc)(struct serverObject *key, const void *value); -typedef void (*moduleTypeUnlinkFunc)(struct serverObject *key, void *value); -typedef void *(*moduleTypeCopyFunc)(struct serverObject *fromkey, struct serverObject *tokey, const void *value); -typedef int (*moduleTypeDefragFunc)(struct ValkeyModuleDefragCtx *ctx, struct serverObject *key, void **value); -typedef size_t (*moduleTypeMemUsageFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value, size_t sample_size); -typedef void (*moduleTypeFreeFunc2)(struct ValkeyModuleKeyOptCtx *ctx, void *value); -typedef size_t (*moduleTypeFreeEffortFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value); -typedef void (*moduleTypeUnlinkFunc2)(struct ValkeyModuleKeyOptCtx *ctx, void *value); -typedef void *(*moduleTypeCopyFunc2)(struct ValkeyModuleKeyOptCtx *ctx, const void *value); -typedef int (*moduleTypeAuthCallback)(struct ValkeyModuleCtx *ctx, void *username, void *password, const char **err); - - -/* The module type, which is referenced in each value of a given type, defines - * the methods and links to the module exporting the type. */ -typedef struct ValkeyModuleType { - uint64_t id; /* Higher 54 bits of type ID + 10 lower bits of encoding ver. */ - struct ValkeyModule *module; - moduleTypeLoadFunc rdb_load; - moduleTypeSaveFunc rdb_save; - moduleTypeRewriteFunc aof_rewrite; - moduleTypeMemUsageFunc mem_usage; - moduleTypeDigestFunc digest; - moduleTypeFreeFunc free; - moduleTypeFreeEffortFunc free_effort; - moduleTypeUnlinkFunc unlink; - moduleTypeCopyFunc copy; - moduleTypeDefragFunc defrag; - moduleTypeAuxLoadFunc aux_load; - moduleTypeAuxSaveFunc aux_save; - moduleTypeMemUsageFunc2 mem_usage2; - moduleTypeFreeEffortFunc2 free_effort2; - moduleTypeUnlinkFunc2 unlink2; - moduleTypeCopyFunc2 copy2; - moduleTypeAuxSaveFunc aux_save2; - int aux_save_triggers; - char name[10]; /* 9 bytes name + null term. Charset: A-Z a-z 0-9 _- */ -} moduleType; - -/* In Object 'robj' structures of type OBJ_MODULE, the value pointer - * is set to the following structure, referencing the moduleType structure - * in order to work with the value, and at the same time providing a raw - * pointer to the value, as created by the module commands operating with - * the module type. - * - * So for example in order to free such a value, it is possible to use - * the following code: - * - * if (robj->type == OBJ_MODULE) { - * moduleValue *mt = robj->ptr; - * mt->type->free(mt->value); - * zfree(mt); // We need to release this in-the-middle struct as well. - * } - */ -typedef struct moduleValue { - moduleType *type; - void *value; -} moduleValue; - -/* This structure represents a module inside the system. */ -struct ValkeyModule { - void *handle; /* Module dlopen() handle. */ - char *name; /* Module name. */ - int ver; /* Module version. We use just progressive integers. 
*/ - int apiver; /* Module API version as requested during initialization.*/ - list *types; /* Module data types. */ - list *usedby; /* List of modules using APIs from this one. */ - list *using; /* List of modules we use some APIs of. */ - list *filters; /* List of filters the module has registered. */ - list *module_configs; /* List of configurations the module has registered */ - int configs_initialized; /* Have the module configurations been initialized? */ - int in_call; /* RM_Call() nesting level */ - int in_hook; /* Hooks callback nesting level for this module (0 or 1). */ - int options; /* Module options and capabilities. */ - int blocked_clients; /* Count of ValkeyModuleBlockedClient in this module. */ - ValkeyModuleInfoFunc info_cb; /* Callback for module to add INFO fields. */ - ValkeyModuleDefragFunc defrag_cb; /* Callback for global data defrag. */ - struct moduleLoadQueueEntry *loadmod; /* Module load arguments for config rewrite. */ - int num_commands_with_acl_categories; /* Number of commands in this module included in acl categories */ - int onload; /* Flag to identify if the call is being made from Onload (0 or 1) */ - size_t num_acl_categories_added; /* Number of acl categories added by this module. */ -}; -typedef struct ValkeyModule ValkeyModule; - -/* This is a wrapper for the 'rio' streams used inside rdb.c in the server, so that - * the user does not have to take the total count of the written bytes nor - * to care about error conditions. */ -struct ValkeyModuleIO { - size_t bytes; /* Bytes read / written so far. */ - rio *rio; /* Rio stream. */ - moduleType *type; /* Module type doing the operation. */ - int error; /* True if error condition happened. */ - struct ValkeyModuleCtx *ctx; /* Optional context, see RM_GetContextFromIO()*/ - struct serverObject *key; /* Optional name of key processed */ - int dbid; /* The dbid of the key being processed, -1 when unknown. */ - sds pre_flush_buffer; /* A buffer that should be flushed before next write operation - * See rdbSaveSingleModuleAux for more details */ -}; - -/* Macro to initialize an IO context. Note that the 'ver' field is populated - * inside rdb.c according to the version of the value to load. */ -#define moduleInitIOContext(iovar, mtype, rioptr, keyptr, db) \ - do { \ - iovar.rio = rioptr; \ - iovar.type = mtype; \ - iovar.bytes = 0; \ - iovar.error = 0; \ - iovar.key = keyptr; \ - iovar.dbid = db; \ - iovar.ctx = NULL; \ - iovar.pre_flush_buffer = NULL; \ - } while (0) - -/* This is a structure used to export DEBUG DIGEST capabilities to - * modules. We want to capture both the ordered and unordered elements of - * a data structure, so that a digest can be created in a way that correctly - * reflects the values. See the DEBUG DIGEST command implementation for more - * background. */ -struct ValkeyModuleDigest { - unsigned char o[20]; /* Ordered elements. */ - unsigned char x[20]; /* Xored elements. */ - struct serverObject *key; /* Optional name of key processed */ - int dbid; /* The dbid of the key being processed */ -}; - -/* Just start with a digest composed of all zero bytes. */ -#define moduleInitDigestContext(mdvar) \ - do { \ - memset(mdvar.o, 0, sizeof(mdvar.o)); \ - memset(mdvar.x, 0, sizeof(mdvar.x)); \ - } while (0) +typedef struct ValkeyModuleType moduleType; /* Macro to check if the client is in the middle of module based authentication. 
*/ #define clientHasModuleAuthInProgress(c) ((c)->module_auth_ctx != NULL) @@ -1418,12 +1257,6 @@ struct saveparam { int changes; }; -struct moduleLoadQueueEntry { - sds path; - int argc; - robj **argv; -}; - struct sentinelLoadQueueEntry { int argc; sds *argv; @@ -2717,59 +2550,6 @@ extern dict *modules; /* Command metadata */ void populateCommandLegacyRangeSpec(struct serverCommand *c); -/* Modules */ -void moduleInitModulesSystem(void); -void moduleInitModulesSystemLast(void); -void modulesCron(void); -int moduleLoad(const char *path, void **argv, int argc, int is_loadex); -int moduleUnload(sds name, const char **errmsg); -void moduleLoadFromQueue(void); -int moduleGetCommandKeysViaAPI(struct serverCommand *cmd, robj **argv, int argc, getKeysResult *result); -int moduleGetCommandChannelsViaAPI(struct serverCommand *cmd, robj **argv, int argc, getKeysResult *result); -moduleType *moduleTypeLookupModuleByID(uint64_t id); -moduleType *moduleTypeLookupModuleByName(const char *name); -moduleType *moduleTypeLookupModuleByNameIgnoreCase(const char *name); -void moduleTypeNameByID(char *name, uint64_t moduleid); -const char *moduleTypeModuleName(moduleType *mt); -const char *moduleNameFromCommand(struct serverCommand *cmd); -void moduleFreeContext(struct ValkeyModuleCtx *ctx); -void moduleCallCommandUnblockedHandler(client *c); -int isModuleClientUnblocked(client *c); -void unblockClientFromModule(client *c); -void moduleHandleBlockedClients(void); -void moduleBlockedClientTimedOut(client *c, int from_module); -void modulePipeReadable(aeEventLoop *el, int fd, void *privdata, int mask); -size_t moduleCount(void); -void moduleAcquireGIL(void); -int moduleTryAcquireGIL(void); -void moduleReleaseGIL(void); -void moduleNotifyKeyspaceEvent(int type, const char *event, robj *key, int dbid); -void firePostExecutionUnitJobs(void); -void moduleCallCommandFilters(client *c); -void modulePostExecutionUnitOperations(void); -void ModuleForkDoneHandler(int exitcode, int bysignal); -int TerminateModuleForkChild(int child_pid, int wait); -ssize_t rdbSaveModulesAux(rio *rdb, int when); -int moduleAllDatatypesHandleErrors(void); -int moduleAllModulesHandleReplAsyncLoad(void); -sds modulesCollectInfo(sds info, dict *sections_dict, int for_crash_report, int sections); -void moduleFireServerEvent(uint64_t eid, int subid, void *data); -void processModuleLoadingProgressEvent(int is_aof); -int moduleTryServeClientBlockedOnKey(client *c, robj *key); -void moduleUnblockClient(client *c); -int moduleBlockedClientMayTimeout(client *c); -int moduleClientIsBlockedOnKeys(client *c); -void moduleNotifyUserChanged(client *c); -void moduleNotifyKeyUnlink(robj *key, robj *val, int dbid, int flags); -size_t moduleGetFreeEffort(robj *key, robj *val, int dbid); -size_t moduleGetMemUsage(robj *key, robj *val, size_t sample_size, int dbid); -robj *moduleTypeDupOrReply(client *c, robj *fromkey, robj *tokey, int todb, robj *value); -int moduleDefragValue(robj *key, robj *obj, int dbid); -int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid); -void moduleDefragGlobals(void); -void *moduleGetHandleByName(char *modulename); -int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd); - /* Utils */ long long ustime(void); mstime_t mstime(void); diff --git a/src/valkey-check-rdb.c b/src/valkey-check-rdb.c index ba94c172c7..6f23c21fb8 100644 --- a/src/valkey-check-rdb.c +++ b/src/valkey-check-rdb.c @@ -30,6 +30,7 @@ #include "mt19937-64.h" #include "server.h" #include "rdb.h" +#include 
"module.h" #include #include From 4dcb2b706edf91be13f5c85530f11cb1e66b68ea Mon Sep 17 00:00:00 2001 From: Wen Hui Date: Thu, 2 Jan 2025 20:37:55 -0500 Subject: [PATCH 049/101] Update Redis legacy keyword and link in utils/whatisdoing.sh (#1495) Signed-off-by: hwware --- utils/whatisdoing.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/whatisdoing.sh b/utils/whatisdoing.sh index 68d7f7cca6..00117f4863 100755 --- a/utils/whatisdoing.sh +++ b/utils/whatisdoing.sh @@ -1,15 +1,15 @@ # This script is from http://poormansprofiler.org/ # -# NOTE: Instead of using this script, you should use the Redis +# NOTE: Instead of using this script, you should use the Valkey # Software Watchdog, which provides a similar functionality but in # a more reliable / easy to use way. # -# Check https://redis.io/topics/latency for more information. +# Check https://valkey.io/topics/latency for more information. #!/bin/bash nsamples=1 sleeptime=0 -pid=$(ps auxww | grep '[r]edis-server' | awk '{print $2}') +pid=$(ps auxww | grep '[v]alkey-server' | awk '{print $2}') for x in $(seq 1 $nsamples) do From 890bb71114358ec5473f5d78b5cd4a6913488126 Mon Sep 17 00:00:00 2001 From: gmbnomis Date: Fri, 3 Jan 2025 02:41:15 +0100 Subject: [PATCH 050/101] Use the correct command proc for the LOOKUP_NOTOUCH exception in lookupKey (#1499) When looking up a key in no-touch mode, `LOOKUP_NOTOUCH` is set to avoid updating the last access time in `lookupKey`. An exception must be made for the `TOUCH` command which must always update the key. When called from a script, `server.executing_client` will point to the `TOUCH` command, while `server.current_client` will point to e.g. an `EVAL` command. So, we must use the former to find out the currently executing command if defined. This fix addresses the issue where TOUCH wasn't updating key access times when called from scripts like EVAL. Fixes #1498 Signed-off-by: Simon Baatz Co-authored-by: Binbin --- src/db.c | 2 +- tests/unit/introspection-2.tcl | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/db.c b/src/db.c index 1362b5f9dd..9a53e6b4d1 100644 --- a/src/db.c +++ b/src/db.c @@ -125,7 +125,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { * Don't do it if we have a saving child, as this will trigger * a copy on write madness. */ if (server.current_client && server.current_client->flag.no_touch && - server.current_client->cmd->proc != touchCommand) + server.executing_client->cmd->proc != touchCommand) flags |= LOOKUP_NOTOUCH; if (!hasActiveChildProcess() && !(flags & LOOKUP_NOTOUCH)) { /* Shared objects can't be stored in the database. 
*/ diff --git a/tests/unit/introspection-2.tcl b/tests/unit/introspection-2.tcl index b8f4e0aed4..301c86937b 100644 --- a/tests/unit/introspection-2.tcl +++ b/tests/unit/introspection-2.tcl @@ -30,11 +30,24 @@ start_server {tags {"introspection"}} { assert {[r object idletime foo] >= 2} } - test {TOUCH alters the last access time of a key} { + proc test_touch_alters_access_time {} { r set foo bar + r set script_foo bar after 3000 r touch foo + r eval {redis.call('touch', KEYS[1])} 1 script_foo assert {[r object idletime foo] < 2} + assert {[r object idletime script_foo] < 2} + } + + test {TOUCH alters the last access time of a key} { + test_touch_alters_access_time + } + + test {TOUCH alters the last access time of a key in no-touch mode} { + r client no-touch on + test_touch_alters_access_time + r client no-touch off } test {Operations in no-touch mode do not alter the last access time of a key} { From 888ea5aeeb5c572a9118040a506f00e5a259ace9 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Thu, 2 Jan 2025 17:43:16 -0800 Subject: [PATCH 051/101] Move coverity back to ubuntu 22 until test failures are fixed (#1504) The issues in #1453 seem to have only shown up since we moved to ubuntu 24, as part of the rolling `ubuntu-latest` migration from 22->24. Closes #1453. Signed-off-by: Madelyn Olson --- .github/workflows/coverity.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 2561e4ceb5..acf5686c13 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -17,7 +17,7 @@ permissions: jobs: coverity: if: github.repository == 'valkey-io/valkey' - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Download and extract the Coverity Build Tool From 3b085074ee894ce253d0e8415c9b53e8549dff43 Mon Sep 17 00:00:00 2001 From: eifrah-aws Date: Fri, 3 Jan 2025 03:44:41 +0200 Subject: [PATCH 052/101] CMake: fail on warnings (#1503) When building with `CMake` (especially the targets `valkey-cli`, `valkey-server` and `valkey-benchmark`) it is possible to have a successful build while having warnings.
This PR fixes this - which is aligned with how the `Makefile` is working today: - Enable `-Wall` + `-Werror` for valkey targets - Fixed warning in valkey-cli:jsonStringOutput method Signed-off-by: Eran Ifrah --- cmake/Modules/ValkeySetup.cmake | 3 +++ src/valkey-cli.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/cmake/Modules/ValkeySetup.cmake b/cmake/Modules/ValkeySetup.cmake index 8a4d4da1c9..77360844fc 100644 --- a/cmake/Modules/ValkeySetup.cmake +++ b/cmake/Modules/ValkeySetup.cmake @@ -93,6 +93,9 @@ macro (valkey_build_and_install_bin target sources ld_flags libs link_name) target_link_libraries(${target} execinfo) endif () + # Enable all warnings + fail on warning + target_compile_options(${target} PRIVATE -Werror -Wall) + # Install cli tool and create a redis symbolic link valkey_install_bin(${target}) valkey_create_symlink(${target} ${link_name}) diff --git a/src/valkey-cli.c b/src/valkey-cli.c index 4416e09431..0a4f1affa2 100644 --- a/src/valkey-cli.c +++ b/src/valkey-cli.c @@ -2040,6 +2040,8 @@ static sds jsonStringOutput(sds out, const char *p, int len, int mode) { } else { assert(0); } + /* Silence compiler warning */ + return NULL; } static sds cliFormatReplyJson(sds out, redisReply *r, int mode) { From b95ad54c76cce569b8da3803a256f44e47a3cb88 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 4 Jan 2025 10:47:32 +0800 Subject: [PATCH 053/101] Explicitly check C_ERR condition to improve readability in clusterSaveConfig (#1505) It's not obvious to see at first glance, so modify it to check C_ERR explicitly. Signed-off-by: Binbin --- src/cluster_legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index a1b1d0e986..807488b57d 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -868,7 +868,7 @@ int clusterSaveConfig(int do_fsync) { cleanup: if (fd != -1) close(fd); - if (retval) unlink(tmpfilename); + if (retval == C_ERR) unlink(tmpfilename); sdsfree(tmpfilename); sdsfree(ci); return retval; From 3072443b4ea05cb3e522654f898d144849b9a657 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 6 Jan 2025 09:26:17 +0800 Subject: [PATCH 054/101] Check whether to switch to fail when setting the node to pfail in cron (#1061) This may speed up the transition to the fail state a bit. Previously we would only check when we received a pfail/fail report from others in gossip. If myself is the last vote, we can directly switch to fail here without waiting for the next gossip packet. Signed-off-by: Binbin --- src/cluster_legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 807488b57d..b59c30126a 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -5217,7 +5217,7 @@ void clusterCron(void) { if (!(node->flags & (CLUSTER_NODE_PFAIL | CLUSTER_NODE_FAIL))) { node->flags |= CLUSTER_NODE_PFAIL; update_state = 1; - if (server.cluster->size == 1 && clusterNodeIsVotingPrimary(myself)) { + if (clusterNodeIsVotingPrimary(myself)) { markNodeAsFailingIfNeeded(node); } else { serverLog(LL_NOTICE, "NODE %.40s (%s) possibly failing.", node->name, node->human_nodename); From bbd22eacfe333caba305e9d755def340461114df Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Mon, 6 Jan 2025 14:02:16 -0800 Subject: [PATCH 055/101] Fix Read/Write key pattern selector (CVE-2024-51741) (#1514) The explanation on the original commit was wrong. Key-based access must have a `~` in order to correctly configure which key prefixes the selector applies to (e.g. `%R~read*`, as opposed to a bare `%RW`).
If this is missing, a server assert will be triggered later. Signed-off-by: Madelyn Olson Co-authored-by: YaacovHazan --- src/acl.c | 11 ++++++++--- tests/unit/acl-v2.tcl | 23 ++++++++++++++++++++++- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/acl.c b/src/acl.c index 725419dcf2..0928c43914 100644 --- a/src/acl.c +++ b/src/acl.c @@ -1078,19 +1078,24 @@ int ACLSetSelector(aclSelector *selector, const char *op, size_t oplen) { int flags = 0; size_t offset = 1; if (op[0] == '%') { + int perm_ok = 1; for (; offset < oplen; offset++) { if (toupper(op[offset]) == 'R' && !(flags & ACL_READ_PERMISSION)) { flags |= ACL_READ_PERMISSION; } else if (toupper(op[offset]) == 'W' && !(flags & ACL_WRITE_PERMISSION)) { flags |= ACL_WRITE_PERMISSION; - } else if (op[offset] == '~' && flags) { + } else if (op[offset] == '~') { offset++; break; } else { - errno = EINVAL; - return C_ERR; + perm_ok = 0; + break; } } + if (!flags || !perm_ok) { + errno = EINVAL; + return C_ERR; + } } else { flags = ACL_ALL_PERMISSION; } diff --git a/tests/unit/acl-v2.tcl b/tests/unit/acl-v2.tcl index e8229d1b36..b33b53eabc 100644 --- a/tests/unit/acl-v2.tcl +++ b/tests/unit/acl-v2.tcl @@ -116,11 +116,32 @@ start_server {tags {"acl external:skip"}} { assert_match "*NOPERM*key*" $err } - test {Validate read and write permissions format} { + test {Validate read and write permissions format - empty permission} { catch {r ACL SETUSER key-permission-RW %~} err set err } {ERR Error in ACL SETUSER modifier '%~': Syntax error} + test {Validate read and write permissions format - empty selector} { + catch {r ACL SETUSER key-permission-RW %} err + set err + } {ERR Error in ACL SETUSER modifier '%': Syntax error} + + test {Validate read and write permissions format - empty pattern} { + # Empty pattern results with R/W access to no key + r ACL SETUSER key-permission-RW on nopass %RW~ +@all + $r2 auth key-permission-RW password + catch {$r2 SET x 5} err + set err + } {NOPERM No permissions to access a key} + + test {Validate read and write permissions format - no pattern} { + # No pattern results with R/W access to no key (currently we accept this syntax error) + r ACL SETUSER key-permission-RW on nopass %RW +@all + $r2 auth key-permission-RW password + catch {$r2 SET x 5} err + set err + } {NOPERM No permissions to access a key} + test {Test separate read and write permissions on different selectors are not additive} { r ACL SETUSER key-permission-RW-selector on nopass "(%R~read* +@all)" "(%W~write* +@all)" $r2 auth key-permission-RW-selector password From 38910e2ec17c3c3b38dd7d946dd6e6bcbc5cde94 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Mon, 6 Jan 2025 14:02:22 -0800 Subject: [PATCH 056/101] Fix LUA garbage collector (CVE-2024-46981) (#1513) Reset GC state before closing the Lua VM to prevent user data from being wrongly freed while it might still be used in destructor callbacks. Created and published by Redis in their OSS branch.
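The ordering this patch enforces, collect first and close second, can be shown with a minimal self-contained Lua C API program (plain stock Lua assumed here; the server embeds Lua with extra teardown around lua_close, which is why finalizers must not be left to run during it):

#include <lua.h>
#include <lauxlib.h>
#include <lualib.h>

int main(void) {
    lua_State *L = luaL_newstate();
    luaL_openlibs(L);
    /* ... run scripts, create userdata with __gc metamethods ... */

    /* Force a full collection while everything the __gc callbacks may
     * reference is still valid, then tear the VM down. */
    lua_gc(L, LUA_GCCOLLECT, 0);
    lua_close(L);
    return 0;
}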
Signed-off-by: Madelyn Olson Co-authored-by: YaacovHazan --- src/eval.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/eval.c b/src/eval.c index e9fac531f5..9aa185d77b 100644 --- a/src/eval.c +++ b/src/eval.c @@ -285,6 +285,7 @@ void scriptingInit(int setup) { void freeLuaScriptsSync(dict *lua_scripts, list *lua_scripts_lru_list, lua_State *lua) { dictRelease(lua_scripts); listRelease(lua_scripts_lru_list); + lua_gc(lctx.lua, LUA_GCCOLLECT, 0); lua_close(lua); #if !defined(USE_LIBC) From 990782e3f5ba85eb4aabb1d49ea8a757bd909b09 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Mon, 6 Jan 2025 15:46:55 -0800 Subject: [PATCH 057/101] Add tests for acl selectors with no permissions or patterns (#1515) Signed-off-by: Madelyn Olson --- tests/unit/acl-v2.tcl | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/tests/unit/acl-v2.tcl b/tests/unit/acl-v2.tcl index b33b53eabc..bcaab9e817 100644 --- a/tests/unit/acl-v2.tcl +++ b/tests/unit/acl-v2.tcl @@ -116,31 +116,26 @@ start_server {tags {"acl external:skip"}} { assert_match "*NOPERM*key*" $err } - test {Validate read and write permissions format - empty permission} { - catch {r ACL SETUSER key-permission-RW %~} err - set err - } {ERR Error in ACL SETUSER modifier '%~': Syntax error} - - test {Validate read and write permissions format - empty selector} { - catch {r ACL SETUSER key-permission-RW %} err - set err - } {ERR Error in ACL SETUSER modifier '%': Syntax error} - - test {Validate read and write permissions format - empty pattern} { - # Empty pattern results with R/W access to no key - r ACL SETUSER key-permission-RW on nopass %RW~ +@all - $r2 auth key-permission-RW password - catch {$r2 SET x 5} err - set err - } {NOPERM No permissions to access a key} + test {Validate read and write permissions format} { + # Regression tests for CVE-2024-51741 + assert_error "ERR Error in ACL SETUSER modifier '%~': Syntax error" {r ACL SETUSER invalid %~} + assert_error "ERR Error in ACL SETUSER modifier '%': Syntax error" {r ACL SETUSER invalid %} + } - test {Validate read and write permissions format - no pattern} { - # No pattern results with R/W access to no key (currently we accept this syntax error) - r ACL SETUSER key-permission-RW on nopass %RW +@all - $r2 auth key-permission-RW password - catch {$r2 SET x 5} err - set err - } {NOPERM No permissions to access a key} + test {Validate key permissions format - empty and omitted pattern} { + # Empty pattern results with access to only the empty key + r ACL SETUSER key-permission-no-key on nopass %RW~ +@all + assert_equal "User key-permission-no-key has no permissions to access the 'x' key" [r ACL DRYRUN key-permission-no-key GET x] + assert_equal "OK" [r ACL DRYRUN key-permission-no-key GET ""] + + # This is incorrect syntax, it should have `~`, but we'll allow it for compatibility since it does something + r ACL SETUSER key-permission-omit on nopass %RW +@all + assert_equal "User key-permission-omit has no permissions to access the 'x' key" [r ACL DRYRUN key-permission-omit GET x] + assert_equal "OK" [r ACL DRYRUN key-permission-omit GET ""] + + # Assert these two are equivalent + assert_equal [r ACL GETUSER key-permission-omit] [r ACL GETUSER key-permission-no-key] + } test {Test separate read and write permissions on different selectors are not additive} { r ACL SETUSER key-permission-RW-selector on nopass "(%R~read* +@all)" "(%W~write* +@all)" From 794567fe656d7a8e43d279669e3d3f196d1d1239 Mon Sep 17 00:00:00 2001 From: Rueian Date: 
Tue, 7 Jan 2025 13:54:55 -0800 Subject: [PATCH 058/101] Add `availability_zone` to the HELLO response (#1487) It's inconvenient for client implementations to extract the `availability_zone` information from the `INFO` response. The `INFO` response contains a lot of information that a client implementation typically doesn't need. This PR adds the availability zone to the `HELLO` response. Clients usually already use the `HELLO` command for protocol negotiation and also get the server `version` and `role` from its response. To keep the `HELLO` response small, the field is only added if availability zone is configured. --------- Signed-off-by: Rueian --- src/networking.c | 7 ++++++- tests/unit/protocol.tcl | 34 ++++++++++++++++++++++++++++++++++ tests/unit/tracking.tcl | 17 ----------------- 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/src/networking.c b/src/networking.c index 08e9a56313..86f87deb8b 100644 --- a/src/networking.c +++ b/src/networking.c @@ -4206,7 +4206,7 @@ void helloCommand(client *c) { /* Let's switch to the specified RESP mode. */ if (ver) c->resp = ver; - addReplyMapLen(c, 6 + !server.sentinel_mode); + addReplyMapLen(c, 6 + !server.sentinel_mode + (sdslen(server.availability_zone) != 0)); addReplyBulkCString(c, "server"); addReplyBulkCString(c, server.extended_redis_compat ? "redis" : SERVER_NAME); @@ -4235,6 +4235,11 @@ void helloCommand(client *c) { addReplyBulkCString(c, "modules"); addReplyLoadedModules(c); + + if (sdslen(server.availability_zone) != 0) { + addReplyBulkCString(c, "availability_zone"); + addReplyBulkCBuffer(c, server.availability_zone, sdslen(server.availability_zone)); + } } /* This callback is bound to POST and "Host:" command names. Those are not diff --git a/tests/unit/protocol.tcl b/tests/unit/protocol.tcl index f3a2b8e1a8..f0e64368cc 100644 --- a/tests/unit/protocol.tcl +++ b/tests/unit/protocol.tcl @@ -232,6 +232,40 @@ start_server {tags {"protocol network"}} { } +start_server {tags {"protocol hello"}} { + test {HELLO without protover} { + set reply [r HELLO 3] + assert_equal [dict get $reply proto] 3 + + set reply [r HELLO] + assert_equal [dict get $reply proto] 3 + + set reply [r HELLO 2] + assert_equal [dict get $reply proto] 2 + + set reply [r HELLO] + assert_equal [dict get $reply proto] 2 + } + + test {HELLO and availability-zone} { + r CONFIG SET availability-zone myzone + + set reply [r HELLO 3] + assert_equal [dict get $reply availability_zone] myzone + + set reply [r HELLO 2] + assert_equal [dict get $reply availability_zone] myzone + + r CONFIG SET availability-zone "" + + set reply [r HELLO 3] + assert_equal [dict exists $reply availability_zone] 0 + + set reply [r HELLO 2] + assert_equal [dict exists $reply availability_zone] 0 + } +} + start_server {tags {"regression"}} { test "Regression for a crash with blocking ops and pipelining" { set rd [valkey_deferring_client] diff --git a/tests/unit/tracking.tcl b/tests/unit/tracking.tcl index 313293dcb7..9fdc4b79cd 100644 --- a/tests/unit/tracking.tcl +++ b/tests/unit/tracking.tcl @@ -154,23 +154,6 @@ start_server {tags {"tracking network logreqres:skip"}} { assert_equal [dict get $reply proto] 3 } - test {HELLO without protover} { - set reply [r HELLO 3] - assert_equal [dict get $reply proto] 3 - - set reply [r HELLO] - assert_equal [dict get $reply proto] 3 - - set reply [r HELLO 2] - assert_equal [dict get $reply proto] 2 - - set reply [r HELLO] - assert_equal [dict get $reply proto] 2 - - # restore RESP3 for next test - r HELLO 3 - } - test {RESP3 based basic 
invalidation} { r CLIENT TRACKING off r CLIENT TRACKING on From 50487cca7956f495d601c132ec901e65456a2833 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Tue, 7 Jan 2025 15:43:46 -0800 Subject: [PATCH 059/101] Actually run code coverage on ubuntu 22 (#1522) This commit, https://github.com/valkey-io/valkey/pull/1504, moved the wrong worker to ubuntu 22. We wanted to move codecov and not coverity. Signed-off-by: Madelyn Olson --- .github/workflows/codecov.yml | 2 +- .github/workflows/coverity.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 951b5c2862..cd1f1b20a7 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -10,7 +10,7 @@ concurrency: jobs: code-coverage: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout repository diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index acf5686c13..2561e4ceb5 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -17,7 +17,7 @@ permissions: jobs: coverity: if: github.repository == 'valkey-io/valkey' - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Download and extract the Coverity Build Tool From 1987c3647d6a55915a833fe8c0adf76bdaa1cdda Mon Sep 17 00:00:00 2001 From: Rueian Date: Tue, 7 Jan 2025 18:04:58 -0800 Subject: [PATCH 060/101] Add `availability_zone` to the HELLO command history (#1524) This PR is a followup for #1487. Signed-off-by: Rueian Co-authored-by: Binbin --- src/commands.def | 3 ++- src/commands/hello.json | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/commands.def b/src/commands.def index f03e44db9f..c5d766e3f8 100644 --- a/src/commands.def +++ b/src/commands.def @@ -1719,6 +1719,7 @@ struct COMMAND_ARG ECHO_Args[] = { #ifndef SKIP_CMD_HISTORY_TABLE /* HELLO history */ commandHistory HELLO_History[] = { +{"8.1.0","A new `availability_zone` field is added to the response if the `availability-zone` config is set."}, {"6.2.0","`protover` made optional; when called without arguments the command reports the current connection's context."}, }; #endif @@ -10911,7 +10912,7 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("auth","Authenticates the connection.","O(N) where N is the number of passwords defined for the user","1.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,AUTH_History,1,AUTH_Tips,0,authCommand,-2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH|CMD_SENTINEL|CMD_ALLOW_BUSY,ACL_CATEGORY_CONNECTION,AUTH_Keyspecs,0,NULL,2),.args=AUTH_Args}, {MAKE_CMD("client","A container for client connection commands.","Depends on subcommand.","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_History,0,CLIENT_Tips,0,NULL,-2,CMD_SENTINEL,0,CLIENT_Keyspecs,0,NULL,0),.subcommands=CLIENT_Subcommands}, {MAKE_CMD("echo","Returns the given string.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,ECHO_History,0,ECHO_Tips,0,echoCommand,2,CMD_LOADING|CMD_STALE|CMD_FAST,ACL_CATEGORY_CONNECTION,ECHO_Keyspecs,0,NULL,1),.args=ECHO_Args}, -{MAKE_CMD("hello","Handshakes with the server.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,HELLO_History,1,HELLO_Tips,0,helloCommand,-1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH|CMD_SENTINEL|CMD_ALLOW_BUSY,ACL_CATEGORY_CONNECTION,HELLO_Keyspecs,0,NULL,1),.args=HELLO_Args}, 
+{MAKE_CMD("hello","Handshakes with the server.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,HELLO_History,2,HELLO_Tips,0,helloCommand,-1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH|CMD_SENTINEL|CMD_ALLOW_BUSY,ACL_CATEGORY_CONNECTION,HELLO_Keyspecs,0,NULL,1),.args=HELLO_Args}, {MAKE_CMD("ping","Returns the server's liveliness response.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,PING_History,0,PING_Tips,2,pingCommand,-1,CMD_FAST|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,PING_Keyspecs,0,NULL,1),.args=PING_Args}, {MAKE_CMD("quit","Closes the connection.","O(1)","1.0.0",CMD_DOC_DEPRECATED,"just closing the connection","7.2.0","connection",COMMAND_GROUP_CONNECTION,QUIT_History,0,QUIT_Tips,0,quitCommand,-1,CMD_ALLOW_BUSY|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH,ACL_CATEGORY_CONNECTION,QUIT_Keyspecs,0,NULL,0)}, {MAKE_CMD("reset","Resets the connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,RESET_History,0,RESET_Tips,0,resetCommand,1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH|CMD_ALLOW_BUSY,ACL_CATEGORY_CONNECTION,RESET_Keyspecs,0,NULL,0)}, diff --git a/src/commands/hello.json b/src/commands/hello.json index f3fcc5a13c..15fd81c655 100644 --- a/src/commands/hello.json +++ b/src/commands/hello.json @@ -7,6 +7,10 @@ "arity": -1, "function": "helloCommand", "history": [ + [ + "8.1.0", + "A new `availability_zone` field is added to the response if the `availability-zone` config is set." + ], [ "6.2.0", "`protover` made optional; when called without arguments the command reports the current connection's context." From ac096a19efcfa038914fe3d6e2ee1dc542a8af14 Mon Sep 17 00:00:00 2001 From: uriyage <78144248+uriyage@users.noreply.github.com> Date: Wed, 8 Jan 2025 10:28:54 +0200 Subject: [PATCH 061/101] client struct: lazy init components and optimize struct layout (#1405) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Refactor client structure to use modular data components ## Current State The client structure allocates memory for replication / pubsub / multi-keys / module / blocked data for every client, despite these features being used by only a small subset of clients. In addition the current field layout in the client struct is suboptimal, with poor alignment and unnecessary padding between fields, leading to a larger than necessary memory footprint of 896 bytes per client. Furthermore, fields that are frequently accessed together during operations are scattered throughout the struct, resulting in poor cache locality. ## This PR's Change 1. Lazy Initialization - **Components are only allocated when first used:** - PubSubData: Created on first SUBSCRIBE/PUBLISH operation - ReplicationData: Initialized only for replica connections - ModuleData: Allocated when module interaction begins - BlockingState: Created when first blocking command is issued - MultiState: Initialized on MULTI command 2. Memory Layout Optimization: - Grouped related fields for better locality - Moved rarely accessed fields (e.g., client->name) to struct end - Optimized field alignment to eliminate padding 3. Additional changes: - Moved watched_keys to be static allocated in the `mstate` struct - Relocated replication init logic to replication.c ### Key Benefits - **Efficient Memory Usage:** - 45% smaller base client structure - Basic clients now use 528 bytes (down from 896). 
- Better memory locality for related operations - Performance improvement in high throughput scenarios. No performance regressions in other cases. ### Performance Impact Tested with 650 clients and 512 bytes values. #### Single Thread Performance | Operation | Dataset | New (ops/sec) | Old (ops/sec) | Change % | |------------|---------|---------------|---------------|-----------| | SET | 1 key | 261,799 | 258,261 | +1.37% | | SET | 3M keys | 209,134 | ~209,000 | ~0% | | GET | 1 key | 281,564 | 277,965 | +1.29% | | GET | 3M keys | 231,158 | 228,410 | +1.20% | #### 8 IO Threads Performance | Operation | Dataset | New (ops/sec) | Old (ops/sec) | Change % | |------------|---------|---------------|---------------|-----------| | SET | 1 key | 1,331,578 | 1,331,626 | -0.00% | | SET | 3M keys | 1,254,441 | 1,152,645 | +8.83% | | GET | 1 key | 1,293,149 | 1,289,503 | +0.28% | | GET | 3M keys | 1,152,898 | 1,101,791 | +4.64% | #### Pipeline Performance (3M keys) | Operation | Pipeline Size | New (ops/sec) | Old (ops/sec) | Change % | |-----------|--------------|---------------|---------------|-----------| | SET | 10 | 548,964 | 538,498 | +1.94% | | SET | 20 | 606,148 | 594,872 | +1.89% | | SET | 30 | 631,122 | 616,606 | +2.35% | | GET | 10 | 628,482 | 624,166 | +0.69% | | GET | 20 | 687,371 | 681,659 | +0.84% | | GET | 30 | 725,855 | 721,102 | +0.66% | ### Observations: 1. Single-threaded operations show consistent improvements (1-1.4%) 2. Multi-threaded performance shows significant gains for large datasets: - SET with 3M keys: +8.83% improvement - GET with 3M keys: +4.64% improvement 3. Pipeline operations show consistent improvements: - SET operations: +1.89% to +2.35% - GET operations: +0.66% to +0.84% 4. No performance regressions observed in any test scenario Related issue:https://github.com/valkey-io/valkey/issues/761 --------- Signed-off-by: Uri Yagelnik Signed-off-by: uriyage <78144248+uriyage@users.noreply.github.com> Co-authored-by: Viktor Söderqvist --- src/acl.c | 6 +- src/aof.c | 3 +- src/blocked.c | 127 ++++++++------ src/cluster.c | 14 +- src/cluster_legacy.c | 2 +- src/module.c | 100 ++++++----- src/module.h | 1 + src/multi.c | 113 ++++++------ src/networking.c | 238 ++++++++----------------- src/pubsub.c | 66 +++++-- src/rdb.c | 8 +- src/replication.c | 405 +++++++++++++++++++++++++------------------ src/script.c | 1 + src/server.c | 65 +++---- src/server.h | 258 ++++++++++++++------------- src/timeout.c | 8 +- src/tracking.c | 23 +-- 17 files changed, 761 insertions(+), 677 deletions(-) diff --git a/src/acl.c b/src/acl.c index 0928c43914..184fa54116 100644 --- a/src/acl.c +++ b/src/acl.c @@ -1960,7 +1960,7 @@ int ACLShouldKillPubsubClient(client *c, list *upcoming) { if (getClientType(c) == CLIENT_TYPE_PUBSUB) { /* Check for pattern violations. */ - dictIterator *di = dictGetIterator(c->pubsub_patterns); + dictIterator *di = dictGetIterator(c->pubsub_data->pubsub_patterns); dictEntry *de; while (!kill && ((de = dictNext(di)) != NULL)) { o = dictGetKey(de); @@ -1972,7 +1972,7 @@ int ACLShouldKillPubsubClient(client *c, list *upcoming) { /* Check for channel violations. */ if (!kill) { /* Check for global channels violation. */ - di = dictGetIterator(c->pubsub_channels); + di = dictGetIterator(c->pubsub_data->pubsub_channels); while (!kill && ((de = dictNext(di)) != NULL)) { o = dictGetKey(de); @@ -1983,7 +1983,7 @@ int ACLShouldKillPubsubClient(client *c, list *upcoming) { } if (!kill) { /* Check for shard channels violation. 
*/ - di = dictGetIterator(c->pubsubshard_channels); + di = dictGetIterator(c->pubsub_data->pubsubshard_channels); while (!kill && ((de = dictNext(di)) != NULL)) { o = dictGetKey(de); int res = ACLCheckChannelAgainstList(upcoming, o->ptr, sdslen(o->ptr), 0); diff --git a/src/aof.c b/src/aof.c index 8ac44f64c2..3629fa1acf 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1382,7 +1382,8 @@ struct client *createAOFClient(void) { /* We set the fake client as a replica waiting for the synchronization * so that the server will not try to send replies to this client. */ - c->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; + initClientReplicationData(c); + c->repl_data->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; return c; } diff --git a/src/blocked.c b/src/blocked.c index 39050932d9..d2d6a5d314 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -75,16 +75,25 @@ static void moduleUnblockClientOnKey(client *c, robj *key); static void releaseBlockedEntry(client *c, dictEntry *de, int remove_key); void initClientBlockingState(client *c) { - c->bstate.btype = BLOCKED_NONE; - c->bstate.timeout = 0; - c->bstate.unblock_on_nokey = 0; - c->bstate.keys = dictCreate(&objectKeyHeapPointerValueDictType); - c->bstate.numreplicas = 0; - c->bstate.numlocal = 0; - c->bstate.reploffset = 0; - c->bstate.generic_blocked_list_node = NULL; - c->bstate.module_blocked_handle = NULL; - c->bstate.async_rm_call_handle = NULL; + if (c->bstate) return; + c->bstate = zmalloc(sizeof(blockingState)); + c->bstate->btype = BLOCKED_NONE; + c->bstate->timeout = 0; + c->bstate->unblock_on_nokey = 0; + c->bstate->keys = dictCreate(&objectKeyHeapPointerValueDictType); + c->bstate->numreplicas = 0; + c->bstate->numlocal = 0; + c->bstate->reploffset = 0; + c->bstate->generic_blocked_list_node = NULL; + c->bstate->module_blocked_handle = NULL; + c->bstate->async_rm_call_handle = NULL; +} + +void freeClientBlockingState(client *c) { + if (!c->bstate) return; + dictRelease(c->bstate->keys); + zfree(c->bstate); + c->bstate = NULL; } /* Block a client for the specific operation type. Once the CLIENT_BLOCKED @@ -94,8 +103,10 @@ void blockClient(client *c, int btype) { /* Primary client should never be blocked unless pause or module */ serverAssert(!(c->flag.primary && btype != BLOCKED_MODULE && btype != BLOCKED_POSTPONE)); + initClientBlockingState(c); + c->flag.blocked = 1; - c->bstate.btype = btype; + c->bstate->btype = btype; if (!c->flag.module) server.blocked_clients++; /* We count blocked client stats on regular clients and not on module clients */ server.blocked_clients_by_type[btype]++; @@ -199,18 +210,18 @@ void queueClientForReprocessing(client *c) { /* Unblock a client calling the right function depending on the kind * of operation the client is blocking for. 
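The lazy-allocation idiom that initClientBlockingState() and freeClientBlockingState() introduce above can be reduced to a standalone sketch (hypothetical trimmed-down structs, not the server's definitions):

#include <stdlib.h>

typedef struct blockingState {
    int btype;
    long long timeout;
} blockingState;

typedef struct client {
    blockingState *bstate; /* NULL until the client first blocks */
} client;

static void initClientBlockingState(client *c) {
    if (c->bstate) return; /* already allocated: idempotent no-op */
    c->bstate = calloc(1, sizeof(*c->bstate));
}

static void freeClientBlockingState(client *c) {
    if (!c->bstate) return;
    free(c->bstate);
    c->bstate = NULL;
}

Every code path that can block calls the init function first, so clients that never block never pay for the allocation; the patch applies the same pattern to the pubsub, replication, module and multi components.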
*/ void unblockClient(client *c, int queue_for_reprocessing) { - if (c->bstate.btype == BLOCKED_LIST || c->bstate.btype == BLOCKED_ZSET || c->bstate.btype == BLOCKED_STREAM) { + if (c->bstate->btype == BLOCKED_LIST || c->bstate->btype == BLOCKED_ZSET || c->bstate->btype == BLOCKED_STREAM) { unblockClientWaitingData(c); - } else if (c->bstate.btype == BLOCKED_WAIT) { + } else if (c->bstate->btype == BLOCKED_WAIT) { unblockClientWaitingReplicas(c); - } else if (c->bstate.btype == BLOCKED_MODULE) { + } else if (c->bstate->btype == BLOCKED_MODULE) { if (moduleClientIsBlockedOnKeys(c)) unblockClientWaitingData(c); unblockClientFromModule(c); - } else if (c->bstate.btype == BLOCKED_POSTPONE) { - serverAssert(c->bstate.postponed_list_node); - listDelNode(server.postponed_clients, c->bstate.postponed_list_node); - c->bstate.postponed_list_node = NULL; - } else if (c->bstate.btype == BLOCKED_SHUTDOWN) { + } else if (c->bstate->btype == BLOCKED_POSTPONE) { + serverAssert(c->bstate->postponed_list_node); + listDelNode(server.postponed_clients, c->bstate->postponed_list_node); + c->bstate->postponed_list_node = NULL; + } else if (c->bstate->btype == BLOCKED_SHUTDOWN) { /* No special cleanup. */ } else { serverPanic("Unknown btype in unblockClient()."); @@ -218,7 +229,7 @@ void unblockClient(client *c, int queue_for_reprocessing) { /* Reset the client for a new query, unless the client has pending command to process * or in case a shutdown operation was canceled and we are still in the processCommand sequence */ - if (!c->flag.pending_command && c->bstate.btype != BLOCKED_SHUTDOWN) { + if (!c->flag.pending_command && c->bstate->btype != BLOCKED_SHUTDOWN) { /* Clients that are not blocked on keys are not reprocessed so we must * call reqresAppendResponse here (for clients blocked on key, * unblockClientOnKey is called, which eventually calls processCommand, @@ -229,12 +240,12 @@ void unblockClient(client *c, int queue_for_reprocessing) { /* We count blocked client stats on regular clients and not on module clients */ if (!c->flag.module) server.blocked_clients--; - server.blocked_clients_by_type[c->bstate.btype]--; + server.blocked_clients_by_type[c->bstate->btype]--; /* Clear the flags, and put the client in the unblocked list so that * we'll process new commands in its query buffer ASAP. */ c->flag.blocked = 0; - c->bstate.btype = BLOCKED_NONE; - c->bstate.unblock_on_nokey = 0; + c->bstate->btype = BLOCKED_NONE; + c->bstate->unblock_on_nokey = 0; removeClientFromTimeoutTable(c); if (queue_for_reprocessing) queueClientForReprocessing(c); } @@ -243,22 +254,22 @@ void unblockClient(client *c, int queue_for_reprocessing) { * send it a reply of some kind. After this function is called, * unblockClient() will be called with the same client as argument. 
*/ void replyToBlockedClientTimedOut(client *c) { - if (c->bstate.btype == BLOCKED_LIST || c->bstate.btype == BLOCKED_ZSET || c->bstate.btype == BLOCKED_STREAM) { + if (c->bstate->btype == BLOCKED_LIST || c->bstate->btype == BLOCKED_ZSET || c->bstate->btype == BLOCKED_STREAM) { addReplyNullArray(c); updateStatsOnUnblock(c, 0, 0, 0); - } else if (c->bstate.btype == BLOCKED_WAIT) { + } else if (c->bstate->btype == BLOCKED_WAIT) { if (c->cmd->proc == waitCommand) { - addReplyLongLong(c, replicationCountAcksByOffset(c->bstate.reploffset)); + addReplyLongLong(c, replicationCountAcksByOffset(c->bstate->reploffset)); } else if (c->cmd->proc == waitaofCommand) { addReplyArrayLen(c, 2); - addReplyLongLong(c, server.fsynced_reploff >= c->bstate.reploffset); - addReplyLongLong(c, replicationCountAOFAcksByOffset(c->bstate.reploffset)); + addReplyLongLong(c, server.fsynced_reploff >= c->bstate->reploffset); + addReplyLongLong(c, replicationCountAOFAcksByOffset(c->bstate->reploffset)); } else if (c->cmd->proc == clusterCommand) { addReplyErrorObject(c, shared.noreplicaserr); } else { serverPanic("Unknown wait command %s in replyToBlockedClientTimedOut().", c->cmd->declared_name); } - } else if (c->bstate.btype == BLOCKED_MODULE) { + } else if (c->bstate->btype == BLOCKED_MODULE) { moduleBlockedClientTimedOut(c, 0); } else { serverPanic("Unknown btype in replyToBlockedClientTimedOut()."); @@ -274,7 +285,7 @@ void replyToClientsBlockedOnShutdown(void) { listRewind(server.clients, &li); while ((ln = listNext(&li))) { client *c = listNodeValue(ln); - if (c->flag.blocked && c->bstate.btype == BLOCKED_SHUTDOWN) { + if (c->flag.blocked && c->bstate->btype == BLOCKED_SHUTDOWN) { addReplyError(c, "Errors trying to SHUTDOWN. Check logs."); unblockClient(c, 1); } @@ -301,7 +312,7 @@ void disconnectAllBlockedClients(void) { * command processing will start from scratch, and the command will * be either executed or rejected. (unlike LIST blocked clients for * which the command is already in progress in a way. */ - if (c->bstate.btype == BLOCKED_POSTPONE) continue; + if (c->bstate->btype == BLOCKED_POSTPONE) continue; unblockClientOnError(c, "-UNBLOCKED force unblock from blocking operation, " "instance state changed (master -> replica?)"); @@ -386,15 +397,17 @@ void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeo list *l; int j; + initClientBlockingState(c); + if (!c->flag.reprocessing_command) { /* If the client is re-processing the command, we do not set the timeout * because we need to retain the client's original timeout. */ - c->bstate.timeout = timeout; + c->bstate->timeout = timeout; } for (j = 0; j < numkeys; j++) { /* If the key already exists in the dictionary ignore it. 
*/ - if (!(client_blocked_entry = dictAddRaw(c->bstate.keys, keys[j], NULL))) { + if (!(client_blocked_entry = dictAddRaw(c->bstate->keys, keys[j], NULL))) { continue; } incrRefCount(keys[j]); @@ -411,7 +424,7 @@ void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeo l = dictGetVal(db_blocked_existing_entry); } listAddNodeTail(l, c); - dictSetVal(c->bstate.keys, client_blocked_entry, listLast(l)); + dictSetVal(c->bstate->keys, client_blocked_entry, listLast(l)); /* We need to add the key to blocking_keys_unblock_on_nokey, if the client * wants to be awakened if key is deleted (like XREADGROUP) */ @@ -425,7 +438,7 @@ void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeo } } } - c->bstate.unblock_on_nokey = unblock_on_nokey; + c->bstate->unblock_on_nokey = unblock_on_nokey; /* Currently we assume key blocking will require reprocessing the command. * However in case of modules, they have a different way to handle the reprocessing * which does not require setting the pending command flag */ @@ -439,15 +452,15 @@ static void unblockClientWaitingData(client *c) { dictEntry *de; dictIterator *di; - if (dictSize(c->bstate.keys) == 0) return; + if (dictSize(c->bstate->keys) == 0) return; - di = dictGetIterator(c->bstate.keys); + di = dictGetIterator(c->bstate->keys); /* The client may wait for multiple keys, so unblock it for every key. */ while ((de = dictNext(di)) != NULL) { releaseBlockedEntry(c, de, 0); } dictReleaseIterator(di); - dictEmpty(c->bstate.keys, NULL); + dictEmpty(c->bstate->keys, NULL); } static blocking_type getBlockedTypeByType(int type) { @@ -546,7 +559,7 @@ static void releaseBlockedEntry(client *c, dictEntry *de, int remove_key) { if (listLength(l) == 0) { dictDelete(c->db->blocking_keys, key); dictDelete(c->db->blocking_keys_unblock_on_nokey, key); - } else if (c->bstate.unblock_on_nokey) { + } else if (c->bstate->unblock_on_nokey) { unblock_on_nokey_entry = dictFind(c->db->blocking_keys_unblock_on_nokey, key); /* it is not possible to have a client blocked on nokey with no matching entry */ serverAssertWithInfo(c, key, unblock_on_nokey_entry != NULL); @@ -555,7 +568,7 @@ static void releaseBlockedEntry(client *c, dictEntry *de, int remove_key) { dictDelete(c->db->blocking_keys_unblock_on_nokey, key); } } - if (remove_key) dictDelete(c->bstate.keys, key); + if (remove_key) dictDelete(c->bstate->keys, key); } void signalKeyAsReady(serverDb *db, robj *key, int type) { @@ -593,9 +606,9 @@ static void handleClientsBlockedOnKey(readyList *rl) { * module is trying to accomplish right now. * 3. In case of XREADGROUP call we will want to unblock on any change in object type * or in case the key was deleted, since the group is no longer valid. 
*/ - if ((o != NULL && (receiver->bstate.btype == getBlockedTypeByType(o->type))) || - (o != NULL && (receiver->bstate.btype == BLOCKED_MODULE)) || (receiver->bstate.unblock_on_nokey)) { - if (receiver->bstate.btype != BLOCKED_MODULE) + if ((o != NULL && (receiver->bstate->btype == getBlockedTypeByType(o->type))) || + (o != NULL && (receiver->bstate->btype == BLOCKED_MODULE)) || (receiver->bstate->unblock_on_nokey)) { + if (receiver->bstate->btype != BLOCKED_MODULE) unblockClientOnKey(receiver, rl->key); else moduleUnblockClientOnKey(receiver, rl->key); @@ -606,16 +619,17 @@ static void handleClientsBlockedOnKey(readyList *rl) { /* block a client for replica acknowledgement */ void blockClientForReplicaAck(client *c, mstime_t timeout, long long offset, long numreplicas, int numlocal) { - c->bstate.timeout = timeout; - c->bstate.reploffset = offset; - c->bstate.numreplicas = numreplicas; - c->bstate.numlocal = numlocal; + initClientBlockingState(c); + c->bstate->timeout = timeout; + c->bstate->reploffset = offset; + c->bstate->numreplicas = numreplicas; + c->bstate->numlocal = numlocal; listAddNodeHead(server.clients_waiting_acks, c); /* Note that we remember the linked list node where the client is stored, * this way removing the client in unblockClientWaitingReplicas() will not * require a linear scan, but just a constant time operation. */ - serverAssert(c->bstate.client_waiting_acks_list_node == NULL); - c->bstate.client_waiting_acks_list_node = listFirst(server.clients_waiting_acks); + serverAssert(c->bstate->client_waiting_acks_list_node == NULL); + c->bstate->client_waiting_acks_list_node = listFirst(server.clients_waiting_acks); blockClient(c, BLOCKED_WAIT); } @@ -623,11 +637,12 @@ void blockClientForReplicaAck(client *c, mstime_t timeout, long long offset, lon * requesting to avoid processing clients commands which will be processed later * when the it is ready to accept them. */ void blockPostponeClient(client *c) { - c->bstate.timeout = 0; + initClientBlockingState(c); + c->bstate->timeout = 0; blockClient(c, BLOCKED_POSTPONE); listAddNodeTail(server.postponed_clients, c); - serverAssert(c->bstate.postponed_list_node == NULL); - c->bstate.postponed_list_node = listLast(server.postponed_clients); + serverAssert(c->bstate->postponed_list_node == NULL); + c->bstate->postponed_list_node = listLast(server.postponed_clients); /* Mark this client to execute its command */ c->flag.pending_command = 1; } @@ -644,13 +659,13 @@ void blockClientShutdown(client *c) { static void unblockClientOnKey(client *c, robj *key) { dictEntry *de; - de = dictFind(c->bstate.keys, key); + de = dictFind(c->bstate->keys, key); releaseBlockedEntry(c, de, 1); /* Only in case of blocking API calls, we might be blocked on several keys. however we should force unblock the entire blocking keys */ - serverAssert(c->bstate.btype == BLOCKED_STREAM || c->bstate.btype == BLOCKED_LIST || - c->bstate.btype == BLOCKED_ZSET); + serverAssert(c->bstate->btype == BLOCKED_STREAM || c->bstate->btype == BLOCKED_LIST || + c->bstate->btype == BLOCKED_ZSET); /* We need to unblock the client before calling processCommandAndResetClient * because it checks the CLIENT_BLOCKED flag */ @@ -712,7 +727,7 @@ static void moduleUnblockClientOnKey(client *c, robj *key) { * command with timeout reply. */ void unblockClientOnTimeout(client *c) { /* The client has been unlocked (in the moduleUnblocked list), return ASAP. 
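The slot check performed just below goes through keyHashSlot(); a simplified sketch of that computation follows (CRC16 replaced by a stand-in hash, while the {hashtag} handling matches the documented cluster algorithm):

#include <stddef.h>

/* Stand-in for the real crc16(); any 16-bit hash fits the sketch. */
static unsigned int crc16_stub(const char *buf, size_t len) {
    unsigned int h = 0;
    while (len--) h = h * 31 + (unsigned char)*buf++;
    return h & 0xffff;
}

/* If the key contains a non-empty {tag}, only the tag is hashed, which
 * lets related keys be forced into the same slot. */
static int key_hash_slot(const char *key, size_t keylen) {
    size_t s, e;
    for (s = 0; s < keylen; s++)
        if (key[s] == '{') break;
    if (s == keylen) return crc16_stub(key, keylen) & 16383;
    for (e = s + 1; e < keylen; e++)
        if (key[e] == '}') break;
    if (e == keylen || e == s + 1) return crc16_stub(key, keylen) & 16383;
    return crc16_stub(key + s + 1, e - s - 1) & 16383;
}

Because a blocked client's keys were already validated to hash to one slot when the command was queued, inspecting the first key is sufficient here.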
*/ - if (c->bstate.btype == BLOCKED_MODULE && isModuleClientUnblocked(c)) return; + if (c->bstate->btype == BLOCKED_MODULE && isModuleClientUnblocked(c)) return; replyToBlockedClientTimedOut(c); if (c->flag.pending_command) c->flag.pending_command = 0; diff --git a/src/cluster.c b/src/cluster.c index 39d9161b9c..309279e0be 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1006,7 +1006,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int /* If CLIENT_MULTI flag is not set EXEC is just going to return an * error. */ if (!c->flag.multi) return myself; - ms = &c->mstate; + ms = c->mstate; } else { /* In order to have a single codepath create a fake Multi State * structure if the client is not in MULTI/EXEC state, this way @@ -1023,7 +1023,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int /* Only valid for sharded pubsub as regular pubsub can operate on any node and bypasses this layer. */ int pubsubshard_included = - (cmd_flags & CMD_PUBSUB) || (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_PUBSUB)); + (cmd_flags & CMD_PUBSUB) || (c->cmd->proc == execCommand && (c->mstate->cmd_flags & CMD_PUBSUB)); /* Check that all the keys are in the same hash slot, and obtain this * slot and the node associated. */ @@ -1176,7 +1176,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int * node is a replica and the request is about a hash slot our primary * is serving, we can reply without redirection. */ int is_write_command = - (cmd_flags & CMD_WRITE) || (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE)); + (cmd_flags & CMD_WRITE) || (c->cmd->proc == execCommand && (c->mstate->cmd_flags & CMD_WRITE)); if ((c->flag.readonly || pubsubshard_included) && !is_write_command && clusterNodeIsReplica(myself) && clusterNodeGetPrimary(myself) == n) { return myself; @@ -1233,14 +1233,14 @@ void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_co * returns 1. Otherwise 0 is returned and no operation is performed. */ int clusterRedirectBlockedClientIfNeeded(client *c) { clusterNode *myself = getMyClusterNode(); - if (c->flag.blocked && (c->bstate.btype == BLOCKED_LIST || c->bstate.btype == BLOCKED_ZSET || - c->bstate.btype == BLOCKED_STREAM || c->bstate.btype == BLOCKED_MODULE)) { + if (c->flag.blocked && (c->bstate->btype == BLOCKED_LIST || c->bstate->btype == BLOCKED_ZSET || + c->bstate->btype == BLOCKED_STREAM || c->bstate->btype == BLOCKED_MODULE)) { dictEntry *de; dictIterator *di; /* If the client is blocked on module, but not on a specific key, * don't unblock it. */ - if (c->bstate.btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c)) return 0; + if (c->bstate->btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c)) return 0; /* If the cluster is down, unblock the client with the right error. * If the cluster is configured to allow reads on cluster down, we @@ -1252,7 +1252,7 @@ int clusterRedirectBlockedClientIfNeeded(client *c) { } /* All keys must belong to the same slot, so check first key only. 
*/ - di = dictGetIterator(c->bstate.keys); + di = dictGetIterator(c->bstate->keys); if ((de = dictNext(di)) != NULL) { robj *key = dictGetKey(de); int slot = keyHashSlot((char *)key->ptr, sdslen(key->ptr)); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index b59c30126a..0777d6d8c6 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -6574,7 +6574,7 @@ void clusterCommandSetSlot(client *c) { * replication, it would also unlikely win the election. * * And 0x702ff is 7.2.255, we only support new versions in this case. */ - if (r->repl_state == REPLICA_STATE_ONLINE && r->replica_version > 0x702ff) { + if (r->repl_data->repl_state == REPLICA_STATE_ONLINE && r->repl_data->replica_version > 0x702ff) { num_eligible_replicas++; } } diff --git a/src/module.c b/src/module.c index dabea59d49..7388dc6a20 100644 --- a/src/module.c +++ b/src/module.c @@ -651,6 +651,19 @@ void *VM_PoolAlloc(ValkeyModuleCtx *ctx, size_t bytes) { * Helpers for modules API implementation * -------------------------------------------------------------------------- */ +static void initClientModuleData(client *c) { + if (c->module_data) return; + c->module_data = zcalloc(sizeof(ClientModuleData)); +} + +void freeClientModuleData(client *c) { + if (!c->module_data) return; + /* Free the ValkeyModuleBlockedClient held onto for reprocessing if not already freed. */ + zfree(c->module_data->module_blocked_client); + zfree(c->module_data); + c->module_data = NULL; +} + void moduleEnqueueLoadModule(sds path, sds *argv, int argc) { int i; struct moduleLoadQueueEntry *loadmod; @@ -721,11 +734,11 @@ void moduleReleaseTempClient(client *c) { c->flag.fake = 1; c->user = NULL; /* Root user */ c->cmd = c->lastcmd = c->realcmd = c->io_parsed_cmd = NULL; - if (c->bstate.async_rm_call_handle) { - ValkeyModuleAsyncRMCallPromise *promise = c->bstate.async_rm_call_handle; + if (c->bstate && c->bstate->async_rm_call_handle) { + ValkeyModuleAsyncRMCallPromise *promise = c->bstate->async_rm_call_handle; promise->c = NULL; /* Remove the client from the promise so it will no longer be possible to abort it. */ freeValkeyModuleAsyncRMCallPromise(promise); - c->bstate.async_rm_call_handle = NULL; + c->bstate->async_rm_call_handle = NULL; } moduleTempClients[moduleTempClientCount++] = c; } @@ -897,7 +910,7 @@ static CallReply *moduleParseReply(client *c, ValkeyModuleCtx *ctx) { void moduleCallCommandUnblockedHandler(client *c) { ValkeyModuleCtx ctx; - ValkeyModuleAsyncRMCallPromise *promise = c->bstate.async_rm_call_handle; + ValkeyModuleAsyncRMCallPromise *promise = c->bstate->async_rm_call_handle; serverAssert(promise); ValkeyModule *module = promise->module; if (!promise->on_unblocked) { @@ -6569,7 +6582,7 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const .ctx = (ctx->flags & VALKEYMODULE_CTX_AUTO_MEMORY) ? ctx : NULL, }; reply = callReplyCreatePromise(promise); - c->bstate.async_rm_call_handle = promise; + c->bstate->async_rm_call_handle = promise; if (!(call_flags & CMD_CALL_PROPAGATE_AOF)) { /* No need for AOF propagation, set the relevant flags of the client */ c->flag.module_prevent_aof_prop = 1; @@ -7679,7 +7692,7 @@ void VM_LatencyAddSample(const char *event, mstime_t latency) { /* Returns 1 if the client already in the moduleUnblocked list, 0 otherwise. 
*/ int isModuleClientUnblocked(client *c) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; return bc->unblocked == 1; } @@ -7697,7 +7710,7 @@ int isModuleClientUnblocked(client *c) { * The structure ValkeyModuleBlockedClient will be always deallocated when * running the list of clients blocked by a module that need to be unblocked. */ void unblockClientFromModule(client *c) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; /* Call the disconnection callback if any. Note that * bc->disconnect_callback is set to NULL if the client gets disconnected @@ -7765,9 +7778,10 @@ ValkeyModuleBlockedClient *moduleBlockClient(ValkeyModuleCtx *ctx, client *c = ctx->client; int islua = scriptIsRunning(); int ismulti = server.in_exec; + initClientBlockingState(c); - c->bstate.module_blocked_handle = zmalloc(sizeof(ValkeyModuleBlockedClient)); - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + c->bstate->module_blocked_handle = zmalloc(sizeof(ValkeyModuleBlockedClient)); + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; ctx->module->blocked_clients++; /* We need to handle the invalid operation of calling modules blocking @@ -7795,7 +7809,7 @@ ValkeyModuleBlockedClient *moduleBlockClient(ValkeyModuleCtx *ctx, if (timeout_ms) { mstime_t now = mstime(); if (timeout_ms > LLONG_MAX - now) { - c->bstate.module_blocked_handle = NULL; + c->bstate->module_blocked_handle = NULL; addReplyError(c, "timeout is out of range"); /* 'timeout_ms+now' would overflow */ return bc; } @@ -7803,20 +7817,20 @@ ValkeyModuleBlockedClient *moduleBlockClient(ValkeyModuleCtx *ctx, } if (islua || ismulti) { - c->bstate.module_blocked_handle = NULL; + c->bstate->module_blocked_handle = NULL; addReplyError(c, islua ? "Blocking module command called from Lua script" : "Blocking module command called from transaction"); } else if (ctx->flags & VALKEYMODULE_CTX_BLOCKED_REPLY) { - c->bstate.module_blocked_handle = NULL; + c->bstate->module_blocked_handle = NULL; addReplyError(c, "Blocking module command called from a Reply callback context"); } else if (!auth_reply_callback && clientHasModuleAuthInProgress(c)) { - c->bstate.module_blocked_handle = NULL; + c->bstate->module_blocked_handle = NULL; addReplyError(c, "Clients undergoing module based authentication can only be blocked on auth"); } else { if (keys) { blockForKeys(c, BLOCKED_MODULE, keys, numkeys, timeout, flags & VALKEYMODULE_BLOCK_UNBLOCK_DELETED); } else { - c->bstate.timeout = timeout; + c->bstate->timeout = timeout; blockClient(c, BLOCKED_MODULE); } } @@ -7912,7 +7926,7 @@ void moduleUnregisterAuthCBs(ValkeyModule *module) { /* Search for & attempt next module auth callback after skipping the ones already attempted. * Returns the result of the module auth callback. */ int attemptNextAuthCb(client *c, robj *username, robj *password, robj **err) { - int handle_next_callback = c->module_auth_ctx == NULL; + int handle_next_callback = (!c->module_data || c->module_data->module_auth_ctx == NULL); ValkeyModuleAuthCtx *cur_auth_ctx = NULL; listNode *ln; listIter li; @@ -7922,7 +7936,7 @@ int attemptNextAuthCb(client *c, robj *username, robj *password, robj **err) { cur_auth_ctx = listNodeValue(ln); /* Skip over the previously attempted auth contexts. 
*/ if (!handle_next_callback) { - handle_next_callback = cur_auth_ctx == c->module_auth_ctx; + handle_next_callback = cur_auth_ctx == c->module_data->module_auth_ctx; continue; } /* Remove the module auth complete flag before we attempt the next cb. */ @@ -7931,7 +7945,8 @@ int attemptNextAuthCb(client *c, robj *username, robj *password, robj **err) { moduleCreateContext(&ctx, cur_auth_ctx->module, VALKEYMODULE_CTX_NONE); ctx.client = c; *err = NULL; - c->module_auth_ctx = cur_auth_ctx; + initClientModuleData(c); + c->module_data->module_auth_ctx = cur_auth_ctx; result = cur_auth_ctx->auth_cb(&ctx, username, password, err); moduleFreeContext(&ctx); if (result == VALKEYMODULE_AUTH_HANDLED) break; @@ -7947,8 +7962,8 @@ int attemptNextAuthCb(client *c, robj *username, robj *password, robj **err) { * return the result of the reply callback. */ int attemptBlockedAuthReplyCallback(client *c, robj *username, robj *password, robj **err) { int result = VALKEYMODULE_AUTH_NOT_HANDLED; - if (!c->module_blocked_client) return result; - ValkeyModuleBlockedClient *bc = (ValkeyModuleBlockedClient *)c->module_blocked_client; + if (!c->module_data || !c->module_data->module_blocked_client) return result; + ValkeyModuleBlockedClient *bc = (ValkeyModuleBlockedClient *)c->module_data->module_blocked_client; bc->client = c; if (bc->auth_reply_cb) { ValkeyModuleCtx ctx; @@ -7961,7 +7976,7 @@ int attemptBlockedAuthReplyCallback(client *c, robj *username, robj *password, r moduleFreeContext(&ctx); } moduleInvokeFreePrivDataCallback(c, bc); - c->module_blocked_client = NULL; + c->module_data->module_blocked_client = NULL; c->lastcmd->microseconds += bc->background_duration; bc->module->blocked_clients--; zfree(bc); @@ -7989,7 +8004,7 @@ int checkModuleAuthentication(client *c, robj *username, robj *password, robj ** serverAssert(result == VALKEYMODULE_AUTH_HANDLED); return AUTH_BLOCKED; } - c->module_auth_ctx = NULL; + if (c->module_data) c->module_data->module_auth_ctx = NULL; if (result == VALKEYMODULE_AUTH_NOT_HANDLED) { c->flag.module_auth_has_result = 0; return AUTH_NOT_HANDLED; @@ -8011,7 +8026,7 @@ int checkModuleAuthentication(client *c, robj *username, robj *password, robj ** * This function returns 1 if client was served (and should be unblocked) */ int moduleTryServeClientBlockedOnKey(client *c, robj *key) { int served = 0; - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; /* Protect against re-processing: don't serve clients that are already * in the unblocking list for any reason (including VM_UnblockClient() @@ -8223,14 +8238,14 @@ int moduleUnblockClientByHandle(ValkeyModuleBlockedClient *bc, void *privdata) { /* This API is used by the server core to unblock a client that was blocked * by a module. */ void moduleUnblockClient(client *c) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; moduleUnblockClientByHandle(bc, NULL); } /* Return true if the client 'c' was blocked by a module using * VM_BlockClientOnKeys(). */ int moduleClientIsBlockedOnKeys(client *c) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; return bc->blocked_on_keys; } @@ -8340,7 +8355,7 @@ void moduleHandleBlockedClients(void) { /* Hold onto the blocked client if module auth is in progress. The reply callback is invoked * when the client is reprocessed. 
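 * As a sketch, the full hand-off looks like this (the explicit init shown
 * here is defensive; on this path module_data already exists because module
 * auth is in progress):
 *
 *   initClientModuleData(c);                      // lazy per-client alloc
 *   c->module_data->module_blocked_client = bc;   // park bc for reprocessing
 *   // ... client is reprocessed ...
 *   attemptBlockedAuthReplyCallback(c, user, pass, &err);
 *   // which consumes bc and resets module_blocked_client to NULL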
*/ if (c && clientHasModuleAuthInProgress(c)) { - c->module_blocked_client = bc; + c->module_data->module_blocked_client = bc; } else { /* Free privdata if any. */ moduleInvokeFreePrivDataCallback(c, bc); @@ -8402,9 +8417,9 @@ void moduleHandleBlockedClients(void) { * moduleBlockedClientTimedOut(). */ int moduleBlockedClientMayTimeout(client *c) { - if (c->bstate.btype != BLOCKED_MODULE) return 1; + if (c->bstate->btype != BLOCKED_MODULE) return 1; - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; return (bc && bc->timeout_callback != NULL); } @@ -8420,7 +8435,7 @@ int moduleBlockedClientMayTimeout(client *c) { * of the client synchronously. This ensures that we can reply to the client before * resetClient() is called. */ void moduleBlockedClientTimedOut(client *c, int from_module) { - ValkeyModuleBlockedClient *bc = c->bstate.module_blocked_handle; + ValkeyModuleBlockedClient *bc = c->bstate->module_blocked_handle; /* Protect against re-processing: don't serve clients that are already * in the unblocking list for any reason (including VM_UnblockClient() @@ -9559,16 +9574,16 @@ static void eventLoopHandleOneShotEvents(void) { * A client's user can be changed through the AUTH command, module * authentication, and when a client is freed. */ void moduleNotifyUserChanged(client *c) { - if (c->auth_callback) { - c->auth_callback(c->id, c->auth_callback_privdata); + if (!c->module_data || !c->module_data->auth_callback) return; - /* The callback will fire exactly once, even if the user remains - * the same. It is expected to completely clean up the state - * so all references are cleared here. */ - c->auth_callback = NULL; - c->auth_callback_privdata = NULL; - c->auth_module = NULL; - } + c->module_data->auth_callback(c->id, c->module_data->auth_callback_privdata); + + /* The callback will fire exactly once, even if the user remains + * the same. It is expected to completely clean up the state + * so all references are cleared here. 
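+ * (For reference, the registration side, sketched from the
+ * authenticateClientWithUser() hunk later in this patch:
+ *
+ *   initClientModuleData(ctx->client);
+ *   ctx->client->module_data->auth_callback = callback;
+ *   ctx->client->module_data->auth_callback_privdata = privdata;
+ *   ctx->client->module_data->auth_module = ctx->module;
+ *
+ * so this function is the single point that tears that state down.)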
*/ + c->module_data->auth_callback = NULL; + c->module_data->auth_callback_privdata = NULL; + c->module_data->auth_module = NULL; } void revokeClientAuthentication(client *c) { @@ -9599,9 +9614,9 @@ static void moduleFreeAuthenticatedClients(ValkeyModule *module) { listRewind(server.clients, &li); while ((ln = listNext(&li)) != NULL) { client *c = listNodeValue(ln); - if (!c->auth_module) continue; + if (!c->module_data || !c->module_data->auth_module) continue; - ValkeyModule *auth_module = (ValkeyModule *)c->auth_module; + ValkeyModule *auth_module = (ValkeyModule *)c->module_data->auth_module; if (auth_module == module) { revokeClientAuthentication(c); } @@ -9909,9 +9924,10 @@ static int authenticateClientWithUser(ValkeyModuleCtx *ctx, } if (callback) { - ctx->client->auth_callback = callback; - ctx->client->auth_callback_privdata = privdata; - ctx->client->auth_module = ctx->module; + initClientModuleData(ctx->client); + ctx->client->module_data->auth_callback = callback; + ctx->client->module_data->auth_callback_privdata = privdata; + ctx->client->module_data->auth_module = ctx->module; } if (client_id) { diff --git a/src/module.h b/src/module.h index 78d9341ca9..f4e4de67eb 100644 --- a/src/module.h +++ b/src/module.h @@ -228,5 +228,6 @@ int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime end void moduleDefragGlobals(void); void *moduleGetHandleByName(char *modulename); int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd); +void freeClientModuleData(client *c); #endif /* _MODULE_H_ */ diff --git a/src/multi.c b/src/multi.c index 9e1f019244..9e3aad9d3c 100644 --- a/src/multi.c +++ b/src/multi.c @@ -33,33 +33,42 @@ /* Client state initialization for MULTI/EXEC */ void initClientMultiState(client *c) { - c->mstate.commands = NULL; - c->mstate.count = 0; - c->mstate.cmd_flags = 0; - c->mstate.cmd_inv_flags = 0; - c->mstate.argv_len_sums = 0; - c->mstate.alloc_count = 0; + if (c->mstate) return; + c->mstate = zcalloc(sizeof(multiState)); } -/* Release all the resources associated with MULTI/EXEC state */ -void freeClientMultiState(client *c) { - int j; - - for (j = 0; j < c->mstate.count; j++) { +void freeClientMultiStateCmds(client *c) { + for (int j = 0; j < c->mstate->count; j++) { int i; - multiCmd *mc = c->mstate.commands + j; + multiCmd *mc = c->mstate->commands + j; for (i = 0; i < mc->argc; i++) decrRefCount(mc->argv[i]); zfree(mc->argv); } - zfree(c->mstate.commands); + + zfree(c->mstate->commands); + c->mstate->commands = NULL; +} + +/* Release all the resources associated with MULTI/EXEC state */ +void freeClientMultiState(client *c) { + if (!c->mstate) return; + + freeClientMultiStateCmds(c); + unwatchAllKeys(c); + zfree(c->mstate); + c->mstate = NULL; } void resetClientMultiState(client *c) { - if (c->mstate.commands) { - freeClientMultiState(c); - initClientMultiState(c); - } + if (!c->mstate || !c->mstate->commands) return; + + freeClientMultiStateCmds(c); + c->mstate->count = 0; + c->mstate->cmd_flags = 0; + c->mstate->cmd_inv_flags = 0; + c->mstate->argv_len_sums = 0; + c->mstate->alloc_count = 0; } /* Add a new command into the MULTI commands queue */ @@ -71,26 +80,27 @@ void queueMultiCommand(client *c, uint64_t cmd_flags) { * bother to read previous responses and didn't notice the multi was already * aborted. 
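 * With mstate now a lazily allocated pointer, the access pattern used below
 * and across this file is, as a sketch:
 *
 *   if (!c->mstate) initClientMultiState(c);   // zcalloc'd on first use
 *   c->mstate->count++;                        // then safe to dereference
 *
 * so clients that never queue a command pay no multiState allocation at all.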
*/ if (c->flag.dirty_cas || c->flag.dirty_exec) return; - if (c->mstate.count == 0) { + if (!c->mstate) initClientMultiState(c); + if (c->mstate->count == 0) { /* If a client is using multi/exec, assuming it is used to execute at least * two commands. Hence, creating by default size of 2. */ - c->mstate.commands = zmalloc(sizeof(multiCmd) * 2); - c->mstate.alloc_count = 2; + c->mstate->commands = zmalloc(sizeof(multiCmd) * 2); + c->mstate->alloc_count = 2; } - if (c->mstate.count == c->mstate.alloc_count) { - c->mstate.alloc_count = c->mstate.alloc_count < INT_MAX / 2 ? c->mstate.alloc_count * 2 : INT_MAX; - c->mstate.commands = zrealloc(c->mstate.commands, sizeof(multiCmd) * (c->mstate.alloc_count)); + if (c->mstate->count == c->mstate->alloc_count) { + c->mstate->alloc_count = c->mstate->alloc_count < INT_MAX / 2 ? c->mstate->alloc_count * 2 : INT_MAX; + c->mstate->commands = zrealloc(c->mstate->commands, sizeof(multiCmd) * (c->mstate->alloc_count)); } - mc = c->mstate.commands + c->mstate.count; + mc = c->mstate->commands + c->mstate->count; mc->cmd = c->cmd; mc->argc = c->argc; mc->argv = c->argv; mc->argv_len = c->argv_len; - c->mstate.count++; - c->mstate.cmd_flags |= cmd_flags; - c->mstate.cmd_inv_flags |= ~cmd_flags; - c->mstate.argv_len_sums += c->argv_len_sum + sizeof(robj *) * c->argc; + c->mstate->count++; + c->mstate->cmd_flags |= cmd_flags; + c->mstate->cmd_inv_flags |= ~cmd_flags; + c->mstate->argv_len_sums += c->argv_len_sum + sizeof(robj *) * c->argc; /* Reset the client's args since we copied them into the mstate and shouldn't * reference them from c anymore. */ @@ -118,6 +128,7 @@ void flagTransaction(client *c) { } void multiCommand(client *c) { + if (!c->mstate) initClientMultiState(c); c->flag.multi = 1; addReply(c, shared.ok); } @@ -195,12 +206,12 @@ void execCommand(client *c) { orig_argv_len = c->argv_len; orig_argc = c->argc; orig_cmd = c->cmd; - addReplyArrayLen(c, c->mstate.count); - for (j = 0; j < c->mstate.count; j++) { - c->argc = c->mstate.commands[j].argc; - c->argv = c->mstate.commands[j].argv; - c->argv_len = c->mstate.commands[j].argv_len; - c->cmd = c->realcmd = c->mstate.commands[j].cmd; + addReplyArrayLen(c, c->mstate->count); + for (j = 0; j < c->mstate->count; j++) { + c->argc = c->mstate->commands[j].argc; + c->argv = c->mstate->commands[j].argv; + c->argv_len = c->mstate->commands[j].argv_len; + c->cmd = c->realcmd = c->mstate->commands[j].cmd; /* ACL permissions are also checked at the time of execution in case * they were changed after the commands were queued. */ @@ -234,10 +245,10 @@ void execCommand(client *c) { } /* Commands may alter argc/argv, restore mstate. */ - c->mstate.commands[j].argc = c->argc; - c->mstate.commands[j].argv = c->argv; - c->mstate.commands[j].argv_len = c->argv_len; - c->mstate.commands[j].cmd = c->cmd; + c->mstate->commands[j].argc = c->argc; + c->mstate->commands[j].argv = c->argv; + c->mstate->commands[j].argv_len = c->argv_len; + c->mstate->commands[j].cmd = c->cmd; /* The original argv has already been processed for slowlog and monitor, * so we can safely free it before proceeding to the next command. 
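 * As a sketch, each loop iteration here installs the queued vector, runs
 * the command, and captures it back, since commands may rewrite their own
 * argv:
 *
 *   c->argv = c->mstate->commands[j].argv;    // install queued args
 *   c->cmd = c->realcmd = c->mstate->commands[j].cmd;
 *   call(c, CMD_CALL_FULL);
 *   c->mstate->commands[j].argv = c->argv;    // keep any rewrite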
*/ @@ -304,10 +315,10 @@ void watchForKey(client *c, robj *key) { listNode *ln; watchedKey *wk; - if (listLength(c->watched_keys) == 0) server.watching_clients++; + if (listLength(&c->mstate->watched_keys) == 0) server.watching_clients++; /* Check if we are already watching for this key */ - listRewind(c->watched_keys, &li); + listRewind(&c->mstate->watched_keys, &li); while ((ln = listNext(&li))) { wk = listNodeValue(ln); if (wk->db == c->db && equalStringObjects(key, wk->key)) return; /* Key already watched */ } @@ -326,7 +337,7 @@ void watchForKey(client *c, robj *key) { wk->db = c->db; wk->expired = keyIsExpired(c->db, key); incrRefCount(key); - listAddNodeTail(c->watched_keys, wk); + listAddNodeTail(&c->mstate->watched_keys, wk); watchedKeyLinkToClients(clients, wk); } @@ -336,8 +347,8 @@ void unwatchAllKeys(client *c) { listIter li; listNode *ln; - if (listLength(c->watched_keys) == 0) return; - listRewind(c->watched_keys, &li); + if (!c->mstate || listLength(&c->mstate->watched_keys) == 0) return; + listRewind(&c->mstate->watched_keys, &li); while ((ln = listNext(&li))) { list *clients; watchedKey *wk; @@ -350,7 +361,7 @@ void unwatchAllKeys(client *c) { /* Kill the entry at all if this was the only client */ if (listLength(clients) == 0) dictDelete(wk->db->watched_keys, wk->key); /* Remove this watched key from the client->watched list */ - listDelNode(c->watched_keys, ln); + listDelNode(&c->mstate->watched_keys, ln); decrRefCount(wk->key); zfree(wk); } @@ -363,8 +374,8 @@ int isWatchedKeyExpired(client *c) { listIter li; listNode *ln; watchedKey *wk; - if (listLength(c->watched_keys) == 0) return 0; - listRewind(c->watched_keys, &li); + if (!c->mstate || listLength(&c->mstate->watched_keys) == 0) return 0; + listRewind(&c->mstate->watched_keys, &li); while ((ln = listNext(&li))) { wk = listNodeValue(ln); if (wk->expired) continue; /* was expired when WATCH was called */ @@ -474,6 +485,9 @@ void watchCommand(client *c) { addReply(c, shared.ok); return; } + + if (!c->mstate) initClientMultiState(c); + for (j = 1; j < c->argc; j++) watchForKey(c, c->argv[j]); addReply(c, shared.ok); } @@ -485,11 +499,12 @@ void unwatchCommand(client *c) { } size_t multiStateMemOverhead(client *c) { - size_t mem = c->mstate.argv_len_sums; + if (!c->mstate) return 0; + size_t mem = c->mstate->argv_len_sums; /* Add watched keys overhead, Note: this doesn't take into account the watched keys themselves, because they aren't * managed per-client. */ - mem += listLength(c->watched_keys) * (sizeof(listNode) + sizeof(watchedKey)); + mem += listLength(&c->mstate->watched_keys) * (sizeof(listNode) + sizeof(watchedKey)); /* Reserved memory for queued multi commands.
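 * As a sketch, for a client with W watched keys and alloc_count queued
 * command slots the reported overhead is:
 *
 *   mem = argv_len_sums                                 // queued argv payload
 *       + W * (sizeof(listNode) + sizeof(watchedKey))   // watch bookkeeping
 *       + alloc_count * sizeof(multiCmd);               // command queue slots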
*/ - mem += c->mstate.alloc_count * sizeof(multiCmd); + mem += c->mstate->alloc_count * sizeof(multiCmd); return mem; } diff --git a/src/networking.c b/src/networking.c index 86f87deb8b..339cd304d4 100644 --- a/src/networking.c +++ b/src/networking.c @@ -119,7 +119,7 @@ int authRequired(client *c) { } static inline int isReplicaReadyForReplData(client *replica) { - return (replica->repl_state == REPLICA_STATE_ONLINE || replica->repl_state == REPLICA_STATE_BG_RDB_LOAD) && + return (replica->repl_data->repl_state == REPLICA_STATE_ONLINE || replica->repl_data->repl_state == REPLICA_STATE_BG_RDB_LOAD) && !(replica->flag.close_asap); } @@ -154,8 +154,6 @@ client *createClient(connection *conn) { c->bufpos = 0; c->buf_peak = c->buf_usable_size; c->buf_peak_last_reset_time = server.unixtime; - c->ref_repl_buf_node = NULL; - c->ref_block_pos = 0; c->qb_pos = 0; c->querybuf = NULL; c->querybuf_peak = 0; @@ -180,55 +178,31 @@ client *createClient(connection *conn) { c->ctime = c->last_interaction = server.unixtime; c->duration = 0; clientSetDefaultAuth(c); - c->repl_state = REPL_STATE_NONE; - c->repl_start_cmd_stream_on_ack = 0; - c->reploff = 0; - c->read_reploff = 0; - c->repl_applied = 0; - c->repl_ack_off = 0; - c->repl_ack_time = 0; - c->repl_aof_off = 0; - c->repl_last_partial_write = 0; - c->replica_listening_port = 0; - c->replica_addr = NULL; - c->replica_version = 0; - c->replica_capa = REPLICA_CAPA_NONE; - c->replica_req = REPLICA_REQ_NONE; - c->associated_rdb_client_id = 0; - c->rdb_client_disconnect_time = 0; c->reply = listCreate(); c->deferred_reply_errors = NULL; c->reply_bytes = 0; c->obuf_soft_limit_reached_time = 0; listSetFreeMethod(c->reply, freeClientReplyValue); listSetDupMethod(c->reply, dupClientReplyValue); - initClientBlockingState(c); + c->repl_data = NULL; + c->bstate = NULL; + c->pubsub_data = NULL; + c->module_data = NULL; + c->mstate = NULL; c->woff = 0; - c->watched_keys = listCreate(); - c->pubsub_channels = dictCreate(&objectKeyPointerValueDictType); - c->pubsub_patterns = dictCreate(&objectKeyPointerValueDictType); - c->pubsubshard_channels = dictCreate(&objectKeyPointerValueDictType); c->peerid = NULL; c->sockname = NULL; c->client_list_node = NULL; c->io_read_state = CLIENT_IDLE; c->io_write_state = CLIENT_IDLE; c->nwritten = 0; - c->client_tracking_redirection = 0; - c->client_tracking_prefixes = NULL; c->last_memory_usage = 0; c->last_memory_type = CLIENT_TYPE_NORMAL; - c->module_blocked_client = NULL; - c->module_auth_ctx = NULL; - c->auth_callback = NULL; - c->auth_callback_privdata = NULL; - c->auth_module = NULL; listInitNode(&c->clients_pending_write_node, c); listInitNode(&c->pending_read_list_node, c); c->mem_usage_bucket = NULL; c->mem_usage_bucket_node = NULL; if (conn) linkClient(c); - initClientMultiState(c); c->net_input_bytes = 0; c->net_input_bytes_curr_cmd = 0; c->net_output_bytes = 0; @@ -266,7 +240,9 @@ void putClientInPendingWriteQueue(client *c) { * if not already done and, for replicas, if the replica can actually receive * writes at this stage. */ if (!c->flag.pending_write && - (c->repl_state == REPL_STATE_NONE || (isReplicaReadyForReplData(c) && !c->repl_start_cmd_stream_on_ack))) { + (!c->repl_data || + c->repl_data->repl_state == REPL_STATE_NONE || + (isReplicaReadyForReplData(c) && !c->repl_data->repl_start_cmd_stream_on_ack))) { /* Here instead of installing the write handler, we just flag the * client and put it into a list of clients that have something * to write to the socket. 
This way before re-entering the event @@ -1340,10 +1316,10 @@ void deferredAfterErrorReply(client *c, list *errors) { void copyReplicaOutputBuffer(client *dst, client *src) { serverAssert(src->bufpos == 0 && listLength(src->reply) == 0); - if (src->ref_repl_buf_node == NULL) return; - dst->ref_repl_buf_node = src->ref_repl_buf_node; - dst->ref_block_pos = src->ref_block_pos; - ((replBufBlock *)listNodeValue(dst->ref_repl_buf_node))->refcount++; + if (src->repl_data->ref_repl_buf_node == NULL) return; + dst->repl_data->ref_repl_buf_node = src->repl_data->ref_repl_buf_node; + dst->repl_data->ref_block_pos = src->repl_data->ref_block_pos; + ((replBufBlock *)listNodeValue(dst->repl_data->ref_repl_buf_node))->refcount++; } /* Return true if the specified client has pending reply buffers to write to @@ -1353,13 +1329,13 @@ int clientHasPendingReplies(client *c) { /* Replicas use global shared replication buffer instead of * private output buffer. */ serverAssert(c->bufpos == 0 && listLength(c->reply) == 0); - if (c->ref_repl_buf_node == NULL) return 0; + if (c->repl_data->ref_repl_buf_node == NULL) return 0; /* If the last replication buffer block content is totally sent, * we have nothing to send. */ listNode *ln = listLast(server.repl_buffer_blocks); replBufBlock *tail = listNodeValue(ln); - if (ln == c->ref_repl_buf_node && c->ref_block_pos == tail->used) return 0; + if (ln == c->repl_data->ref_repl_buf_node && c->repl_data->ref_block_pos == tail->used) return 0; return 1; } else { @@ -1526,23 +1502,6 @@ void disconnectReplicas(void) { } } -/* Check if there is any other replica waiting dumping RDB finished expect me. - * This function is useful to judge current dumping RDB can be used for full - * synchronization or not. */ -int anyOtherReplicaWaitRdb(client *except_me) { - listIter li; - listNode *ln; - - listRewind(server.replicas, &li); - while ((ln = listNext(&li))) { - client *replica = ln->value; - if (replica != except_me && replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_END) { - return 1; - } - } - return 0; -} - /* Remove the specified client from global lists where the client could * be referenced, not including the Pub/Sub channels. * This is used by freeClient() and replicationCachePrimary(). */ @@ -1567,7 +1526,7 @@ void unlinkClient(client *c) { /* Check if this is a replica waiting for diskless replication (rdb pipe), * in which case it needs to be cleaned from that list */ - if (c->flag.replica && c->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && server.rdb_pipe_conns) { + if (c->repl_data && c->flag.replica && c->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && server.rdb_pipe_conns) { int i; int still_alive = 0; for (i = 0; i < server.rdb_pipe_numconns; i++) { @@ -1653,11 +1612,7 @@ void clearClientConnectionState(client *c) { clientSetDefaultAuth(c); moduleNotifyUserChanged(c); discardTransaction(c); - - pubsubUnsubscribeAllChannels(c, 0); - pubsubUnsubscribeShardAllChannels(c, 0); - pubsubUnsubscribeAllPatterns(c, 0); - unmarkClientAsPubSub(c); + freeClientPubSubData(c); if (c->name) { decrRefCount(c->name); @@ -1696,9 +1651,7 @@ void freeClient(client *c) { /* Notify module system that this client auth status changed. */ moduleNotifyUserChanged(c); - - /* Free the RedisModuleBlockedClient held onto for reprocessing if not already freed. */ - zfree(c->module_blocked_client); + freeClientModuleData(c); /* If this client was scheduled for async freeing we need to remove it * from the queue. 
Note that we need to do this here, because later @@ -1745,31 +1698,16 @@ void freeClient(client *c) { /* If there is any in-flight command, we don't record their duration. */ c->duration = 0; if (c->flag.blocked) unblockClient(c, 1); - dictRelease(c->bstate.keys); - - /* UNWATCH all the keys */ - unwatchAllKeys(c); - listRelease(c->watched_keys); - c->watched_keys = NULL; - - /* Unsubscribe from all the pubsub channels */ - pubsubUnsubscribeAllChannels(c, 0); - pubsubUnsubscribeShardAllChannels(c, 0); - pubsubUnsubscribeAllPatterns(c, 0); - unmarkClientAsPubSub(c); - dictRelease(c->pubsub_channels); - c->pubsub_channels = NULL; - dictRelease(c->pubsub_patterns); - c->pubsub_patterns = NULL; - dictRelease(c->pubsubshard_channels); - c->pubsubshard_channels = NULL; + + freeClientBlockingState(c); + freeClientPubSubData(c); /* Free data structures. */ listRelease(c->reply); c->reply = NULL; zfree_with_size(c->buf, c->buf_usable_size); c->buf = NULL; - freeReplicaReferencedReplBuffer(c); + freeClientArgv(c); freeClientOriginalArgv(c); if (c->deferred_reply_errors) listRelease(c->deferred_reply_errors); @@ -1787,45 +1725,7 @@ void freeClient(client *c) { * places where active clients may be referenced. */ unlinkClient(c); - /* Primary/replica cleanup Case 1: - * we lost the connection with a replica. */ - if (c->flag.replica) { - /* If there is no any other replica waiting dumping RDB finished, the - * current child process need not continue to dump RDB, then we kill it. - * So child process won't use more memory, and we also can fork a new - * child process asap to dump rdb for next full synchronization or bgsave. - * But we also need to check if users enable 'save' RDB, if enable, we - * should not remove directly since that means RDB is important for users - * to keep data safe and we may delay configured 'save' for full sync. */ - if (server.saveparamslen == 0 && c->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && - server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK && - anyOtherReplicaWaitRdb(c) == 0) { - serverLog(LL_NOTICE, "Background saving, persistence disabled, last replica dropped, killing fork child."); - killRDBChild(); - } - if (c->repl_state == REPLICA_STATE_SEND_BULK) { - if (c->repldbfd != -1) close(c->repldbfd); - if (c->replpreamble) sdsfree(c->replpreamble); - } - list *l = (c->flag.monitor) ? server.monitors : server.replicas; - ln = listSearchKey(l, c); - serverAssert(ln != NULL); - listDelNode(l, ln); - /* We need to remember the time when we started to have zero - * attached replicas, as after some time we'll free the replication - * backlog. */ - if (getClientType(c) == CLIENT_TYPE_REPLICA && listLength(server.replicas) == 0) - server.repl_no_replicas_since = server.unixtime; - refreshGoodReplicasCount(); - /* Fire the replica change modules event. */ - if (c->repl_state == REPLICA_STATE_ONLINE) - moduleFireServerEvent(VALKEYMODULE_EVENT_REPLICA_CHANGE, VALKEYMODULE_SUBEVENT_REPLICA_CHANGE_OFFLINE, - NULL); - } - - /* Primary/replica cleanup Case 2: - * we lost the connection with the primary. 
*/ - if (c->flag.primary) replicationHandlePrimaryDisconnection(); + freeClientReplicationData(c); /* Remove client from memory usage buckets */ if (c->mem_usage_bucket) { @@ -1841,7 +1741,6 @@ void freeClient(client *c) { freeClientMultiState(c); sdsfree(c->peerid); sdsfree(c->sockname); - sdsfree(c->replica_addr); zfree(c); } @@ -1932,10 +1831,10 @@ void beforeNextClient(client *c) { * In these scenarios, qb_pos points to the part of the current command * or the beginning of next command, and the current command is not applied yet, * so the repl_applied is not equal to qb_pos. */ - if (c->repl_applied) { - sdsrange(c->querybuf, c->repl_applied, -1); - c->qb_pos -= c->repl_applied; - c->repl_applied = 0; + if (c->repl_data->repl_applied) { + sdsrange(c->querybuf, c->repl_data->repl_applied, -1); + c->qb_pos -= c->repl_data->repl_applied; + c->repl_data->repl_applied = 0; } } else { trimClientQueryBuffer(c); @@ -1974,18 +1873,18 @@ int freeClientsInAsyncFreeQueue(void) { * The primary gives a grace period before freeing this client because * it serves as a reference to the first required replication data block for * this replica */ - if (!c->rdb_client_disconnect_time) { + if (!c->repl_data->rdb_client_disconnect_time) { if (c->conn) connSetReadHandler(c->conn, NULL); - c->rdb_client_disconnect_time = server.unixtime; + c->repl_data->rdb_client_disconnect_time = server.unixtime; dualChannelServerLog(LL_VERBOSE, "Postpone RDB client id=%llu (%s) free for %d seconds", (unsigned long long)c->id, replicationGetReplicaName(c), server.wait_before_rdb_client_free); } - if (server.unixtime - c->rdb_client_disconnect_time <= server.wait_before_rdb_client_free) continue; + if (server.unixtime - c->repl_data->rdb_client_disconnect_time <= server.wait_before_rdb_client_free) continue; dualChannelServerLog( LL_NOTICE, "Replica main channel failed to establish PSYNC within the grace period (%ld seconds). " "Freeing RDB client %llu.", - (long int)(server.unixtime - c->rdb_client_disconnect_time), (unsigned long long)c->id); + (long int)(server.unixtime - c->repl_data->rdb_client_disconnect_time), (unsigned long long)c->id); c->flag.protected_rdb_channel = 0; } @@ -2015,27 +1914,27 @@ void writeToReplica(client *c) { int nwritten = 0; serverAssert(c->bufpos == 0 && listLength(c->reply) == 0); while (clientHasPendingReplies(c)) { - replBufBlock *o = listNodeValue(c->ref_repl_buf_node); - serverAssert(o->used >= c->ref_block_pos); + replBufBlock *o = listNodeValue(c->repl_data->ref_repl_buf_node); + serverAssert(o->used >= c->repl_data->ref_block_pos); /* Send current block if it is not fully sent. */ - if (o->used > c->ref_block_pos) { - nwritten = connWrite(c->conn, o->buf + c->ref_block_pos, o->used - c->ref_block_pos); + if (o->used > c->repl_data->ref_block_pos) { + nwritten = connWrite(c->conn, o->buf + c->repl_data->ref_block_pos, o->used - c->repl_data->ref_block_pos); if (nwritten <= 0) { c->write_flags |= WRITE_FLAGS_WRITE_ERROR; return; } c->nwritten += nwritten; - c->ref_block_pos += nwritten; + c->repl_data->ref_block_pos += nwritten; } /* If we fully sent the object on head, go to the next one. 
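 * (Invariant kept by the code below, as a sketch: each replica pins exactly
 * one block, the one it still has to send, via that block's refcount:
 *
 *   o->refcount--;                      // release the drained block
 *   next_block->refcount++;             // pin the new head
 *   c->repl_data->ref_block_pos = 0;    // restart within it
 *
 * which lets incrementalTrimReplicationBacklog() free fully drained blocks.)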
*/ - listNode *next = listNextNode(c->ref_repl_buf_node); - if (next && c->ref_block_pos == o->used) { + listNode *next = listNextNode(c->repl_data->ref_repl_buf_node); + if (next && c->repl_data->ref_block_pos == o->used) { o->refcount--; ((replBufBlock *)(listNodeValue(next)))->refcount++; - c->ref_repl_buf_node = next; - c->ref_block_pos = 0; + c->repl_data->ref_repl_buf_node = next; + c->repl_data->ref_block_pos = 0; incrementalTrimReplicationBacklog(REPL_BACKLOG_TRIM_BLOCKS_PER_CALL); } } @@ -2338,7 +2237,7 @@ int handleReadResult(client *c) { c->last_interaction = server.unixtime; c->net_input_bytes += c->nread; if (c->flag.primary) { - c->read_reploff += c->nread; + c->repl_data->read_reploff += c->nread; server.stat_net_repl_input_bytes += c->nread; } else { server.stat_net_input_bytes += c->nread; @@ -2409,7 +2308,7 @@ parseResult handleParseResults(client *c) { } if (c->read_flags & READ_FLAGS_INLINE_ZERO_QUERY_LEN && getClientType(c) == CLIENT_TYPE_REPLICA) { - c->repl_ack_time = server.unixtime; + c->repl_data->repl_ack_time = server.unixtime; } if (c->read_flags & READ_FLAGS_INLINE_ZERO_QUERY_LEN) { @@ -2993,10 +2892,12 @@ void commandProcessed(client *c) { clusterSlotStatsAddNetworkBytesInForUserClient(c); resetClient(c); - long long prev_offset = c->reploff; + if (!c->repl_data) return; + + long long prev_offset = c->repl_data->reploff; if (c->flag.primary && !c->flag.multi) { /* Update the applied replication offset of our primary. */ - c->reploff = c->read_reploff - sdslen(c->querybuf) + c->qb_pos; + c->repl_data->reploff = c->repl_data->read_reploff - sdslen(c->querybuf) + c->qb_pos; } /* If the client is a primary we need to compute the difference @@ -3006,10 +2907,10 @@ void commandProcessed(client *c) { * part of the replication stream, will be propagated to the * sub-replicas and to the replication backlog. */ if (c->flag.primary) { - long long applied = c->reploff - prev_offset; + long long applied = c->repl_data->reploff - prev_offset; if (applied) { - replicationFeedStreamFromPrimaryStream(c->querybuf + c->repl_applied, applied); - c->repl_applied += applied; + replicationFeedStreamFromPrimaryStream(c->querybuf + c->repl_data->repl_applied, applied); + c->repl_data->repl_applied += applied; } } } @@ -3241,7 +3142,7 @@ void readToQueryBuf(client *c) { * so they are also considered a part of the query buffer in a broader sense. * * For unauthenticated clients, the query buffer cannot exceed 1MB at most. */ - size_t qb_memory = sdslen(c->querybuf) + c->mstate.argv_len_sums; + size_t qb_memory = sdslen(c->querybuf) + (c->mstate ? 
c->mstate->argv_len_sums : 0); if (qb_memory > server.client_max_querybuf_len || (qb_memory > 1024 * 1024 && (c->read_flags & READ_FLAGS_AUTH_REQUIRED))) { c->read_flags |= READ_FLAGS_QB_LIMIT_REACHED; @@ -3369,9 +3270,9 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { size_t obufmem, total_mem = getClientMemoryUsage(client, &obufmem); size_t used_blocks_of_repl_buf = 0; - if (client->ref_repl_buf_node) { + if (client->repl_data && client->repl_data->ref_repl_buf_node) { replBufBlock *last = listNodeValue(listLast(server.repl_buffer_blocks)); - replBufBlock *cur = listNodeValue(client->ref_repl_buf_node); + replBufBlock *cur = listNodeValue(client->repl_data->ref_repl_buf_node); used_blocks_of_repl_buf = last->id - cur->id + 1; } sds ret = sdscatfmt( @@ -3386,15 +3287,15 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { " idle=%I", (long long)(server.unixtime - client->last_interaction), " flags=%s", flags, " db=%i", client->db->id, - " sub=%i", (int)dictSize(client->pubsub_channels), - " psub=%i", (int)dictSize(client->pubsub_patterns), - " ssub=%i", (int)dictSize(client->pubsubshard_channels), - " multi=%i", (client->flag.multi) ? client->mstate.count : -1, - " watch=%i", (int)listLength(client->watched_keys), + " sub=%i", client->pubsub_data ? (int)dictSize(client->pubsub_data->pubsub_channels) : 0, + " psub=%i", client->pubsub_data ? (int)dictSize(client->pubsub_data->pubsub_patterns) : 0, + " ssub=%i", client->pubsub_data ? (int)dictSize(client->pubsub_data->pubsubshard_channels) : 0, + " multi=%i", client->mstate ? client->mstate->count : -1, + " watch=%i", client->mstate ? (int)listLength(&client->mstate->watched_keys) : 0, " qbuf=%U", client->querybuf ? (unsigned long long)sdslen(client->querybuf) : 0, " qbuf-free=%U", client->querybuf ? (unsigned long long)sdsavail(client->querybuf) : 0, " argv-mem=%U", (unsigned long long)client->argv_len_sum, - " multi-mem=%U", (unsigned long long)client->mstate.argv_len_sums, + " multi-mem=%U", client->mstate ? (unsigned long long)client->mstate->argv_len_sums : 0, " rbs=%U", (unsigned long long)client->buf_usable_size, " rbp=%U", (unsigned long long)client->buf_peak, " obl=%U", (unsigned long long)client->bufpos, @@ -3404,7 +3305,7 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { " events=%s", events, " cmd=%s", client->lastcmd ? client->lastcmd->fullname : "NULL", " user=%s", hide_user_data ? "*redacted*" : (client->user ? client->user->name : "(superuser)"), - " redir=%I", (client->flag.tracking) ? (long long)client->client_tracking_redirection : -1, + " redir=%I", (client->flag.tracking) ? (long long)client->pubsub_data->client_tracking_redirection : -1, " resp=%i", client->resp, " lib-name=%s", client->lib_name ? (char *)client->lib_name->ptr : "", " lib-ver=%s", client->lib_ver ? (char *)client->lib_ver->ptr : "", @@ -3892,6 +3793,7 @@ void clientCommand(client *c) { struct ClientFlags options = {0}; robj **prefix = NULL; size_t numprefix = 0; + initClientPubSubData(c); /* Parse the options. 
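 * For example, a hypothetical invocation exercising these options:
 *
 *   CLIENT TRACKING on REDIRECT 42 PREFIX user: PREFIX session: BCAST
 *
 * is what ends up populating client_tracking_redirection and
 * client_tracking_prefixes inside the lazily allocated pubsub_data.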
*/ for (int j = 3; j < c->argc; j++) { @@ -4031,7 +3933,7 @@ void clientCommand(client *c) { } else if (!strcasecmp(c->argv[1]->ptr, "getredir") && c->argc == 2) { /* CLIENT GETREDIR */ if (c->flag.tracking) { - addReplyLongLong(c, c->client_tracking_redirection); + addReplyLongLong(c, c->pubsub_data->client_tracking_redirection); } else { addReplyLongLong(c, -1); } @@ -4077,17 +3979,17 @@ void clientCommand(client *c) { /* Redirect */ addReplyBulkCString(c, "redirect"); if (c->flag.tracking) { - addReplyLongLong(c, c->client_tracking_redirection); + addReplyLongLong(c, c->pubsub_data->client_tracking_redirection); } else { addReplyLongLong(c, -1); } /* Prefixes */ addReplyBulkCString(c, "prefixes"); - if (c->client_tracking_prefixes) { - addReplyArrayLen(c, raxSize(c->client_tracking_prefixes)); + if (c->pubsub_data->client_tracking_prefixes) { + addReplyArrayLen(c, raxSize(c->pubsub_data->client_tracking_prefixes)); raxIterator ri; - raxStart(&ri, c->client_tracking_prefixes); + raxStart(&ri, c->pubsub_data->client_tracking_prefixes); raxSeek(&ri, "^", NULL, 0); while (raxNext(&ri)) { addReplyBulkCBuffer(c, ri.key, ri.key_len); @@ -4410,9 +4312,9 @@ size_t getClientOutputBufferMemoryUsage(client *c) { size_t repl_buf_size = 0; size_t repl_node_num = 0; size_t repl_node_size = sizeof(listNode) + sizeof(replBufBlock); - if (c->ref_repl_buf_node) { + if (c->repl_data->ref_repl_buf_node) { replBufBlock *last = listNodeValue(listLast(server.repl_buffer_blocks)); - replBufBlock *cur = listNodeValue(c->ref_repl_buf_node); + replBufBlock *cur = listNodeValue(c->repl_data->ref_repl_buf_node); repl_buf_size = last->repl_offset + last->size - cur->repl_offset; repl_node_num = last->id - cur->id + 1; } @@ -4445,8 +4347,8 @@ size_t getClientMemoryUsage(client *c, size_t *output_buffer_mem_usage) { /* Add memory overhead of the tracking prefixes, this is an underestimation so we don't need to traverse the entire * rax */ - if (c->client_tracking_prefixes) - mem += c->client_tracking_prefixes->numnodes * (sizeof(raxNode) * sizeof(raxNode *)); + if (c->pubsub_data && c->pubsub_data->client_tracking_prefixes) + mem += c->pubsub_data->client_tracking_prefixes->numnodes * (sizeof(raxNode) * sizeof(raxNode *)); return mem; } @@ -4612,7 +4514,7 @@ void flushReplicasOutputBuffers(void) { * 3. Obviously if the replica is not ONLINE. */ if (isReplicaReadyForReplData(replica) && !(replica->flag.close_asap) && can_receive_writes && - !replica->repl_start_cmd_stream_on_ack && clientHasPendingReplies(replica)) { + !replica->repl_data->repl_start_cmd_stream_on_ack && clientHasPendingReplies(replica)) { writeToClient(replica); } } diff --git a/src/pubsub.c b/src/pubsub.c index 3781fa39aa..27b5611788 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -219,20 +219,20 @@ int serverPubsubShardSubscriptionCount(void) { /* Return the number of channels + patterns a client is subscribed to. */ int clientSubscriptionsCount(client *c) { - return dictSize(c->pubsub_channels) + dictSize(c->pubsub_patterns); + return dictSize(c->pubsub_data->pubsub_channels) + dictSize(c->pubsub_data->pubsub_patterns); } /* Return the number of shard level channels a client is subscribed to. 
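 * (The helpers below assume pubsub_data was initialized by an earlier
 * subscribe; on paths where the client may never have subscribed, callers
 * are expected to guard, e.g. as a sketch:
 *
 *   int subs = c->pubsub_data ? clientShardSubscriptionsCount(c) : 0;
 * )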
*/ int clientShardSubscriptionsCount(client *c) { - return dictSize(c->pubsubshard_channels); + return dictSize(c->pubsub_data->pubsubshard_channels); } dict *getClientPubSubChannels(client *c) { - return c->pubsub_channels; + return c->pubsub_data->pubsub_channels; } dict *getClientPubSubShardChannels(client *c) { - return c->pubsubshard_channels; + return c->pubsub_data->pubsubshard_channels; } /* Return the number of pubsub + pubsub shard level channels @@ -255,6 +255,36 @@ void unmarkClientAsPubSub(client *c) { } } +void initClientPubSubData(client *c) { + if (c->pubsub_data) return; + c->pubsub_data = zmalloc(sizeof(ClientPubSubData)); + c->pubsub_data->pubsub_channels = dictCreate(&objectKeyPointerValueDictType); + c->pubsub_data->pubsub_patterns = dictCreate(&objectKeyPointerValueDictType); + c->pubsub_data->pubsubshard_channels = dictCreate(&objectKeyPointerValueDictType); + c->pubsub_data->client_tracking_redirection = 0; + c->pubsub_data->client_tracking_prefixes = NULL; +} + +void freeClientPubSubData(client *c) { + if (!c->pubsub_data) return; + /* Unsubscribe from all the pubsub channels */ + pubsubUnsubscribeAllChannels(c, 0); + pubsubUnsubscribeShardAllChannels(c, 0); + pubsubUnsubscribeAllPatterns(c, 0); + unmarkClientAsPubSub(c); + dictRelease(c->pubsub_data->pubsub_channels); + c->pubsub_data->pubsub_channels = NULL; + dictRelease(c->pubsub_data->pubsub_patterns); + c->pubsub_data->pubsub_patterns = NULL; + dictRelease(c->pubsub_data->pubsubshard_channels); + c->pubsub_data->pubsubshard_channels = NULL; + if (c->pubsub_data->client_tracking_prefixes) { + disableTracking(c); + } + zfree(c->pubsub_data); + c->pubsub_data = NULL; +} + /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or * 0 if the client was already subscribed to that channel. */ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { @@ -262,6 +292,8 @@ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { int retval = 0; unsigned int slot = 0; + if (!c->pubsub_data) initClientPubSubData(c); + /* Add the channel to the client -> channels hash table */ void *position = dictFindPositionForInsert(type.clientPubSubChannels(c), channel, NULL); if (position) { /* Not yet subscribed to this channel */ @@ -344,7 +376,7 @@ void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot) { dictEntry *entry; while ((entry = dictNext(iter)) != NULL) { client *c = dictGetKey(entry); - int retval = dictDelete(c->pubsubshard_channels, channel); + int retval = dictDelete(c->pubsub_data->pubsubshard_channels, channel); serverAssertWithInfo(c, channel, retval == DICT_OK); addReplyPubsubUnsubscribed(c, channel, pubSubShardType); /* If the client has no other pubsub subscription, @@ -366,7 +398,9 @@ int pubsubSubscribePattern(client *c, robj *pattern) { dict *clients; int retval = 0; - if (dictAdd(c->pubsub_patterns, pattern, NULL) == DICT_OK) { + if (!c->pubsub_data) initClientPubSubData(c); + + if (dictAdd(c->pubsub_data->pubsub_patterns, pattern, NULL) == DICT_OK) { retval = 1; incrRefCount(pattern); /* Add the client to the pattern -> list of clients hash table */ @@ -392,8 +426,10 @@ int pubsubUnsubscribePattern(client *c, robj *pattern, int notify) { dict *clients; int retval = 0; + if (!c->pubsub_data) initClientPubSubData(c); + incrRefCount(pattern); /* Protect the object. 
May be the same we remove */ - if (dictDelete(c->pubsub_patterns, pattern) == DICT_OK) { + if (dictDelete(c->pubsub_data->pubsub_patterns, pattern) == DICT_OK) { retval = 1; /* Remove the client from the pattern -> clients list hash table */ de = dictFind(server.pubsub_patterns, pattern); @@ -454,9 +490,10 @@ int pubsubUnsubscribeShardAllChannels(client *c, int notify) { * client was subscribed from. */ int pubsubUnsubscribeAllPatterns(client *c, int notify) { int count = 0; + if (!c->pubsub_data) initClientPubSubData(c); - if (dictSize(c->pubsub_patterns) > 0) { - dictIterator *di = dictGetSafeIterator(c->pubsub_patterns); + if (dictSize(c->pubsub_data->pubsub_patterns) > 0) { + dictIterator *di = dictGetSafeIterator(c->pubsub_data->pubsub_patterns); dictEntry *de; while ((de = dictNext(di)) != NULL) { @@ -560,6 +597,8 @@ void subscribeCommand(client *c) { /* UNSUBSCRIBE [channel ...] */ void unsubscribeCommand(client *c) { + if (!c->pubsub_data) initClientPubSubData(c); + if (c->argc == 1) { pubsubUnsubscribeAllChannels(c, 1); } else { @@ -732,6 +771,8 @@ void ssubscribeCommand(client *c) { /* SUNSUBSCRIBE [shardchannel [shardchannel ...]] */ void sunsubscribeCommand(client *c) { + if (!c->pubsub_data) initClientPubSubData(c); + if (c->argc == 1) { pubsubUnsubscribeShardAllChannels(c, 1); } else { @@ -745,12 +786,13 @@ void sunsubscribeCommand(client *c) { } size_t pubsubMemOverhead(client *c) { + if (!c->pubsub_data) return 0; /* PubSub patterns */ - size_t mem = dictMemUsage(c->pubsub_patterns); + size_t mem = dictMemUsage(c->pubsub_data->pubsub_patterns); /* Global PubSub channels */ - mem += dictMemUsage(c->pubsub_channels); + mem += dictMemUsage(c->pubsub_data->pubsub_channels); /* Sharded PubSub channels */ - mem += dictMemUsage(c->pubsubshard_channels); + mem += dictMemUsage(c->pubsub_data->pubsubshard_channels); return mem; } diff --git a/src/rdb.c b/src/rdb.c index 958eac5d4f..32c9021669 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3573,9 +3573,9 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi) { listRewind(server.replicas, &li); while ((ln = listNext(&li))) { client *replica = ln->value; - if (replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { /* Check replica has the exact requirements */ - if (replica->replica_req != req) continue; + if (replica->repl_data->replica_req != req) continue; conns[connsnum++] = replica->conn; if (dual_channel) { @@ -3646,8 +3646,8 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi) { listRewind(server.replicas, &li); while ((ln = listNext(&li))) { client *replica = ln->value; - if (replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_END) { - replica->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END) { + replica->repl_data->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; } } if (!dual_channel) { diff --git a/src/replication.c b/src/replication.c index c5611d5a5a..9913d64d65 100644 --- a/src/replication.c +++ b/src/replication.c @@ -82,10 +82,10 @@ char *replicationGetReplicaName(client *c) { ip[0] = '\0'; buf[0] = '\0'; - if (c->replica_addr || connAddrPeerName(c->conn, ip, sizeof(ip), NULL) != -1) { - char *addr = c->replica_addr ? c->replica_addr : ip; - if (c->replica_listening_port) - formatAddr(buf, sizeof(buf), addr, c->replica_listening_port); + if (c->repl_data->replica_addr || connAddrPeerName(c->conn, ip, sizeof(ip), NULL) != -1) { + char *addr = c->repl_data->replica_addr ? 
c->repl_data->replica_addr : ip; + if (c->repl_data->replica_listening_port) + formatAddr(buf, sizeof(buf), addr, c->repl_data->replica_listening_port); else snprintf(buf, sizeof(buf), "%s:", addr); } else { @@ -231,7 +231,7 @@ void addRdbReplicaToPsyncWait(client *replica_rdb_client) { dualChannelServerLog(LL_DEBUG, "Add rdb replica %s to waiting psync, with cid %llu, %s ", replicationGetReplicaName(replica_rdb_client), (unsigned long long)replica_rdb_client->id, tail ? "tracking repl-backlog tail" : "no repl-backlog to track"); - replica_rdb_client->ref_repl_buf_node = tail ? ln : NULL; + replica_rdb_client->repl_data->ref_repl_buf_node = tail ? ln : NULL; /* Prevent rdb client from being freed before psync is established. */ replica_rdb_client->flag.protected_rdb_channel = 1; uint64_t id = htonu64(replica_rdb_client->id); @@ -250,8 +250,8 @@ void backfillRdbReplicasToPsyncWait(void) { raxSeek(&iter, "^", NULL, 0); while (raxNext(&iter)) { client *replica_rdb_client = iter.data; - if (replica_rdb_client->ref_repl_buf_node) continue; - replica_rdb_client->ref_repl_buf_node = ln; + if (replica_rdb_client->repl_data->ref_repl_buf_node) continue; + replica_rdb_client->repl_data->ref_repl_buf_node = ln; head->refcount++; dualChannelServerLog(LL_DEBUG, "Attach replica rdb client %llu to repl buf block", (long long unsigned int)replica_rdb_client->id); @@ -263,18 +263,18 @@ void removeReplicaFromPsyncWait(client *replica_main_client) { listNode *ln; replBufBlock *o; /* Get replBufBlock pointed by this replica */ - client *replica_rdb_client = lookupRdbClientByID(replica_main_client->associated_rdb_client_id); - ln = replica_rdb_client->ref_repl_buf_node; + client *replica_rdb_client = lookupRdbClientByID(replica_main_client->repl_data->associated_rdb_client_id); + ln = replica_rdb_client->repl_data->ref_repl_buf_node; o = ln ? listNodeValue(ln) : NULL; if (o != NULL) { serverAssert(o->refcount > 0); o->refcount--; } - replica_rdb_client->ref_repl_buf_node = NULL; + replica_rdb_client->repl_data->ref_repl_buf_node = NULL; replica_rdb_client->flag.protected_rdb_channel = 0; dualChannelServerLog(LL_DEBUG, "Remove psync waiting replica %s with cid %llu, repl buffer block %s", replicationGetReplicaName(replica_main_client), - (long long unsigned int)replica_main_client->associated_rdb_client_id, + (long long unsigned int)replica_main_client->repl_data->associated_rdb_client_id, o ? "ref count decreased" : "doesn't exist"); uint64_t id = htonu64(replica_rdb_client->id); raxRemove(server.replicas_waiting_psync, (unsigned char *)&id, sizeof(id), NULL); @@ -291,7 +291,7 @@ int canFeedReplicaReplBuffer(client *replica) { if (replica->flag.repl_rdbonly) return 0; /* Don't feed replicas that are still waiting for BGSAVE to start. */ - if (replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) return 0; + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) return 0; return 1; } @@ -396,15 +396,15 @@ void freeReplicaReferencedReplBuffer(client *replica) { replicationGetReplicaName(replica), (long long unsigned int)replica->id); } } - if (replica->ref_repl_buf_node != NULL) { + if (replica->repl_data->ref_repl_buf_node != NULL) { /* Decrease the start buffer node reference count. 
*/ - replBufBlock *o = listNodeValue(replica->ref_repl_buf_node); + replBufBlock *o = listNodeValue(replica->repl_data->ref_repl_buf_node); serverAssert(o->refcount > 0); o->refcount--; incrementalTrimReplicationBacklog(REPL_BACKLOG_TRIM_BLOCKS_PER_CALL); } - replica->ref_repl_buf_node = NULL; - replica->ref_block_pos = 0; + replica->repl_data->ref_repl_buf_node = NULL; + replica->repl_data->ref_block_pos = 0; } /* Replication: Primary side. @@ -486,9 +486,9 @@ void feedReplicationBuffer(char *s, size_t len) { client *replica = ln->value; if (!canFeedReplicaReplBuffer(replica) && !(replica->flag.protected_rdb_channel)) continue; /* Update shared replication buffer start position. */ - if (replica->ref_repl_buf_node == NULL) { - replica->ref_repl_buf_node = start_node; - replica->ref_block_pos = start_pos; + if (replica->repl_data->ref_repl_buf_node == NULL) { + replica->repl_data->ref_repl_buf_node = start_node; + replica->repl_data->ref_block_pos = start_pos; /* Only increase the start block reference count. */ ((replBufBlock *)listNodeValue(start_node))->refcount++; } @@ -771,8 +771,8 @@ long long addReplyReplicationBacklog(client *c, long long offset) { /* Setting output buffer of the replica. */ replBufBlock *o = listNodeValue(node); o->refcount++; - c->ref_repl_buf_node = node; - c->ref_block_pos = offset - o->repl_offset; + c->repl_data->ref_repl_buf_node = node; + c->repl_data->ref_block_pos = offset - o->repl_offset; return server.repl_backlog->histlen - skip; } @@ -805,8 +805,8 @@ int replicationSetupReplicaForFullResync(client *replica, long long offset) { char buf[128]; int buflen; - replica->psync_initial_offset = offset; - replica->repl_state = REPLICA_STATE_WAIT_BGSAVE_END; + replica->repl_data->psync_initial_offset = offset; + replica->repl_data->repl_state = REPLICA_STATE_WAIT_BGSAVE_END; /* We are going to accumulate the incremental changes for this * replica as well. Set replicas_eldb to -1 in order to force to re-emit * a SELECT statement in the replication stream. */ @@ -889,19 +889,19 @@ int primaryTryPartialResynchronization(client *c, long long psync_offset) { * 4) Send the backlog data (from the offset to the end) to the replica. */ waitForClientIO(c); c->flag.replica = 1; - if (c->associated_rdb_client_id && lookupRdbClientByID(c->associated_rdb_client_id)) { - c->repl_state = REPLICA_STATE_BG_RDB_LOAD; + if (c->repl_data->associated_rdb_client_id && lookupRdbClientByID(c->repl_data->associated_rdb_client_id)) { + c->repl_data->repl_state = REPLICA_STATE_BG_RDB_LOAD; removeReplicaFromPsyncWait(c); } else { - c->repl_state = REPLICA_STATE_ONLINE; + c->repl_data->repl_state = REPLICA_STATE_ONLINE; } - c->repl_ack_time = server.unixtime; - c->repl_start_cmd_stream_on_ack = 0; + c->repl_data->repl_ack_time = server.unixtime; + c->repl_data->repl_start_cmd_stream_on_ack = 0; listAddNodeTail(server.replicas, c); /* We can't use the connection buffers since they are used to accumulate * new commands at this stage. But we are sure the socket send buffer is * empty so this write will never fail actually. 
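 * As a sketch, the reply written just below is one of:
 *
 *   +CONTINUE <replid>\r\n   (REPLICA_CAPA_PSYNC2: the replica can follow a
 *                             replication-ID switch)
 *   +CONTINUE\r\n            (legacy replicas)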
*/ - if (c->replica_capa & REPLICA_CAPA_PSYNC2) { + if (c->repl_data->replica_capa & REPLICA_CAPA_PSYNC2) { buflen = snprintf(buf, sizeof(buf), "+CONTINUE %s\r\n", server.replid); } else { buflen = snprintf(buf, sizeof(buf), "+CONTINUE\r\n"); @@ -1003,8 +1003,8 @@ int startBgsaveForReplication(int mincapa, int req) { while ((ln = listNext(&li))) { client *replica = ln->value; - if (replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { - replica->repl_state = REPL_STATE_NONE; + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { + replica->repl_data->repl_state = REPL_STATE_NONE; replica->flag.replica = 0; listDelNode(server.replicas, ln); addReplyError(replica, "BGSAVE failed, replication can't continue"); @@ -1021,9 +1021,9 @@ int startBgsaveForReplication(int mincapa, int req) { while ((ln = listNext(&li))) { client *replica = ln->value; - if (replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { /* Check replica has the exact requirements */ - if (replica->replica_req != req) continue; + if (replica->repl_data->replica_req != req) continue; replicationSetupReplicaForFullResync(replica, getPsyncInitialOffset()); } } @@ -1037,6 +1037,8 @@ void syncCommand(client *c) { /* ignore SYNC if already replica or in monitor mode */ if (c->flag.replica) return; + initClientReplicationData(c); + /* Wait for any IO pending operation to finish before changing the client state to replica */ waitForClientIO(c); @@ -1089,7 +1091,7 @@ void syncCommand(client *c) { /* Fail sync if replica doesn't support EOF capability but wants a filtered RDB. This is because we force filtered * RDB's to be generated over a socket and not through a file to avoid conflicts with the snapshot files. Forcing * use of a socket is handled, if needed, in `startBgsaveForReplication`. */ - if (c->replica_req & REPLICA_REQ_RDB_MASK && !(c->replica_capa & REPLICA_CAPA_EOF)) { + if (c->repl_data->replica_req & REPLICA_REQ_RDB_MASK && !(c->repl_data->replica_capa & REPLICA_CAPA_EOF)) { addReplyError(c, "Filtered replica requires EOF capability"); return; } @@ -1124,7 +1126,7 @@ void syncCommand(client *c) { * resync on purpose when they are not able to partially * resync. */ if (primary_replid[0] != '?') server.stat_sync_partial_err++; - if (c->replica_capa & REPLICA_CAPA_DUAL_CHANNEL) { + if (c->repl_data->replica_capa & REPLICA_CAPA_DUAL_CHANNEL) { dualChannelServerLog(LL_NOTICE, "Replica %s is capable of dual channel synchronization, and partial sync " "isn't possible. " @@ -1149,9 +1151,9 @@ void syncCommand(client *c) { /* Setup the replica as one waiting for BGSAVE to start. The following code * paths will change the state if we handle the replica differently. */ - c->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; + c->repl_data->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; if (server.repl_disable_tcp_nodelay) connDisableTcpNoDelay(c->conn); /* Non critical if it fails. */ - c->repldbfd = -1; + c->repl_data->repldbfd = -1; c->flag.replica = 1; listAddNodeTail(server.replicas, c); @@ -1183,20 +1185,20 @@ void syncCommand(client *c) { replica = ln->value; /* If the client needs a buffer of commands, we can't use * a replica without replication buffer. 
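 * (As a sketch, the attach test further below is a capability-superset
 * check plus an exact requirements match:
 *
 *   (c_capa & r_capa) == r_capa   // candidate speaks everything r needed
 *   c_req == r_req                // and wants the same RDB contents
 *
 * where c_capa/c_req belong to the new replica and r_capa/r_req to the one
 * that triggered the BGSAVE.)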
*/ - if (replica->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && (!(replica->flag.repl_rdbonly) || (c->flag.repl_rdbonly))) break; } /* To attach this replica, we check that it has at least all the * capabilities of the replica that triggered the current BGSAVE * and its exact requirements. */ - if (ln && ((c->replica_capa & replica->replica_capa) == replica->replica_capa) && - c->replica_req == replica->replica_req) { + if (ln && ((c->repl_data->replica_capa & replica->repl_data->replica_capa) == replica->repl_data->replica_capa) && + c->repl_data->replica_req == replica->repl_data->replica_req) { /* Perfect, the server is already registering differences for * another replica. Set the right state, and copy the buffer. * We don't copy buffer if clients don't want. */ if (!c->flag.repl_rdbonly) copyReplicaOutputBuffer(c, replica); - replicationSetupReplicaForFullResync(c, replica->psync_initial_offset); + replicationSetupReplicaForFullResync(c, replica->repl_data->psync_initial_offset); serverLog(LL_NOTICE, "Waiting for end of BGSAVE for SYNC"); } else { /* No way, we need to wait for the next BGSAVE in order to @@ -1213,7 +1215,7 @@ void syncCommand(client *c) { /* CASE 3: There is no BGSAVE is in progress. */ } else { - if (server.repl_diskless_sync && (c->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay) { + if (server.repl_diskless_sync && (c->repl_data->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay) { /* Diskless replication RDB child is created inside * replicationCron() since we want to delay its start a * few seconds to wait for more replicas to arrive. */ @@ -1222,7 +1224,7 @@ void syncCommand(client *c) { /* We don't have a BGSAVE in progress, let's start one. Diskless * or disk-based mode is determined by replica's capacity. */ if (!hasActiveChildProcess()) { - startBgsaveForReplication(c->replica_capa, c->replica_req); + startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req); } else { serverLog(LL_NOTICE, "No BGSAVE in progress, but another BG operation is active. " "BGSAVE for replication delayed"); @@ -1232,6 +1234,72 @@ void syncCommand(client *c) { return; } +/* Check if there is any other replica waiting dumping RDB finished expect me. + * This function is useful to judge current dumping RDB can be used for full + * synchronization or not. */ +int anyOtherReplicaWaitRdb(client *except_me) { + listIter li; + listNode *ln; + + listRewind(server.replicas, &li); + while ((ln = listNext(&li))) { + client *replica = ln->value; + if (replica != except_me && replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END) { + return 1; + } + } + return 0; +} + +void initClientReplicationData(client *c) { + if (c->repl_data) return; + c->repl_data = (ClientReplicationData *)zcalloc(sizeof(ClientReplicationData)); +} + +void freeClientReplicationData(client *c) { + if (!c->repl_data) return; + freeReplicaReferencedReplBuffer(c); + /* Primary/replica cleanup Case 1: + * we lost the connection with a replica. */ + if (c->flag.replica) { + /* If there is no any other replica waiting dumping RDB finished, the + * current child process need not continue to dump RDB, then we kill it. + * So child process won't use more memory, and we also can fork a new + * child process asap to dump rdb for next full synchronization or bgsave. 
+ * But we must also check whether the user has enabled RDB 'save' points; if so, we + * should not kill the child outright, since that RDB is important for keeping the + * user's data safe, and we may delay the configured 'save' for the full sync. */ + if (server.saveparamslen == 0 && c->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && + server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK && + anyOtherReplicaWaitRdb(c) == 0) { + serverLog(LL_NOTICE, "Background saving, persistence disabled, last replica dropped, killing fork child."); + killRDBChild(); + } + if (c->repl_data->repl_state == REPLICA_STATE_SEND_BULK) { + if (c->repl_data->repldbfd != -1) close(c->repl_data->repldbfd); + if (c->repl_data->replpreamble) sdsfree(c->repl_data->replpreamble); + } + list *l = (c->flag.monitor) ? server.monitors : server.replicas; + listNode *ln = listSearchKey(l, c); + serverAssert(ln != NULL); + listDelNode(l, ln); + /* We need to remember the time when we started to have zero + * attached replicas, as after some time we'll free the replication + * backlog. */ + if (getClientType(c) == CLIENT_TYPE_REPLICA && listLength(server.replicas) == 0) + server.repl_no_replicas_since = server.unixtime; + refreshGoodReplicasCount(); + /* Fire the replica change modules event. */ + if (c->repl_data->repl_state == REPLICA_STATE_ONLINE) + moduleFireServerEvent(VALKEYMODULE_EVENT_REPLICA_CHANGE, VALKEYMODULE_SUBEVENT_REPLICA_CHANGE_OFFLINE, + NULL); + } + if (c->flag.primary) replicationHandlePrimaryDisconnection(); + sdsfree(c->repl_data->replica_addr); + zfree(c->repl_data); + c->repl_data = NULL; +} + /* REPLCONF