daemon/defer: add hard-timeout for interrupting expensive computations

CZ-NIC · Jan 9, 2025 · 74b05f9 · 74b05f9
1 parent d5231f0
commit 74b05f9
Show file tree

Hide file tree

Showing 8 changed files with 83 additions and 10 deletions.
diff --git a/daemon/defer.c b/daemon/defer.c
@@ -4,6 +4,7 @@
 
 #include <math.h>
 #include <stdatomic.h>
+#include <unistd.h>
 #include "daemon/defer.h"
 #include "daemon/session2.h"
 #include "daemon/udp_queue.h"
@@ -62,6 +63,7 @@ struct defer {
 	size_t capacity;
 	kru_price_t max_decay;
 	uint32_t log_period;
+	uint32_t hard_timeout;
 	int cpus;
 	bool using_avx2;
 	_Atomic uint32_t log_time;
@@ -642,9 +644,36 @@ static void defer_queues_idle(uv_idle_t *handle)
 	VERBOSE_LOG("POLL\n");
 }
 
+/// SIGALRM handler enforcing defer->hard_timeout (in ms) on the currently measured operation.
+///
+/// It is also called directly with signum == 0 from defer_init() to arm the first alarm.
+/// If the ongoing measurement has already lasted hard_timeout or longer, the elapsed time
+/// is passed to defer_charge(), a critical message is logged and the process is aborted.
+/// Otherwise the alarm is re-armed for the remaining time, rounded up to whole seconds.
+///
+/// NOTE(review): the logging calls and kr_straddr() are not on the POSIX list of
+/// async-signal-safe functions, yet they run in signal context here -- confirm this is
+/// acceptable for a last-resort path.
+static void defer_alarm(int signum)
+{
+	if (!defer || (defer->hard_timeout == 0)) return;
+
+	uint64_t elapsed = 0; // ns; stays zero when no measurement is in progress
+	if (defer_sample_state.is_accounting) {
+		elapsed = defer_get_stamp() - defer_sample_state.stamp;
+		VERBOSE_LOG("SIGALRM %s, host %s used %.3f s of cpu time on ongoing operation\n",
+				signum ? "received" : "initialized",
+				kr_straddr(&defer_sample_state.addr.ip), elapsed / 1000000000.0); // XXX
+	} else {
+		VERBOSE_LOG("SIGALRM %s, no measuring in progress\n",
+				signum ? "received" : "initialized");
+	}
+	// Subtract in signed 64-bit arithmetic: both operands are unsigned, so the plain
+	// expression would wrap around and rely on an implementation-defined conversion.
+	int64_t rest_to_timeout_ms = (int64_t)defer->hard_timeout - (int64_t)(elapsed / 1000000); // ms - ns
+	if (rest_to_timeout_ms <= 0) {
+		uv_update_time(uv_default_loop()); // TODO more conceptual solution?
+		defer_charge(elapsed, &defer_sample_state.addr, defer_sample_state.stream);
+		kr_log_crit(DEFER, "Host %s used %0.3f s of cpu time continuously, interrupting kresd.\n",
+			kr_straddr(&defer_sample_state.addr.ip), elapsed / 1000000000.0);
+		classify(&defer_sample_state.addr, defer_sample_state.stream); // XXX
+		__sync_synchronize();
+		abort();
+	}
+	// alarm() takes whole seconds; round up so the signal never arrives before the timeout.
+	alarm((rest_to_timeout_ms + 999) / 1000);
+}
 
 /// Initialize shared memory, queues. To be called from Lua.
-int defer_init(const char *mmap_file, uint32_t log_period, int cpus)  // TODO possibly remove cpus; not needed
+int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, int cpus)
+	// TODO possibly remove cpus; not needed
 {
 	defer_initialized = true;
 	if (mmap_file == NULL) {
@@ -662,6 +691,7 @@ int defer_init(const char *mmap_file, uint32_t log_period, int cpus)  // TODO po
 		.capacity = KRU_CAPACITY,
 		.max_decay = MAX_DECAY,
 		.log_period = log_period,
+		.hard_timeout = hard_timeout,
 		.cpus = cpus,
 		.using_avx2 = using_avx2(),
 	};
@@ -676,6 +706,7 @@ int defer_init(const char *mmap_file, uint32_t log_period, int cpus)  // TODO po
 			sizeof(header.capacity) +
 			sizeof(header.max_decay) +
 			sizeof(header.log_period) +
+			sizeof(header.hard_timeout) +
 			sizeof(header.cpus),
 		"detected padding with undefined data inside mmapped header");
 
@@ -713,6 +744,9 @@ int defer_init(const char *mmap_file, uint32_t log_period, int cpus)  // TODO po
 	for (size_t i = 0; i < QUEUES_CNT; i++)
 		queue_init(queues[i]);
 
+	signal(SIGALRM, defer_alarm);
+	defer_alarm(0);
+
 	return 0;
 
 fail:

diff --git a/daemon/defer.h b/daemon/defer.h
@@ -9,9 +9,9 @@
 
 /// Initialize defer, incl. shared memory with KRU, excl. idle.
 KR_EXPORT
-int defer_init(const char *mmap_file, uint32_t log_period, int cpus);
+int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, int cpus);
 
-/// Initialize idle.
+/// Initialize idle (the SIGALRM handler is installed by defer_init).
 int defer_init_idle(uv_loop_t *loop);
 
 /// Deinitialize shared memory.
@@ -79,9 +79,10 @@ static inline void defer_sample_start_stamp(uint64_t stamp)
 {
 	if (!defer) return;
 	kr_assert(!defer_sample_state.is_accounting);
-	defer_sample_state.is_accounting = true;
 	defer_sample_state.stamp = stamp;
 	defer_sample_state.addr.ip.sa_family = AF_UNSPEC;
+	__sync_synchronize();
+	defer_sample_state.is_accounting = true;
 }
 
 /// Internal; stop accounting work at specified timestamp and charge the source if applicable.
@@ -90,6 +91,7 @@ static inline void defer_sample_stop_stamp(uint64_t stamp)
 	if (!defer) return;
 	kr_assert(defer_sample_state.is_accounting);
 	defer_sample_state.is_accounting = false;
+	__sync_synchronize();
 
 	if (defer_sample_state.addr.ip.sa_family == AF_UNSPEC) return;
 
@@ -146,7 +148,10 @@ static inline void defer_sample_stop(defer_sample_state_t *prev_state, bool reus
 
 	// resume
 	if (prev_state) {
-		defer_sample_state = *prev_state;
+		defer_sample_state.addr = prev_state->addr;
+		defer_sample_state.stream = prev_state->stream;
 		defer_sample_state.stamp = stamp;
+		__sync_synchronize();
+		defer_sample_state.is_accounting = prev_state->is_accounting;
 	}
 }
diff --git a/daemon/lua/kres-gen-33.lua b/daemon/lua/kres-gen-33.lua
@@ -617,7 +617,7 @@ struct qr_task *worker_resolve_start(knot_pkt_t *, struct kr_qflags);
 int zi_zone_import(const zi_config_t);
 _Bool ratelimiting_request_begin(struct kr_request *);
 int ratelimiting_init(const char *, size_t, uint32_t, uint32_t, uint16_t, uint32_t, _Bool);
-int defer_init(const char *, uint32_t, int);
+int defer_init(const char *, uint32_t, uint32_t, int);
 struct engine {
 	char _stub[];
 };

diff --git a/daemon/main.c b/daemon/main.c
@@ -553,7 +553,7 @@ int main(int argc, char **argv)
 
 	uv_loop_t *loop = uv_default_loop();
 	/* Catch some signals. */
-	uv_signal_t sigint, sigterm, sigchld;
+	uv_signal_t sigint, sigterm, sigchld; // +SIGALRM handled by defer
 	if (true) ret = uv_signal_init(loop, &sigint);
 	if (!ret) ret = uv_signal_init(loop, &sigterm);
 	if (!ret) ret = uv_signal_init(loop, &sigchld);
@@ -618,7 +618,7 @@ int main(int argc, char **argv)
 
 	if (!defer_initialized) {
 		kr_log_warning(SYSTEM, "Prioritization not initialized from Lua, using hardcoded default.\n");
-		ret = defer_init("defer", 1, 1);
+		ret = defer_init("defer", 1, 0, 1);
 		if (ret) {
 			ret = EXIT_FAILURE;
 			goto cleanup;

diff --git a/doc/_static/config.schema.json b/doc/_static/config.schema.json
@@ -1727,11 +1727,18 @@
                     "pattern": "^(\\d+)(us|ms|s|m|h|d)$",
                     "description": "Minimal time between two log messages, or '0s' to disable.",
                     "default": "0s"
+                },
+                "hard-timeout": {
+                    "type": "string",
+                    "pattern": "^(\\d+)(us|ms|s|m|h|d)$",
+                    "description": "If a measured operation lasts longer, kresd is interrupted; use '0s' to disable.",
+                    "default": "0s"
                 }
             },
             "default": {
                 "enabled": true,
-                "log_period": "0s"
+                "log_period": "0s",
+                "hard_timeout": "0s"
             }
         },
         "lua": {

diff --git a/doc/user/config-defer.rst b/doc/user/config-defer.rst
@@ -8,6 +8,7 @@ Request prioritization (defer)
 Defer tries to mitigate DoS attacks by measuring cpu time consumption of different hosts and networks
 and deferring future requests from the same origin.
 If there is not enough time to process all the requests, the lowest priority ones are dropped.
+It also allows setting a hard timeout on continuous computation for a single request.
 
 The time measurements are taken into account only for TCP,
 as the source address of plain UDP can be forged.
@@ -56,3 +57,26 @@ The detailed configuration is printed by ``defer`` group on ``info`` level on st
     and logging is disabled for the :option:`log-period <defer/log-period: <time ms|s|m|h|d>`.
     As long as dropping is needed, one source is logged each period
     and sources with more dropped queries have greater probability to be chosen.
+
+.. option:: defer/hard-timeout: <time ms|s|m|h|d>
+
+    :default: 0s
+
+    Limit on the CPU time consumed continuously by a single request, or ``0s`` to disable.
+    Exceeding it crashes kresd; use with care.
+
+    This is intended as a last-resort defence against as-yet-unknown bugs
+    that allow an attacker to initiate very expensive computations with a single request,
+    freezing the kresd process for several seconds or minutes.
+
+    It is based on scheduling a SIGALRM to be delivered after the timeout (or up to 1s later),
+    which then interrupts the computation.
+    After the interrupt, the priority of the request's origin is decreased according to the duration,
+    the kresd process is terminated (dropping all pending, but probably already timed-out, requests)
+    and started again by the manager.
+    To keep the data with measurements and priorities alive during restart,
+    it is crucial to use :ref:`multiple workers <config-multiple-workers>`
+    as those data are shared between them and disappear with the last one.
+
+    Continuous work on a single request usually takes under 1 ms. (TODO check)
+    Set the timeout to at least several seconds to avoid random crashes. (TODO or more?)
diff --git a/python/knot_resolver/datamodel/defer_schema.py b/python/knot_resolver/datamodel/defer_schema.py
@@ -9,7 +9,9 @@ class DeferSchema(ConfigSchema):
     ---
     enabled: Use request prioritization.
     log_period: Minimal time between two log messages, or '0s' to disable.
+    hard_timeout: If a measured operation lasts longer, kresd is interrupted; use '0s' to disable.
     """
 
     enabled: bool = True
     log_period: TimeUnit = TimeUnit("0s")
+    hard_timeout: TimeUnit = TimeUnit("0s")
diff --git a/python/knot_resolver/datamodel/templates/defer.lua.j2 b/python/knot_resolver/datamodel/templates/defer.lua.j2
@@ -4,7 +4,8 @@
 assert(C.defer_init(
 	'{{ cfg.rundir }}/defer',
 	{{ cfg.defer.log_period.millis() }},
+	{{ cfg.defer.hard_timeout.millis() }},
 	{{ cfg.workers }}) == 0)
 {% else %}
-assert(C.defer_init(nil, 0, 0) == 0)
+assert(C.defer_init(nil, 0, 0, 0) == 0)
 {%- endif %}