From 4b51350b4a005f84ef073210abf6a6d311f9eea7 Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Thu, 9 Jan 2025 19:32:00 +0100 Subject: [PATCH 1/8] netstacklat: Add netstacklat tool Add the tool netstacklat, which measures latency up to different parts in the Linux ingress network stack. Base the initial implementation on a bpftrace script from Jesper Dangaard Brouer, which requires the kernel to timestamp ingress packets (i.e. set the tstamp member of the skb). Hence, the latency recorded by the tool is the difference between the kernel timestamping point and various later points in the network stack. In this initial commit, include the eBPF programs for recording the network stack latency at the start of the kernel functions tcp_v4_do_rcv(), tcp_data_queue(), and udp_queue_rcv_one_skb(). Use a structure making it easy to extend the tool with additional hook points in the future. Make the eBPF programs compatible with Cloudflare's ebpf_exporter, and use the map helpers (maps.bpf.h) from ebpf_exporter to ensure maps are used in a compatible way. Open code the histogram maps for different hook points as entirely separate maps, instead of encoding the different hook points in the key of a separate map as ebpf_exporter often does. This avoids costly hashmap lookups, as simple array maps can be used instead of hash maps. Also include a minimal user space loader, which loads and attaches the eBPF programs. Later commits will extend this loader to also report the recorded latencies stored in the BPF maps. 
Signed-off-by: Simon Sundberg --- netstacklat/.gitignore | 1 + netstacklat/Makefile | 12 +++++ netstacklat/bits.bpf.h | 29 +++++++++++ netstacklat/maps.bpf.h | 84 ++++++++++++++++++++++++++++++ netstacklat/netstacklat.bpf.c | 96 +++++++++++++++++++++++++++++++++++ netstacklat/netstacklat.c | 85 +++++++++++++++++++++++++++++++ netstacklat/netstacklat.h | 39 ++++++++++++++ 7 files changed, 346 insertions(+) create mode 100644 netstacklat/.gitignore create mode 100644 netstacklat/Makefile create mode 100644 netstacklat/bits.bpf.h create mode 100644 netstacklat/maps.bpf.h create mode 100644 netstacklat/netstacklat.bpf.c create mode 100644 netstacklat/netstacklat.c create mode 100644 netstacklat/netstacklat.h diff --git a/netstacklat/.gitignore b/netstacklat/.gitignore new file mode 100644 index 00000000..1d232888 --- /dev/null +++ b/netstacklat/.gitignore @@ -0,0 +1 @@ +netstacklat diff --git a/netstacklat/Makefile b/netstacklat/Makefile new file mode 100644 index 00000000..7b01c512 --- /dev/null +++ b/netstacklat/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) + +USER_TARGETS := netstacklat +BPF_TARGETS := netstacklat.bpf +BPF_SKEL_OBJ := netstacklat.bpf.o + +EXTRA_DEPS += netstacklat.h bits.bpf.h maps.bpf.h + +LIB_DIR = ../lib + +include $(LIB_DIR)/common.mk + diff --git a/netstacklat/bits.bpf.h b/netstacklat/bits.bpf.h new file mode 100644 index 00000000..2b7e825d --- /dev/null +++ b/netstacklat/bits.bpf.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* From https://github.com/iovisor/bcc/blob/v0.25.0/libbpf-tools/bits.bpf.h*/ + +#ifndef __BITS_BPF_H +#define __BITS_BPF_H + +static __always_inline u64 log2(u32 v) +{ + u32 shift, r; + + r = (v > 0xFFFF) << 4; v >>= r; + shift = (v > 0xFF) << 3; v >>= shift; r |= shift; + shift = (v > 0xF) << 2; v >>= shift; r |= shift; + shift = (v > 0x3) << 1; v >>= shift; r |= shift; + r |= (v >> 1); + + return r; +} + +static __always_inline u64 log2l(u64 v) +{ + u32 
hi = v >> 32; + if (hi) + return log2(hi) + 32; + else + return log2(v); +} + +#endif /* __BITS_BPF_H */ diff --git a/netstacklat/maps.bpf.h b/netstacklat/maps.bpf.h new file mode 100644 index 00000000..fde01ad5 --- /dev/null +++ b/netstacklat/maps.bpf.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: MIT */ +/* From https://github.com/cloudflare/ebpf_exporter/blob/99d2752f9e0a095b57f53e5da6856ad143b0e443/examples/maps.bpf.h */ + +#include "bits.bpf.h" + +#define lookup_or_zero_init_key(map, key, into) \ + u64 zero = 0; \ + \ + into = bpf_map_lookup_elem(map, key); \ + if (!into) { \ + bpf_map_update_elem(map, key, &zero, BPF_NOEXIST); \ + into = bpf_map_lookup_elem(map, key); \ + if (!into) { \ + return 0; \ + } \ + } + +#define increment_variant(map, key, increment, variant) \ + u64 *count; \ + \ + lookup_or_zero_init_key(map, key, count); \ + \ + variant; \ + \ + return *count; + +static inline int increment_map(void *map, void *key, u64 increment) +{ + increment_variant(map, key, increment, __sync_fetch_and_add(count, increment)); +} + +static inline int increment_map_nosync(void *map, void *key, u64 increment) +{ + increment_variant(map, key, increment, *count += increment); +} + +// Arrays are always preallocated, so this only fails if the key is missing +#define read_array_ptr(map, key, into) \ + into = bpf_map_lookup_elem(map, key); \ + if (!into) { \ + return 0; \ + } + +#define _increment_histogram(map, key, increment, max_bucket, increment_fn) \ + if (key.bucket > max_bucket) { \ + key.bucket = max_bucket; \ + } \ + \ + increment_fn(map, &key, 1); \ + \ + if (increment > 0) { \ + key.bucket = max_bucket + 1; \ + increment_fn(map, &key, increment); \ + } + +#define _increment_ex2_histogram(map, key, increment, max_bucket, increment_fn) \ + key.bucket = log2l(increment); \ + \ + if (key.bucket > max_bucket) { \ + key.bucket = max_bucket; \ + } \ + \ + _increment_histogram(map, key, increment, max_bucket, increment_fn); + +#define 
increment_exp2_histogram(map, key, increment, max_bucket) \ + _increment_ex2_histogram(map, key, increment, max_bucket, increment_map) + +#define increment_exp2_histogram_nosync(map, key, increment, max_bucket) \ + _increment_ex2_histogram(map, key, increment, max_bucket, increment_map_nosync) + +#define _increment_exp2zero_histogram(map, key, increment, max_bucket, increment_fn) \ + if (increment == 0) { \ + key.bucket = 0; \ + } else { \ + key.bucket = log2l(increment) + 1; \ + } \ + \ + _increment_histogram(map, key, increment, max_bucket, increment_fn); + +#define increment_exp2zero_histogram(map, key, increment, max_bucket) \ + _increment_exp2zero_histogram(map, key, increment, max_bucket, increment_map) + +#define increment_exp2zero_histogram_nosync(map, key, increment, max_bucket) \ + _increment_exp2zero_histogram(map, key, increment, max_bucket, increment_map_nosync) diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c new file mode 100644 index 00000000..8b33419a --- /dev/null +++ b/netstacklat/netstacklat.bpf.c @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#include "vmlinux_local.h" +#include + +#include +#include +#include + +#include "netstacklat.h" +#include "maps.bpf.h" + +char LICENSE[] SEC("license") = "GPL"; + +static volatile const u64 TAI_OFFSET = (37UL * NS_PER_S); + +/* Helpers in maps.bpf.h require any histogram key to be a struct with a bucket member */ +struct hist_key { + u32 bucket; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, HIST_NBINS); + __type(key, u32); + __type(value, u64); +} tcp_v4_do_rcv_hist SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, HIST_NBINS); + __type(key, u32); + __type(value, u64); +} tcp_data_queue_hist SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, HIST_NBINS); + __type(key, u32); + __type(value, u64); +} udp_queue_rcv_hist SEC(".maps"); + +static void 
*hook_to_histmap(enum netstacklat_hook hook) +{ + switch (hook) { + case NETSTACKLAT_HOOK_TCP_V4_DO_RCV: + return &tcp_v4_do_rcv_hist; + case NETSTACKLAT_HOOK_TCP_DATA_QUEUE: + return &tcp_data_queue_hist; + case NETSTACKLAT_HOOK_UDP_QUEUE_RCV_ONE: + return &udp_queue_rcv_hist; + default: + return NULL; + } +} + +static void record_current_netstacklat(struct sk_buff *skb, + enum netstacklat_hook hook) +{ + ktime_t delta_ns, skb_tstamp; + struct hist_key key; + + if (!skb) + return; + + skb_tstamp = BPF_CORE_READ(skb, tstamp); + if (skb_tstamp == 0) + return; + + delta_ns = bpf_ktime_get_tai_ns() - TAI_OFFSET - skb_tstamp; + if (delta_ns < 0) + return; + + increment_exp2_histogram_nosync(hook_to_histmap(hook), key, delta_ns, + HIST_MAX_LATENCY_SLOT); +} + +SEC("fentry/tcp_v4_do_rcv") +int BPF_PROG(netstacklat_tcp_v4_do_rcv, struct sock *sk, struct sk_buff *skb) +{ + record_current_netstacklat(skb, NETSTACKLAT_HOOK_TCP_V4_DO_RCV); + return 0; +} + +SEC("fentry/tcp_data_queue") +int BPF_PROG(netstacklat_tcp_data_queue, struct sock *sk, struct sk_buff *skb) +{ + record_current_netstacklat(skb, NETSTACKLAT_HOOK_TCP_DATA_QUEUE); + return 0; +} + +SEC("fentry/udp_queue_rcv_one_skb") +int BPF_PROG(netstacklat_udp_queue_rcv, struct sock *sk, struct sk_buff *skb) +{ + record_current_netstacklat(skb, NETSTACKLAT_HOOK_UDP_QUEUE_RCV_ONE); + return 0; +} diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c new file mode 100644 index 00000000..0eb1ef17 --- /dev/null +++ b/netstacklat/netstacklat.c @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#include +#include +#include +#include +#include + +#include +#include + +#include "netstacklat.h" +#include "netstacklat.bpf.skel.h" + +static int init_signalfd(void) +{ + sigset_t mask; + int fd, err; + + sigemptyset(&mask); + sigaddset(&mask, SIGINT); + sigaddset(&mask, SIGTERM); + + fd = signalfd(-1, &mask, 0); + if (fd < 0) + return -errno; + + err = pthread_sigmask(SIG_BLOCK, &mask, NULL); + if (err) 
{ + err = -errno; + close(fd); + return err; + } + + return fd; +} + +int main(int argc, char *argv[]) +{ + struct signalfd_siginfo sig_info; + struct netstacklat_bpf *obj; + ssize_t read_bytes; + int sig_fd, err = 0; + char errmsg[128]; + + obj = netstacklat_bpf__open_and_load(); + if (!obj) { + err = libbpf_get_error(obj); + libbpf_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, "Failed loading eBPF programs: %s\n", errmsg); + return err; + } + + err = netstacklat_bpf__attach(obj); + if (err) { + libbpf_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, "Failed to attach eBPF programs: %s\n", errmsg); + goto exit_destroy; + } + + sig_fd = init_signalfd(); + if (sig_fd < 0) { + err = sig_fd; + fprintf(stderr, "Failed setting up signal handling: %s\n", + strerror(-err)); + goto exit_detach; + } + + printf("eBPF programs are now attached\n"); + printf("eBPF program will stay attached as long as this user space program is running\n"); + printf("Hit CTRL-C to quit\n"); + + read_bytes = read(sig_fd, &sig_info, sizeof(sig_info)); + if (read_bytes != sizeof(sig_info)) { + err = EINVAL; + goto exit_sigfd; + } + +exit_sigfd: + close(sig_fd); +exit_detach: + netstacklat_bpf__detach(obj); +exit_destroy: + netstacklat_bpf__destroy(obj); + return err; +} diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h new file mode 100644 index 00000000..ba996041 --- /dev/null +++ b/netstacklat/netstacklat.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef NETSTACKLAT_H +#define NETSTACKLAT_H + +// Histograms support values up to 2^30 ns-> 1s +#define HIST_MAX_LATENCY_SLOT 30 +/* + * MAX_LATENCY_SLOT + 1 bin for hist, +1 "bin" for the "sum key" + * (https://github.com/cloudflare/ebpf_exporter?tab=readme-ov-file#sum-keys) + * that ebpf_exporter expects for exp2 hists, see the _increment_histogram + * macro in maps.bpf.h. 
+ */ +#define HIST_NBINS (HIST_MAX_LATENCY_SLOT + 2) + +#define NS_PER_S 1000000000 + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) +#endif + +#ifndef max +#define max(a, b) \ + ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a > _b ? _a : _b; \ + }) +#endif + +enum netstacklat_hook { + NETSTACKLAT_HOOK_INVALID = 0, + NETSTACKLAT_HOOK_TCP_V4_DO_RCV, + NETSTACKLAT_HOOK_TCP_DATA_QUEUE, + NETSTACKLAT_HOOK_UDP_QUEUE_RCV_ONE, + NETSTACKLAT_N_HOOKS, +}; + +#endif + From 50c07696924e2e7571ff06b441bf5f5e367f7d16 Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Fri, 24 Jan 2025 14:23:06 +0100 Subject: [PATCH 2/8] netstacklat: Make userspace prog report the recorded latencies Add functionality to the user space component to periodically fetch the BPF maps netstacklat records the values in and print them out. Base the core program loop on the same setup as pping, where a single epoll instance is used to support multiple different types of events. So far it only deals with signal handling (for clean shutdown) plus a timer (for periodical reporting), but the setup can easily be extended if the program grows more complex in the future. Use the (somewhat complicated) bpf_map_lookup_batch to fetch the entire histogram maps in a single system call (instead of performing a lookup for each bin index individually). 
Signed-off-by: Simon Sundberg --- netstacklat/Makefile | 1 + netstacklat/netstacklat.c | 454 +++++++++++++++++++++++++++++++++++++- 2 files changed, 445 insertions(+), 10 deletions(-) diff --git a/netstacklat/Makefile b/netstacklat/Makefile index 7b01c512..759d2274 100644 --- a/netstacklat/Makefile +++ b/netstacklat/Makefile @@ -5,6 +5,7 @@ BPF_TARGETS := netstacklat.bpf BPF_SKEL_OBJ := netstacklat.bpf.o EXTRA_DEPS += netstacklat.h bits.bpf.h maps.bpf.h +LDLIBS += -lm LIB_DIR = ../lib diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c index 0eb1ef17..6f24a1f6 100644 --- a/netstacklat/netstacklat.c +++ b/netstacklat/netstacklat.c @@ -3,14 +3,283 @@ #include #include #include +#include +#include #include +#include +#include #include #include +#include #include "netstacklat.h" #include "netstacklat.bpf.skel.h" +struct netstacklat_config { + double report_interval_s; +}; + +#define MAX_EPOLL_EVENTS 8 + +/* + * Used to pack both a "type" and a value into the epoll_event.data.u64 member. + * The topmost bits indicate the type (SIG, TIMER, etc) while the remaining + * bits can be used for the value. The MASK can be used to filter out the + * type/value. 
+ */ +#define NETSTACKLAT_EPOLL_SIG (1ULL << 63) +#define NETSTACKLAT_EPOLL_TIMER (1ULL << 62) +#define NETSTACKLAT_EPOLL_MASK \ + (~(NETSTACKLAT_EPOLL_SIG | NETSTACKLAT_EPOLL_TIMER)) + +// Magical value used to indicate that the program should be aborted +#define NETSTACKLAT_ABORT 424242 + +#define MAX_BINSPAN_STRLEN 16 +#define MAX_BINCOUNT_STRLEN 10 +#define MAX_BAR_STRLEN (80 - 6 - MAX_BINSPAN_STRLEN - MAX_BINCOUNT_STRLEN) + +static const char *hook_to_str(enum netstacklat_hook hook) +{ + switch (hook) { + case NETSTACKLAT_HOOK_TCP_V4_DO_RCV: + return "tcp_v4_do_rcv"; + case NETSTACKLAT_HOOK_TCP_DATA_QUEUE: + return "tcp_data_queue"; + case NETSTACKLAT_HOOK_UDP_QUEUE_RCV_ONE: + return "udp_queue_rcv_one_skb"; + default: + return "invalid"; + } +} + +static int hook_to_histmap(enum netstacklat_hook hook, + const struct netstacklat_bpf *obj) +{ + switch (hook) { + case NETSTACKLAT_HOOK_TCP_V4_DO_RCV: + return bpf_map__fd(obj->maps.tcp_v4_do_rcv_hist); + case NETSTACKLAT_HOOK_TCP_DATA_QUEUE: + return bpf_map__fd(obj->maps.tcp_data_queue_hist); + case NETSTACKLAT_HOOK_UDP_QUEUE_RCV_ONE: + return bpf_map__fd(obj->maps.udp_queue_rcv_hist); + default: + return -EINVAL; + } +} + +static int find_first_nonzero(int nbins, __u64 hist[nbins]) +{ + int i; + + for (i = 0; i < nbins; i++) { + if (hist[i] > 0) + return i; + } + + return -1; +} + +static int find_last_nonzero(int nbins, __u64 hist[nbins]) +{ + int i; + + for (i = nbins - 1; i >= 0; i--) { + if (hist[i] > 0) + return i; + } + + return -1; +} + +static __u64 find_largest_bin(int nbins, __u64 hist[nbins]) /* returns the largest count found in hist, not a bin index */ +{ + __u64 max_val = 0; + int i; + + for (i = 0; i < nbins; i++) { + if (hist[i] > max_val) + max_val = hist[i]; + } + + return max_val; +} + +static double ns_to_siprefix(double ns, char **prefix) +{ + static char *prefixes[] = { "n", "u", "m", "" }; + int psteps = 0; + + while (ns >= 1000 && psteps < ARRAY_SIZE(prefixes) - 1) { + ns /= 1000; + psteps++; + } + + *prefix = prefixes[psteps]; + + return ns; +} + 
+static void print_nchars(FILE *stream, char c, int n) +{ + while (n-- > 0) + fprintf(stream, "%c", c); +} + +static int print_bin_interval(FILE *stream, double low_bound_ns, + double high_bound_ns) +{ + char *lprefix, *hprefix; + double low_si, high_si; + + low_si = ns_to_siprefix(low_bound_ns, &lprefix); + + if (isinf(high_bound_ns)) { + high_si = INFINITY; + hprefix = " "; + } else { + high_si = ns_to_siprefix(high_bound_ns, &hprefix); + } + + return fprintf(stream, "[%.3g%ss, %.3g%ss)", low_si, lprefix, high_si, + hprefix); +} + +static void print_histbar(FILE *stream, __u64 count, __u64 max_count) +{ + int barlen = round((double)count / max_count * MAX_BAR_STRLEN); + + fprintf(stream, "|"); + print_nchars(stream, '@', barlen); + print_nchars(stream, ' ', MAX_BAR_STRLEN - barlen); + fprintf(stream, "|"); +} + +static void print_log2hist(FILE *stream, int nbins, __u64 hist[nbins], + double multiplier) +{ + int bin, start_bin, end_bin, len; + __u64 max_bin = find_largest_bin(nbins - 1, hist); /* largest per-bin count; kept as __u64 so large counts are not truncated */ + double low_bound, high_bound = 0; + + start_bin = find_first_nonzero(nbins - 1, hist); + end_bin = find_last_nonzero(nbins - 1, hist); + + for (bin = max(0, start_bin); bin <= end_bin; bin++) { + low_bound = pow(2, bin) * multiplier; + high_bound = low_bound * 2; + + /* + * First bin will also include 0, i.e. [0, 2) + * Final bin will include any values too large to fit in the + * second-last bin. 
+ */ + if (bin == 0) + low_bound = 0; + if (bin == nbins - 2) + high_bound = INFINITY; + + len = print_bin_interval(stream, low_bound, high_bound); + print_nchars(stream, ' ', max(0, MAX_BINSPAN_STRLEN - len) + 1); + fprintf(stream, "%*llu ", MAX_BINCOUNT_STRLEN, hist[bin]); + + print_histbar(stream, hist[bin], max_bin); + + fprintf(stream, "\n"); + } + + // Final "bin" contains the total sum of all values rather than a count + fprintf(stream, "Sum: %llu\n", hist[nbins - 1]); +} + +static void merge_percpu_hist(int nbins, int ncpus, + const __u64 percpu_hist[nbins][ncpus], + __u64 merged_hist[nbins]) +{ + int idx, cpu; + + memset(merged_hist, 0, sizeof(__u64) * nbins); + + for (idx = 0; idx < nbins; idx++) { + for (cpu = 0; cpu < ncpus; cpu++) { + merged_hist[idx] += percpu_hist[idx][cpu]; + } + } +} + +static int fetch_hist_map(int map_fd, __u64 hist[HIST_NBINS]) +{ + __u32 in_batch, out_batch, count = HIST_NBINS; + int ncpus = libbpf_num_possible_cpus(); + __u32 idx, idxs_fetched = 0; + __u64 (*percpu_hist)[ncpus]; + __u32 *keys; + int err = 0; + + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, batch_opts, .flags = BPF_EXIST); + + percpu_hist = calloc(HIST_NBINS, sizeof(*percpu_hist)); + keys = calloc(HIST_NBINS, sizeof(*keys)); + if (!percpu_hist || !keys) + return -ENOMEM; + + while (idxs_fetched < HIST_NBINS) { + err = bpf_map_lookup_batch(map_fd, + idxs_fetched > 0 ? 
&in_batch : NULL, + &out_batch, keys + idxs_fetched, + percpu_hist + idxs_fetched, &count, + &batch_opts); + if (err == -ENOENT) // All entries fetched + err = 0; + else if (err) + goto exit; + + // Verify keys match expected idx range + for (idx = idxs_fetched; idx < idxs_fetched + count; idx++) { + if (keys[idx] != idx) { + err = -EBADSLT; + goto exit; + } + } + + in_batch = out_batch; + idxs_fetched += count; + count = HIST_NBINS - idxs_fetched; + } + + merge_percpu_hist(HIST_NBINS, ncpus, percpu_hist, hist); + +exit: + free(percpu_hist); + free(keys); + return err; +} + +static int report_stats(const struct netstacklat_bpf *obj) +{ + enum netstacklat_hook hook; + __u64 hist[HIST_NBINS] = { 0 }; + time_t t; + int err; + + time(&t); + printf("%s", ctime(&t)); + + for (hook = 1; hook < NETSTACKLAT_N_HOOKS; hook++) { + printf("%s:\n", hook_to_str(hook)); + + err = fetch_hist_map(hook_to_histmap(hook, obj), hist); + if (err) + return err; + + print_log2hist(stdout, ARRAY_SIZE(hist), hist, 1); + printf("\n"); + } + + return 0; +} + static int init_signalfd(void) { sigset_t mask; @@ -34,12 +303,150 @@ static int init_signalfd(void) return fd; } -int main(int argc, char *argv[]) +static int handle_signal(int sig_fd) { struct signalfd_siginfo sig_info; + ssize_t size; + + size = read(sig_fd, &sig_info, sizeof(sig_info)); + if (size != sizeof(sig_info)) { + fprintf(stderr, "Failed reading signal fd\n"); + return -EBADFD; + } + + switch (sig_info.ssi_signo) { + case SIGINT: + case SIGTERM: + return NETSTACKLAT_ABORT; + default: + fprintf(stderr, "Unexpected signal: %d\n", sig_info.ssi_signo); + return -EBADR; + } +} + +static int setup_timer(__u64 interval_ns) +{ + struct itimerspec timercfg = { + .it_value = { .tv_sec = interval_ns / NS_PER_S, + .tv_nsec = interval_ns % NS_PER_S }, + .it_interval = { .tv_sec = interval_ns / NS_PER_S, + .tv_nsec = interval_ns % NS_PER_S } + }; + int fd, err; + + fd = timerfd_create(CLOCK_MONOTONIC, 0); + if (fd < 0) { + return -errno; + } 
+ + err = timerfd_settime(fd, 0, &timercfg, NULL); + if (err) { + err = -errno; + close(fd); + return err; + } + + return fd; +} + +static int handle_timer(int timer_fd, const struct netstacklat_bpf *obj) +{ + __u64 timer_exps; + ssize_t size; + + size = read(timer_fd, &timer_exps, sizeof(timer_exps)); + if (size != sizeof(timer_exps)) { + fprintf(stderr, "Failed reading timer fd\n"); + return -EBADFD; + } + + if (timer_exps == 0) + return 0; + if (timer_exps > 1) + fprintf(stderr, "Warning: Missed %llu reporting intervals\n", + timer_exps - 1); + + return report_stats(obj); +} + +static int epoll_add_event(int epoll_fd, int fd, __u64 event_type, __u64 value) +{ + struct epoll_event ev = { + .events = EPOLLIN, + .data = { .u64 = event_type | value }, + }; + + if (value & ~NETSTACKLAT_EPOLL_MASK) + return -EINVAL; + + return epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) ? -errno : 0; +} + +static int setup_epoll_instance(int sig_fd, int timer_fd) +{ + int epoll_fd, err = 0; + + epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (epoll_fd < 0) + return -errno; + + err = epoll_add_event(epoll_fd, sig_fd, NETSTACKLAT_EPOLL_SIG, sig_fd); + if (err) + goto err; + + err = epoll_add_event(epoll_fd, timer_fd, NETSTACKLAT_EPOLL_TIMER, + timer_fd); + if (err) + goto err; + + return epoll_fd; + +err: + close(epoll_fd); + return err; +} + +static int poll_events(int epoll_fd, const struct netstacklat_bpf *obj) +{ + struct epoll_event events[MAX_EPOLL_EVENTS]; + int i, n, fd, err = 0; + __u64 epoll_type; + + n = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 100); + if (n < 0) + return -errno; + + for (i = 0; i < n; i++) { + epoll_type = events[i].data.u64 & ~NETSTACKLAT_EPOLL_MASK; + fd = events[i].data.u64 & NETSTACKLAT_EPOLL_MASK; + + switch (epoll_type) { + case NETSTACKLAT_EPOLL_SIG: + err = handle_signal(fd); + break; + case NETSTACKLAT_EPOLL_TIMER: + err = handle_timer(fd, obj); + break; + default: + fprintf(stderr, "Warning: unexpected epoll data: %lu\n", + events[i].data.u64); 
+ break; + } + + if (err) + break; + } + + return err; +} + +int main(int argc, char *argv[]) +{ + struct netstacklat_config config = { + .report_interval_s = 5, + }; + int sig_fd, timer_fd, epoll_fd, err = 0; struct netstacklat_bpf *obj; - ssize_t read_bytes; - int sig_fd, err = 0; char errmsg[128]; obj = netstacklat_bpf__open_and_load(); @@ -65,16 +472,43 @@ int main(int argc, char *argv[]) goto exit_detach; } - printf("eBPF programs are now attached\n"); - printf("eBPF program will stay attached as long as this user space program is running\n"); - printf("Hit CTRL-C to quit\n"); - - read_bytes = read(sig_fd, &sig_info, sizeof(sig_info)); - if (read_bytes != sizeof(sig_info)) { - err = EINVAL; + timer_fd = setup_timer(config.report_interval_s * NS_PER_S); + if (timer_fd < 0) { + err = timer_fd; + fprintf(stderr, "Failed creating timer: %s\n", strerror(-err)); goto exit_sigfd; } + epoll_fd = setup_epoll_instance(sig_fd, timer_fd); + if (epoll_fd < 0) { + err = epoll_fd; + fprintf(stderr, "Failed setting up epoll: %s\n", + strerror(-err)); + goto exit_timerfd; + } + + // Report stats until user shuts down program + while (true) { + err = poll_events(epoll_fd, obj); + + if (err) { + if (err == NETSTACKLAT_ABORT) { + // Report stats a final time before terminating + report_stats(obj); + err = 0; + } else { + libbpf_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, "Failed polling fds: %s\n", + errmsg); + } + break; + } + } + + // Cleanup + close(epoll_fd); +exit_timerfd: + close(timer_fd); exit_sigfd: close(sig_fd); exit_detach: From 2be91ac2424cc911445d2bc193d06234c71b3b50 Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Tue, 14 Jan 2025 11:01:28 +0100 Subject: [PATCH 3/8] netstacklat: Add README Add a README briefly explaining what this example does. 
Signed-off-by: Simon Sundberg --- netstacklat/README.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 netstacklat/README.md diff --git a/netstacklat/README.md b/netstacklat/README.md new file mode 100644 index 00000000..e228b19f --- /dev/null +++ b/netstacklat/README.md @@ -0,0 +1,36 @@ +# Netstacklat - Monitor latency within the network stack +Netstacklat is a simple tool for monitoring latency within the Linux +network stack for ingress traffic. The tool relies on the kernel time +stamping received packets (`SOF_TIMESTAMPING_RX_SOFTWARE`), +specifically setting `sk_buff->tstamp`, and then reports when packets +arrive at various hooks relative to this timestamp, i.e. the time +that elapses between the packet being timestamped and it reaching +the hook. + +The tool is based on the following bpftrace script from Jesper +Dangaard Brouer: +```console +sudo bpftrace -e ' + kfunc:tcp_v4_do_rcv, + kfunc:tcp_data_queue, + kfunc:udp_queue_rcv_one_skb + { + $tai_offset=37000000000; + $now=nsecs(tai)-$tai_offset; @cnt[probe]=count(); @total[probe]=count(); + $ts=args->skb->tstamp; $delta=$now-(uint64)$ts; + @hist_ns[probe]=hist($delta); + @stats[probe]=stats($delta); + //printf("now:%llu - ts:%llu = delta:%llu\n", $now, $ts, $delta); + } + interval:s:10 {time("\n%H:%M:%S\n"); + print(@cnt); clear(@cnt); + print(@total); + print(@stats); + print(@hist_ns); + }' +``` + +The eBPF part of the tool (`netstacklat.bpf.c`) is designed to be +compatible with +[ebpf_exporter](https://github.com/cloudflare/ebpf_exporter), so that +the data can easily be exported to Prometheus. From 4fb8a1ade7740e8be6f7377d397013dba09aa802 Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Thu, 23 Jan 2025 16:57:48 +0100 Subject: [PATCH 4/8] netstacklat: Add command line argument parsing Add support for parsing arguments from the command line. Add the --help/-h and --report-interval/-r options. 
As part of this, add some minor documentation about the program and the available options that are displayed by the --help option. Base the print_usage() function on the usage() function from the traffic-pacing-edt example, but change its output format to be more consistent with most GNU utilities, listing the short option (if any) before the long option. Furthermore, also show if an argument is expected based on the has_arg value. Automatically generate the optstring that getopt_long() expects from the options array. While needlessly complicated for the two options currently supported, it simplifies adding additional options in the future. To allow for mapping the opt value returned by getopt_long() back to the corresponding option struct (for example to access the full name of the option), add the optval_to_longopt() helper that linearly searches through the options array. This is more reliable than using the longindex pointer from getopt_long(), as that pointer is only set when a long option is passed, and not when the corresponding short option is passed. 
Signed-off-by: Simon Sundberg --- netstacklat/netstacklat.c | 144 +++++++++++++++++++++++++++++++++++++- 1 file changed, 143 insertions(+), 1 deletion(-) diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c index 6f24a1f6..9523e5f2 100644 --- a/netstacklat/netstacklat.c +++ b/netstacklat/netstacklat.c @@ -1,10 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ +static const char *__doc__ = + "Netstacklat - Monitor latency to various points in the ingress network stack"; + #include #include #include #include #include #include +#include +#include #include #include #include @@ -40,6 +45,136 @@ struct netstacklat_config { #define MAX_BINCOUNT_STRLEN 10 #define MAX_BAR_STRLEN (80 - 6 - MAX_BINSPAN_STRLEN - MAX_BINCOUNT_STRLEN) +static const struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "report-interval", required_argument, NULL, 'r' }, + { 0, 0, 0, 0 } +}; + +static const struct option *optval_to_longopt(int val) +{ + int i; + + for (i = 0; long_options[i].name != 0; i++) { + if (long_options[i].val == val) + return &long_options[i]; + } + + return NULL; +} + +static int generate_optstr(char *buf, size_t size) +{ + int i, optlen, strlen = 0; + char optstr[4]; + + for (i = 0; long_options[i].name != 0; i++) { + if (long_options[i].flag || !isalnum(long_options[i].val)) + continue; + + optlen = snprintf( + optstr, sizeof(optstr), "%c%s", long_options[i].val, + long_options[i].has_arg == optional_argument ? "::" : + long_options[i].has_arg == required_argument ? 
":" : + ""); + if (strlen + optlen < size) { + strncpy(buf + strlen, optstr, optlen + 1); + } + strlen += optlen; + } + + return strlen + 1; +} + +static void print_usage(FILE *stream, const char *prog_name) +{ + int i; + + fprintf(stream, "\nDOCUMENTATION:\n%s\n", __doc__); + fprintf(stream, "\n"); + fprintf(stream, " Usage: %s (options-see-below)\n", prog_name); + fprintf(stream, " Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + if (!long_options[i].flag && isalnum(long_options[i].val)) + fprintf(stream, " -%c, ", long_options[i].val); + else + fprintf(stream, " "); + + printf(" --%s", long_options[i].name); + + if (long_options[i].has_arg == required_argument) + fprintf(stream, " "); + else if (long_options[i].has_arg == optional_argument) + fprintf(stream, "[ARG]"); + + fprintf(stream, "\n"); + } + printf("\n"); +} + +static int parse_bounded_double(double *res, const char *str, double low, + double high, const char *name) +{ + char *endptr; + errno = 0; + + *res = strtod(str, &endptr); + if (endptr == str || strlen(str) != endptr - str) { + fprintf(stderr, "%s %s is not a valid number\n", name, str); + return -EINVAL; + } + + if (errno == ERANGE) { + fprintf(stderr, "%s %s overflowed\n", name, str); + return -ERANGE; + } + + if (*res < low || *res > high) { + fprintf(stderr, "%s must be in range [%g, %g]\n", name, low, high); + return -ERANGE; + } + + return 0; +} + +int parse_arguments(int argc, char *argv[], struct netstacklat_config *conf) +{ + char optstr[64]; + int opt, err; + + double fval; + + if (generate_optstr(optstr, sizeof(optstr)) > sizeof(optstr)) { + fprintf(stderr, + "Internal error: optstr too short to fit all long_options\n"); + return -ENAMETOOLONG; + } + + while ((opt = getopt_long(argc, argv, optstr, long_options, + NULL)) != -1) { + switch (opt) { + case 'r': // report interval + err = parse_bounded_double( + &fval, optarg, 0.01, 3600 * 24, + optval_to_longopt(opt)->name); + if (err) + return err; + + 
conf->report_interval_s = fval; + break; + case 'h': // help + print_usage(stdout, argv[0]); + exit(EXIT_SUCCESS); + default: + // unrecognized option reported by getopt, so just print usage + print_usage(stderr, argv[0]); + return -EINVAL; + } + } + + return 0; +} + static const char *hook_to_str(enum netstacklat_hook hook) { switch (hook) { @@ -445,10 +580,17 @@ int main(int argc, char *argv[]) struct netstacklat_config config = { .report_interval_s = 5, }; - int sig_fd, timer_fd, epoll_fd, err = 0; + int sig_fd, timer_fd, epoll_fd, err; struct netstacklat_bpf *obj; char errmsg[128]; + err = parse_arguments(argc, argv, &config); + if (err) { + fprintf(stderr, "Failed parsing arguments: %s\n", + strerror(-err)); + return err; + } + obj = netstacklat_bpf__open_and_load(); if (!obj) { err = libbpf_get_error(obj); From 7e178dfc092bcbce010a5fd51af63e1d9b11cbab Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Thu, 23 Jan 2025 20:08:23 +0100 Subject: [PATCH 5/8] netstacklat: Enable RX software timestamping on startup The netstacklat tool depends on the tstamp member in the skbs to be set by the kernel. So make the user space loader enable software RX timestamping on all packets when the tool is started. Note that, as the documentation at https://docs.kernel.org/networking/timestamping.html points out, enabling software receive timestamping on a socket enables it for all packets, as the destination socket is not known early in the network stack. It is therefore sufficient to enable it on a socket created by the program itself, even if no packets are received on that socket. 
Signed-off-by: Simon Sundberg --- netstacklat/netstacklat.c | 53 ++++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c index 9523e5f2..084d0132 100644 --- a/netstacklat/netstacklat.c +++ b/netstacklat/netstacklat.c @@ -13,10 +13,12 @@ static const char *__doc__ = #include #include #include +#include #include #include #include +#include #include "netstacklat.h" #include "netstacklat.bpf.skel.h" @@ -415,6 +417,34 @@ static int report_stats(const struct netstacklat_bpf *obj) return 0; } +static int enable_sw_rx_tstamps(void) +{ + int tstamp_opt = SOF_TIMESTAMPING_RX_SOFTWARE; + int sock_fd, err; + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + err = -errno; + fprintf(stderr, "Failed opening socket: %s\n", strerror(-err)); + return err; + } + + err = setsockopt(sock_fd, SOL_SOCKET, SO_TIMESTAMPING, &tstamp_opt, + sizeof(tstamp_opt)); + if (err) { + err = -errno; + fprintf(stderr, "Failed setting SO_TIMESTAMPING option: %s\n", + strerror(-err)); + goto err_socket; + } + + return sock_fd; /* hand the open socket to the caller: main() tests the result with < 0 and close()s it on exit, so returning 0 here would leak the socket and close stdin instead */ + +err_socket: + close(sock_fd); + return err; +} + static int init_signalfd(void) { sigset_t mask; @@ -580,7 +610,7 @@ int main(int argc, char *argv[]) struct netstacklat_config config = { .report_interval_s = 5, }; - int sig_fd, timer_fd, epoll_fd, err; + int sig_fd, timer_fd, epoll_fd, sock_fd, err; struct netstacklat_bpf *obj; char errmsg[128]; @@ -591,19 +621,28 @@ int main(int argc, char *argv[]) return err; } + sock_fd = enable_sw_rx_tstamps(); + if (sock_fd < 0) { + err = sock_fd; + fprintf(stderr, + "Failed enabling software RX timestamping: %s\n", + strerror(-err)); + return err; + } + obj = netstacklat_bpf__open_and_load(); if (!obj) { err = libbpf_get_error(obj); libbpf_strerror(err, errmsg, sizeof(errmsg)); fprintf(stderr, "Failed loading eBPF programs: %s\n", errmsg); - return err; + goto exit_sockfd; } err = netstacklat_bpf__attach(obj); if (err) {
libbpf_strerror(err, errmsg, sizeof(errmsg)); fprintf(stderr, "Failed to attach eBPF programs: %s\n", errmsg); - goto exit_destroy; + goto exit_destroy_bpf; } sig_fd = init_signalfd(); @@ -611,7 +650,7 @@ int main(int argc, char *argv[]) err = sig_fd; fprintf(stderr, "Failed setting up signal handling: %s\n", strerror(-err)); - goto exit_detach; + goto exit_detach_bpf; } timer_fd = setup_timer(config.report_interval_s * NS_PER_S); @@ -653,9 +692,11 @@ int main(int argc, char *argv[]) close(timer_fd); exit_sigfd: close(sig_fd); -exit_detach: +exit_detach_bpf: netstacklat_bpf__detach(obj); -exit_destroy: +exit_destroy_bpf: netstacklat_bpf__destroy(obj); +exit_sockfd: + close(sock_fd); return err; } From 0e1087fd6b0df823798a57a3c9f0b094044a78f0 Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Thu, 23 Jan 2025 21:32:46 +0100 Subject: [PATCH 6/8] netstacklat: Initialize the TAI-offset from user space Make the user space loader calculate the TAI-offset at startup and set it as a constant for the eBPF programs. Split the open and loading stages of the eBPF programs apart to enable setting constants in the eBPF programs. Note that on some systems (e.g. most Debian systems by default), the TAI offset may (incorrectly) be 0, so that CLOCK_TAI becomes identical to CLOCK_REALTIME. While this is incorrect in principle, it does not pose an issue for netstacklat, as it only needs the TAI offset to translate CLOCK_TAI to CLOCK_REALTIME (which skb->tstamp is assumed to use as clock basis). Therefore, netstacklat will (from this commit) work correctly even if the TAI offset is not correctly set on the system. Limitation: The TAI offset is only set once, when the program is first loaded, and is not dynamically updated in case the TAI offset changes. So if the program is running while a leap second occurs, the recorded latencies may be off by one second. Furthermore, as the TAI offset is set from user space, it will not work when just using the eBPF portion together with ebpf-exporter.
Signed-off-by: Simon Sundberg --- netstacklat/netstacklat.bpf.c | 2 +- netstacklat/netstacklat.c | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c index 8b33419a..c637ac2b 100644 --- a/netstacklat/netstacklat.bpf.c +++ b/netstacklat/netstacklat.bpf.c @@ -11,7 +11,7 @@ char LICENSE[] SEC("license") = "GPL"; -static volatile const u64 TAI_OFFSET = (37UL * NS_PER_S); +volatile const signed long long TAI_OFFSET = (37LL * NS_PER_S); /* Helpers in maps.bpf.h require any histogram key to be a struct with a bucket member */ struct hist_key { diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c index 084d0132..6c9d21b8 100644 --- a/netstacklat/netstacklat.c +++ b/netstacklat/netstacklat.c @@ -14,6 +14,7 @@ static const char *__doc__ = #include #include #include +#include #include #include @@ -445,6 +446,14 @@ static int enable_sw_rx_tstamps(void) return err; } +static long get_tai_offset(void) +{ + struct ntptimeval ntpt; + + /* ntp_gettimex() returns a negative value on failure, leaving ntpt unusable; fall back to a TAI offset of 0 instead of reading an indeterminate field */ + return ntp_gettimex(&ntpt) < 0 ? 0 : ntpt.tai; +} + static int init_signalfd(void) { sigset_t mask; @@ -630,14 +639,23 @@ int main(int argc, char *argv[]) return err; } - obj = netstacklat_bpf__open_and_load(); + obj = netstacklat_bpf__open(); if (!obj) { err = libbpf_get_error(obj); libbpf_strerror(err, errmsg, sizeof(errmsg)); - fprintf(stderr, "Failed loading eBPF programs: %s\n", errmsg); + fprintf(stderr, "Failed opening eBPF object file: %s\n", errmsg); goto exit_sockfd; } + obj->rodata->TAI_OFFSET = (signed long long)get_tai_offset() * NS_PER_S; + + err = netstacklat_bpf__load(obj); + if (err) { + libbpf_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, "Failed loading eBPF programs: %s\n", errmsg); + goto exit_destroy_bpf; + } + err = netstacklat_bpf__attach(obj); if (err) { libbpf_strerror(err, errmsg, sizeof(errmsg)); From 2fc4dc50e9c8b68d49f4737b3dd860e30a956b27 Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Tue, 28 Jan
2025 10:12:05 +0100 Subject: [PATCH 7/8] netstacklat: Add ebpf-exporter config Add a YAML configuration that ebpf-exporter can use to convert the BPF maps to Prometheus metrics. Convert the histogram map for each hook into the corresponding Prometheus histogram. Signed-off-by: Simon Sundberg --- netstacklat/netstacklat.yaml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 netstacklat/netstacklat.yaml diff --git a/netstacklat/netstacklat.yaml b/netstacklat/netstacklat.yaml new file mode 100644 index 00000000..4b7a023c --- /dev/null +++ b/netstacklat/netstacklat.yaml @@ -0,0 +1,35 @@ +metrics: + histograms: + - name: tcp_v4_do_rcv_hist + help: Netstack latency histogram for tcp_v4_do_rcv() in seconds + bucket_type: exp2 + bucket_min: 0 + bucket_max: 30 + bucket_multiplier: 0.000000001 # nanoseconds to seconds + labels: + - name: bucket + size: 4 + decoders: + - name: uint + - name: tcp_data_queue_hist + help: Netstack latency histogram for tcp_data_queue() in seconds + bucket_type: exp2 + bucket_min: 0 + bucket_max: 30 + bucket_multiplier: 0.000000001 # nanoseconds to seconds + labels: + - name: bucket + size: 4 + decoders: + - name: uint + - name: udp_queue_rcv_hist + help: Netstack latency histogram for udp_queue_rcv_one_skb() in seconds + bucket_type: exp2 + bucket_min: 0 + bucket_max: 30 + bucket_multiplier: 0.000000001 # nanoseconds to seconds + labels: + - name: bucket + size: 4 + decoders: + - name: uint From c875957ef5171e747a49494a0d687d6dd4dfd169 Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Tue, 28 Jan 2025 12:01:11 +0100 Subject: [PATCH 8/8] netstacklat: Make build process compatible with ebpf-exporter Add a BPF_EXAMPLES macro flag that lets netstacklat.bpf.c know whether it is being compiled in the bpf-examples repository or in ebpf-exporter, and allows it to adjust the included headers accordingly.
This allows copying the netstacklat.bpf.c, netstacklat.h and netstacklat.yaml files directly into ebpf_exporter/examples without any modifications. Signed-off-by: Simon Sundberg --- netstacklat/Makefile | 1 + netstacklat/netstacklat.bpf.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/netstacklat/Makefile b/netstacklat/Makefile index 759d2274..25fa9098 100644 --- a/netstacklat/Makefile +++ b/netstacklat/Makefile @@ -4,6 +4,7 @@ USER_TARGETS := netstacklat BPF_TARGETS := netstacklat.bpf BPF_SKEL_OBJ := netstacklat.bpf.o +EXTRA_CFLAGS += -DBPF_EXAMPLES EXTRA_DEPS += netstacklat.h bits.bpf.h maps.bpf.h LDLIBS += -lm diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c index c637ac2b..839be433 100644 --- a/netstacklat/netstacklat.bpf.c +++ b/netstacklat/netstacklat.bpf.c @@ -1,6 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifdef BPF_EXAMPLES #include "vmlinux_local.h" #include +#else +#include +#endif #include #include