From c71f4cab3f5d9184b6bb83d49f95df7ccfe6006a Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 7 Jan 2025 10:24:02 -0800 Subject: [PATCH 1/2] include: import cond_break definition from the kernel --- scheds/include/scx/bpf_arena_common.h | 80 +++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/scheds/include/scx/bpf_arena_common.h b/scheds/include/scx/bpf_arena_common.h index e459ba476..d06bca85a 100644 --- a/scheds/include/scx/bpf_arena_common.h +++ b/scheds/include/scx/bpf_arena_common.h @@ -100,6 +100,86 @@ void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cn int node_id, __u64 flags) __ksym __weak; void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak; +/* + * Note that cond_break can only be portably used in the body of a breakable + * construct, whereas can_loop can be used anywhere. + */ +#ifdef __BPF_FEATURE_MAY_GOTO +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define cond_break \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: break; \ + l_continue:; \ + }) +#else +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define cond_break \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: break; \ + l_continue:; \ + }) +#else +#define can_loop \ + ({ __label__ l_break, 
l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define cond_break \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: break; \ + l_continue:; \ + }) +#endif +#endif + #else /* when compiled as user space code */ /* Provide the definition of PAGE_SIZE. */ From 63b6be90f55f8ad0c461d013468ae4b299805d8e Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 7 Jan 2025 11:17:00 -0800 Subject: [PATCH 2/2] lib/sdt_alloc: replace bpf_for loops in critical section with can_loop directive --- lib/sdt_alloc.bpf.c | 89 +++++++++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 24 deletions(-) diff --git a/lib/sdt_alloc.bpf.c b/lib/sdt_alloc.bpf.c index 0d6e258b8..dd3ecb1c5 100644 --- a/lib/sdt_alloc.bpf.c +++ b/lib/sdt_alloc.bpf.c @@ -23,6 +23,17 @@ struct { struct sdt_alloc_stack __arena *prealloc_stack; +/* + * Necessary for cond_break/can_loop's semantics. According to kernel commit + * 011832b, the loop counter variable must be seen as imprecise and bounded + * by the verifier. Initializing it from a constant (e.g., i = 0;), then, + * makes it precise and prevents may_goto from helping with converging the + * loop. For these loops we must initialize the loop counter from a variable + * whose value the verifier cannot reason about when checking the program, so + * that the loop counter's value is imprecise. + */ +static __u64 zero = 0; + /* * XXX Hack to get the verifier to find the arena for sdt_exit_task. 
* As of 6.12-rc5, The verifier associates arenas with programs by @@ -174,7 +185,17 @@ int sdt_alloc_stack(struct sdt_alloc_stack __arena *stack) static SDT_TASK_FN_ATTRS int sdt_alloc_attempt(struct sdt_alloc_stack __arena *stack) { - bpf_repeat(SDT_TASK_ALLOC_ATTEMPTS) { + int i; + + /* + * Use can_loop to help with verification. The can_loop macro was + * introduced in kernel commit ba39486 and wraps around the may_goto + * instruction that helps with verifying for loops. Using may_goto + * embeds a switch into the loop that ensures it is considered + * terminable by the verifier by adding a hidden counter to the loop + * and counting down with every iteration. + */ + for (i = zero; i < SDT_TASK_ALLOC_ATTEMPTS && can_loop; i++) { if (sdt_alloc_stack(stack) == 0) return 0; } @@ -348,6 +369,34 @@ int sdt_set_idx_state(sdt_desc_t *desc, __u64 pos, bool state) return 0; } +static __noinline +int sdt_mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS]) +{ + sdt_desc_t *desc; + __u64 u, level; + int ret; + + for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) { + level = SDT_TASK_LEVELS - 1 - u; + + /* Only propagate upwards if we are the parent's only free chunk. */ + desc = lv_desc[level]; + + /* Failed calls return unlocked. */ + ret = sdt_set_idx_state(desc, lv_pos[level], false); + if (unlikely(ret != 0)) + return ret; + + cast_kern(desc); + + desc->nr_free += 1; + if (desc->nr_free > 1) + return 0; + } + + return 0; +} + __hidden void sdt_free_idx(struct sdt_allocator *alloc, __u64 idx) { @@ -357,7 +406,7 @@ void sdt_free_idx(struct sdt_allocator *alloc, __u64 idx) struct sdt_chunk __arena *chunk; sdt_desc_t *desc; struct sdt_data __arena *data; - __u64 u, level, shift, pos; + __u64 level, shift, pos; __u64 lv_pos[SDT_TASK_LEVELS]; int ret; int i; @@ -373,7 +422,13 @@ void sdt_free_idx(struct sdt_allocator *alloc, __u64 idx) return; } - bpf_for(level, 0, SDT_TASK_LEVELS) { + /* To appease the verifier. 
*/ + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { + lv_desc[level] = NULL; + lv_pos[level] = 0; + } + + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT; pos = (idx >> shift) & mask; @@ -414,29 +469,15 @@ void sdt_free_idx(struct sdt_allocator *alloc, __u64 idx) }; /* Zero out one word at a time. */ - bpf_for(i, 0, alloc->pool.elem_size / 8) { + for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) { data->payload[i] = 0; } } - bpf_for(u, 0, SDT_TASK_LEVELS) { - level = SDT_TASK_LEVELS - 1 - u; - - /* Only propagate upwards if we are the parent's only free chunk. */ - desc = lv_desc[level]; - - /* Failed calls return unlocked. */ - ret = sdt_set_idx_state(desc, lv_pos[level], false); - if (unlikely(ret != 0)) { - bpf_spin_unlock(&sdt_lock); - return; - } - - cast_kern(desc); - - desc->nr_free += 1; - if (desc->nr_free > 1) - break; + ret = sdt_mark_nodes_avail(lv_desc, lv_pos); + if (unlikely(ret != 0)) { + bpf_spin_unlock(&sdt_lock); + return; } sdt_stats.active_allocs -= 1; @@ -465,7 +506,7 @@ sdt_desc_t * sdt_find_empty(sdt_desc_t *desc, __u64 idx = 0; int ret; - bpf_for(level, 0, SDT_TASK_LEVELS) { + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { pos = sdt_chunk_find_empty(desc); /* Something has gone terribly wrong. */ @@ -499,7 +540,7 @@ sdt_desc_t * sdt_find_empty(sdt_desc_t *desc, } } - bpf_for(u, 0, SDT_TASK_LEVELS) { + for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) { level = SDT_TASK_LEVELS - 1 - u; tmp = lv_desc[level];