diff --git a/.gitignore b/.gitignore index a55413249c..a92f284140 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ TODO.md minikube-* /data +bpf_logs_*.txt + # Snap Packaging Artifacts *.snap snap/local/parca-agent diff --git a/bpf/.clang-format b/bpf/.clang-format new file mode 100644 index 0000000000..1dbb17ab05 --- /dev/null +++ b/bpf/.clang-format @@ -0,0 +1,5 @@ +--- +BasedOnStyle: LLVM +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +ColumnLimit: 120 diff --git a/bpf/Makefile b/bpf/Makefile index f9b2624423..1c36353418 100644 --- a/bpf/Makefile +++ b/bpf/Makefile @@ -11,7 +11,7 @@ format: c/fmt .PHONY: c/fmt c/fmt: - clang-format -i --style=LLVM $(BPF_SRC) $(BPF_HEADERS) + clang-format -i --style=file $(BPF_SRC) $(BPF_HEADERS) .PHONY: format-check format-check: diff --git a/bpf/cpu/cpu.bpf.c b/bpf/cpu/cpu.bpf.c index 81c9083039..852caa3a85 100644 --- a/bpf/cpu/cpu.bpf.c +++ b/bpf/cpu/cpu.bpf.c @@ -12,6 +12,10 @@ #include "../common.h" #include "hash.h" +//#include +enum { + BPF_F_NO_PREALLOC = (1U << 0), +}; #include #include #include @@ -22,19 +26,31 @@ // Number of frames to walk per tail call iteration. #define MAX_STACK_DEPTH_PER_PROGRAM 15 // Number of BPF tail calls that will be attempted. +// +// invariant: `MAX_TAIL_CALLS * MAX_STACK_DEPTH_PER_PROGRAM` >= +// `MAX_STACK_DEPTH` #define MAX_TAIL_CALLS 10 -// Number of frames to walk in total. +// Maximum number of frames. #define MAX_STACK_DEPTH 127 -// Number of stacks. -#define MAX_STACK_TRACES 1024 +// Number of unique stacks. +#define MAX_STACK_TRACES_ENTRIES 1024 // Number of items in the stack counts aggregation map. #define MAX_STACK_COUNTS_ENTRIES 10240 +// Maximum number of processes we are willing to track. +#define MAX_PROCESSES 1500 // Binary search iterations for dwarf based stack walking. -// 2^20 can bisect ~1_048_576 entries. -#define MAX_BINARY_SEARCH_DEPTH 20 +// 2^19 can bisect ~524_288 entries. +// +// invariant: `2^MAX_BINARY_SEARCH_DEPTH >= MAX_UNWIND_TABLE_SIZE` +#define MAX_BINARY_SEARCH_DEPTH 19 // Size of the unwind table. +// 250k * sizeof(stack_unwind_row_t) = ~4MB #define MAX_UNWIND_TABLE_SIZE 250 * 1000 -#define MAX_SHARDS 6 +// Unwind tables that can't fit in the remaining space of the +// current shard are broken up into chunks of up to `MAX_UNWIND_TABLE_SIZE` entries. +#define MAX_UNWIND_TABLE_CHUNKS 30 +// Maximum memory mappings per process. +#define MAX_MAPPINGS_PER_PROCESS 120 // Values for dwarf expressions.
#define DWARF_EXPRESSION_UNKNOWN 0 @@ -71,34 +87,57 @@ const volatile struct config_t config = {}; /*============================== MACROS =====================================*/ -#define BPF_MAP(_name, _type, _key_type, _value_type, _max_entries) \ - struct { \ - __uint(type, _type); \ - __uint(max_entries, _max_entries); \ - __type(key, _key_type); \ - __type(value, _value_type); \ +#define BPF_MAP(_name, _type, _key_type, _value_type, _max_entries) \ + struct { \ + __uint(type, _type); \ + __uint(max_entries, _max_entries); \ + __type(key, _key_type); \ + __type(value, _value_type); \ } _name SEC(".maps"); // Stack Traces are slightly different // in that the value is 1 big byte array // of the stack addresses typedef __u64 stack_trace_type[MAX_STACK_DEPTH]; -#define BPF_STACK_TRACE(_name, _max_entries) \ +#define BPF_STACK_TRACE(_name, _max_entries) \ BPF_MAP(_name, BPF_MAP_TYPE_STACK_TRACE, u32, stack_trace_type, _max_entries); -#define BPF_HASH(_name, _key_type, _value_type, _max_entries) \ +#define BPF_HASH(_name, _key_type, _value_type, _max_entries) \ BPF_MAP(_name, BPF_MAP_TYPE_HASH, _key_type, _value_type, _max_entries); -#define DEFINE_COUNTER(__func__name) \ - static void BUMP_##__func__name() { \ - u32 *c = bpf_map_lookup_elem(&percpu_stats, &__func__name); \ - if (c != NULL) { \ - *c += 1; \ - } \ +#define DEFINE_COUNTER(__func__name) \ + static void BUMP_##__func__name() { \ + u32 *c = bpf_map_lookup_elem(&percpu_stats, &__func__name); \ + if (c != NULL) { \ + *c += 1; \ + } \ } /*============================= INTERNAL STRUCTS ============================*/ +// cheat: +// +// pid -> mapping_id +// mapping_id -> executable_id +// executable_id -> table_shards +// +// now we can find the shard + +// Unwind table shard. +typedef struct shard_info { + u64 low_pc; + u64 high_pc; + u64 shard_index; + u64 low_index; + u64 high_index; +} shard_info_t; + +// Unwind table shards for an executable mapping. +typedef struct stack_unwind_table_shards { + u64 len; + shard_info_t shards[MAX_UNWIND_TABLE_CHUNKS]; +} stack_unwind_table_shards_t; + // The addresses of a native stack trace. typedef struct stack_trace_t { u64 len; @@ -113,11 +152,24 @@ typedef struct stack_count_key { int user_stack_id_dwarf; } stack_count_key_t; -typedef struct unwind_tables_key { - int pid; - int shard; -} unwind_tables_key_t; +// Represents an executable mapping. +typedef struct mapping { + u64 load_address; + u64 begin; + u64 end; + u64 executable_id; + u64 type; +} mapping_t; + +// Executable mappings for a process. +typedef struct { + u64 is_jit_compiler; + u64 len; + mapping_t mappings[MAX_MAPPINGS_PER_PROCESS]; +} process_info_t; +// State of unwinder such as the registers as well +// as internal data. typedef struct unwind_state { u64 ip; u64 sp; @@ -127,17 +179,6 @@ typedef struct unwind_state { } unwind_state_t; // A row in the stack unwinding table. -// PERF(javierhonduco): in the future, split this struct from a buffer of -// `stack_unwind_row` to multiple buffers containing each field. That way we -// would be able to not only have more entries, but we would increase -// performance as more data will be able to fit in the CPU cache. -// -// This is particularly important for the program counter => map + -// map. the second map can be split further if we decide to do -// so. -// -// This is at the cost of code readability, so should only be done if -// experiments confirm this theory. 
typedef struct stack_unwind_row { u64 pc; u16 __reserved_do_not_use; @@ -148,11 +189,7 @@ typedef struct stack_unwind_row { } stack_unwind_row_t; // Unwinding table representation. -typedef struct stack_unwind_table_t { - u64 low_pc; - u64 high_pc; - u64 table_len; // items of the table, as the max size is static. - u64 __explicit_padding; +typedef struct stack_unwind_table { stack_unwind_row_t rows[MAX_UNWIND_TABLE_SIZE]; } stack_unwind_table_t; @@ -173,15 +210,22 @@ u32 UNWIND_SHOULD_NEVER_HAPPEN_ERROR = 5; u32 UNWIND_PC_NOT_COVERED_ERROR = 6; // Keep track of total samples. u32 UNWIND_SAMPLES_COUNT = 7; +u32 UNWIND_JIT_ERRORS = 8; /*================================ MAPS =====================================*/ -BPF_HASH(debug_pids, int, u8, 32); +BPF_HASH(debug_pids, int, u8, MAX_PROCESSES); +BPF_HASH(process_info, int, process_info_t, MAX_PROCESSES); + +BPF_STACK_TRACE(stack_traces, MAX_STACK_TRACES_ENTRIES); +BPF_HASH(dwarf_stack_traces, int, stack_trace_t, MAX_STACK_TRACES_ENTRIES); BPF_HASH(stack_counts, stack_count_key_t, u64, MAX_STACK_COUNTS_ENTRIES); -BPF_STACK_TRACE(stack_traces, MAX_STACK_TRACES); -BPF_HASH(dwarf_stack_traces, int, stack_trace_t, MAX_STACK_TRACES); -BPF_HASH(unwind_tables, unwind_tables_key_t, stack_unwind_table_t, - 2); // Table size will be updated in userspace. + +// executable_chunks? +BPF_HASH(unwind_shards, u64, stack_unwind_table_shards_t, + 5 * 1000); // @nocommit: update +BPF_HASH(unwind_tables, u64, stack_unwind_table_t, + 5); // Table size will be updated in userspace. struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); @@ -212,6 +256,7 @@ DEFINE_COUNTER(UNWIND_UNSUPPORTED_EXPRESSION); DEFINE_COUNTER(UNWIND_SHOULD_NEVER_HAPPEN_ERROR); DEFINE_COUNTER(UNWIND_CATCHALL_ERROR); DEFINE_COUNTER(UNWIND_PC_NOT_COVERED_ERROR); +DEFINE_COUNTER(UNWIND_JIT_ERRORS); static void unwind_print_stats() { u32 *success_counter = bpf_map_lookup_elem(&percpu_stats, &UNWIND_SUCCESS); @@ -219,48 +264,48 @@ static void unwind_print_stats() { return; } - u32 *total_counter = - bpf_map_lookup_elem(&percpu_stats, &UNWIND_SAMPLES_COUNT); + u32 *total_counter = bpf_map_lookup_elem(&percpu_stats, &UNWIND_SAMPLES_COUNT); if (total_counter == NULL) { return; } - u32 *truncated_counter = - bpf_map_lookup_elem(&percpu_stats, &UNWIND_TRUNCATED); + u32 *truncated_counter = bpf_map_lookup_elem(&percpu_stats, &UNWIND_TRUNCATED); if (truncated_counter == NULL) { return; } - u32 *unsup_expression = - bpf_map_lookup_elem(&percpu_stats, &UNWIND_UNSUPPORTED_EXPRESSION); + u32 *unsup_expression = bpf_map_lookup_elem(&percpu_stats, &UNWIND_UNSUPPORTED_EXPRESSION); if (unsup_expression == NULL) { return; } - u32 *not_covered_count = - bpf_map_lookup_elem(&percpu_stats, &UNWIND_PC_NOT_COVERED_ERROR); + u32 *not_covered_count = bpf_map_lookup_elem(&percpu_stats, &UNWIND_PC_NOT_COVERED_ERROR); if (not_covered_count == NULL) { return; } - u32 *catchall_count = - bpf_map_lookup_elem(&percpu_stats, &UNWIND_CATCHALL_ERROR); + u32 *catchall_count = bpf_map_lookup_elem(&percpu_stats, &UNWIND_CATCHALL_ERROR); if (catchall_count == NULL) { return; } - u32 *never = - bpf_map_lookup_elem(&percpu_stats, &UNWIND_SHOULD_NEVER_HAPPEN_ERROR); + u32 *never = bpf_map_lookup_elem(&percpu_stats, &UNWIND_SHOULD_NEVER_HAPPEN_ERROR); if (never == NULL) { return; } + u32 *jit_errors = bpf_map_lookup_elem(&percpu_stats, &UNWIND_JIT_ERRORS); + if (jit_errors == NULL) { + return; + } + bpf_printk("[[ stats for cpu %d ]]", (int)bpf_get_smp_processor_id()); bpf_printk("success=%lu", *success_counter); 
bpf_printk("unsup_expression=%lu", *unsup_expression); bpf_printk("truncated=%lu", *truncated_counter); bpf_printk("catchall=%lu", *catchall_count); bpf_printk("never=%lu", *never); + bpf_printk("jit_failure=%lu", *jit_errors); bpf_printk("total_counter=%lu", *total_counter); bpf_printk("(not_covered=%lu)", *not_covered_count); @@ -276,8 +321,7 @@ static void bump_samples() { } } -static __always_inline void * -bpf_map_lookup_or_try_init(void *map, const void *key, const void *init) { +static __always_inline void *bpf_map_lookup_or_try_init(void *map, const void *key, const void *init) { void *val; long err; @@ -297,9 +341,7 @@ bpf_map_lookup_or_try_init(void *map, const void *key, const void *init) { // Binary search the unwind table to find the row index containing the unwind // information for a given program counter (pc). -static u64 find_offset_for_pc(stack_unwind_table_t *table, u64 pc) { - u64 left = 0; - u64 right = table->table_len; +static u64 find_offset_for_pc(stack_unwind_table_t *table, u64 pc, u64 left, u64 right) { u64 found = BINARY_SEARCH_NOT_FOUND; for (int i = 0; i < MAX_BINARY_SEARCH_DEPTH; i++) { @@ -314,7 +356,7 @@ static u64 find_offset_for_pc(stack_unwind_table_t *table, u64 pc) { // Appease the verifier. if (mid < 0 || mid >= MAX_UNWIND_TABLE_SIZE) { - bpf_printk("\t.should never happen"); + bpf_printk("\t.should never happen, mid: %lu, max: %lu", mid, MAX_UNWIND_TABLE_SIZE); BUMP_UNWIND_SHOULD_NEVER_HAPPEN_ERROR(); return BINARY_SEARCH_SHOULD_NEVER_HAPPEN; } @@ -338,26 +380,11 @@ static u64 find_offset_for_pc(stack_unwind_table_t *table, u64 pc) { return BINARY_SEARCH_EXHAUSTED_ITERATIONS; } -// Print an unwinding table row for debugging. -static __always_inline void show_row(stack_unwind_table_t *unwind_table, - int index) { - /* - u64 pc = unwind_table->rows[index].pc; - u16 cfa_type = unwind_table->rows[index].cfa_type; - s16 cfa_offset = unwind_table->rows[index].cfa_offset; - s16 rbp_offset = unwind_table->rows[index].rbp_offset; - - bpf_printk("~ %d entry. Loc: %llx, CFA reg: %d Offset: %d, $rbp %d", index, - pc, cfa_type, cfa_offset, rbp_offset); */ -} - // Finds whether a process should be unwound using the unwind // tables. static __always_inline bool has_unwind_information(pid_t pid) { - unwind_tables_key_t key = {.pid = pid, .shard = 0}; - - stack_unwind_table_t *shard1 = bpf_map_lookup_elem(&unwind_tables, &key); - if (shard1) { + process_info_t *proc_info = bpf_map_lookup_elem(&process_info, &pid); + if (proc_info) { return true; } return false; @@ -371,35 +398,105 @@ static __always_inline bool is_debug_enabled_for_pid(int pid) { return false; } -// Finds the unwind table for a given pid and program counter. -// Returns NULL if it can't be found, so this function can't be used to detect -// how should we unwind the native stack for a process. See -// `has_unwind_information()`. 
-static __always_inline stack_unwind_table_t *find_unwind_table(pid_t pid, - u64 pc) { - unwind_tables_key_t key = {.pid = pid, .shard = 0}; - - for (int i = 0; i < MAX_SHARDS; i++) { - key.shard = i; - stack_unwind_table_t *shard = bpf_map_lookup_elem(&unwind_tables, &key); - if (shard) { - if (shard->low_pc <= pc && pc <= shard->high_pc) { - bpf_printk("\t Shard %d", i); - return shard; - } +enum find_unwind_table_return { + FIND_UNWIND_SUCCESS = 1, + + FIND_UNWIND_MAPPING_SHOULD_NEVER_HAPPEN = 2, + FIND_UNWIND_MAPPING_EXHAUSTED_SEARCH = 3, + FIND_UNWIND_MAPPING_NOT_FOUND = 4, + FIND_UNWIND_SHARD_UNSET = 5, + FIND_UNWIND_SHARD_EXHAUSTED_SEARCH = 6, + FIND_UNWIND_SHARD_NOT_FOUND = 7, + + FIND_UNWIND_JITTED = 100, + FIND_UNWIND_SPECIAL = 200, +}; + +// Finds the shard information for a given pid and program counter. Optionally, +// an offset can be passed that will be filled in with the mapping's load +// address. +static __always_inline enum find_unwind_table_return find_unwind_table(shard_info_t **shard_info, pid_t pid, u64 pc, + u64 *offset) { + process_info_t *proc_info = bpf_map_lookup_elem(&process_info, &pid); + // Appease the verifier. + if (proc_info == NULL) { + bpf_printk("[error] should never happen"); + return FIND_UNWIND_MAPPING_SHOULD_NEVER_HAPPEN; + } + + bool found = false; + u64 executable_id = 0; + u64 load_address = 0; + u64 type = 0; + + // Find the mapping. + for (int i = 0; i < MAX_MAPPINGS_PER_PROCESS; i++) { + if (i > proc_info->len) { + bpf_printk("[info] mapping not found, i (%d) > proc_info->len (%d) pc: %llx", i, proc_info->len, pc); + return FIND_UNWIND_MAPPING_EXHAUSTED_SEARCH; + } + + // Appease the verifier. + if (i < 0 || i > MAX_MAPPINGS_PER_PROCESS) { + bpf_printk("[error] should never happen, verifier"); + return FIND_UNWIND_MAPPING_SHOULD_NEVER_HAPPEN; + } + + if (proc_info->mappings[i].begin <= pc && pc <= proc_info->mappings[i].end) { + found = true; + executable_id = proc_info->mappings[i].executable_id; + load_address = proc_info->mappings[i].load_address; + type = proc_info->mappings[i].type; + break; + } + } + + if (found) { + if (offset != NULL) { + *offset = load_address; + } + if (type == 1) { + return FIND_UNWIND_JITTED; + } + if (type == 2) { + return FIND_UNWIND_SPECIAL; } + } else { + bpf_printk("[warn] :((( no mapping for ip=%llx", pc); + return FIND_UNWIND_MAPPING_NOT_FOUND; } - bpf_printk("[warn] no unwind table contains PC=%llx", pc); - return NULL; + bpf_printk("~about to check shards found=%d", found); + bpf_printk("~checking shards now"); + + // Find the shard where this unwind table lives. + stack_unwind_table_shards_t *shards = bpf_map_lookup_elem(&unwind_shards, &executable_id); + if (shards == NULL) { + bpf_printk("[info] shards is null for executable %llu", executable_id); + return FIND_UNWIND_SHARD_NOT_FOUND; + } + + for (int i = 0; i < MAX_UNWIND_TABLE_CHUNKS; i++) { + if (i > shards->len) { + return FIND_UNWIND_SHARD_EXHAUSTED_SEARCH; + } + + if (shards->shards[i].low_pc <= pc - load_address && pc - load_address <= shards->shards[i].high_pc) { + bpf_printk("[info] found shard"); + *shard_info = &shards->shards[i]; + return FIND_UNWIND_SUCCESS; + } + } + + bpf_printk("[error] could not find the right shard..."); + return FIND_UNWIND_SHARD_NOT_FOUND; } -static __always_inline void add_stacks(struct bpf_perf_event_data *ctx, - u64 pid_tgid, - enum stack_walking_method method, - unwind_state_t *unwind_state) { +// Aggregate the given stacktrace.
+static __always_inline void add_stack(struct bpf_perf_event_data *ctx, u64 pid_tgid, enum stack_walking_method method, + unwind_state_t *unwind_state) { u64 zero = 0; - stack_count_key_t stack_key = {}; + stack_count_key_t stack_key = {0}; // The `bpf_get_current_pid_tgid` helpers returns // `current_task->tgid << 32 | current_task->pid`, the naming can be @@ -420,21 +517,20 @@ static __always_inline void add_stacks(struct bpf_perf_event_data *ctx, } if (method == STACK_WALKING_METHOD_DWARF) { - int stack_hash = - MurmurHash2((u32 *)unwind_state->stack.addresses, - MAX_STACK_DEPTH * sizeof(u64) / sizeof(u32), 0); + int stack_hash = MurmurHash2((u32 *)unwind_state->stack.addresses, MAX_STACK_DEPTH * sizeof(u64) / sizeof(u32), 0); bpf_printk("stack hash %d", stack_hash); stack_key.user_stack_id_dwarf = stack_hash; stack_key.user_stack_id = 0; // Insert stack. - bpf_map_update_elem(&dwarf_stack_traces, &stack_hash, &unwind_state->stack, - BPF_ANY); + bpf_map_update_elem(&dwarf_stack_traces, &stack_hash, &unwind_state->stack, BPF_ANY); } else if (method == STACK_WALKING_METHOD_FP) { int stack_id = bpf_get_stackid(ctx, &stack_traces, BPF_F_USER_STACK); if (stack_id >= 0) { stack_key.user_stack_id = stack_id; stack_key.user_stack_id_dwarf = 0; + } else { + // bpf_printk("fp failed\n"); } } @@ -445,6 +541,7 @@ static __always_inline void add_stacks(struct bpf_perf_event_data *ctx, } } +// The unwinding machinery lives here. SEC("perf_event") int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { u64 pid_tgid = bpf_get_current_pid_tgid(); @@ -459,7 +556,6 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { return 1; } - // #pragma clang loop unroll(full) for (int i = 0; i < MAX_STACK_DEPTH_PER_PROGRAM; i++) { bpf_printk("## frame: %d", i); @@ -467,24 +563,42 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { bpf_printk("\tcurrent sp: %llx", unwind_state->sp); bpf_printk("\tcurrent bp: %llx", unwind_state->bp); - stack_unwind_table_t *unwind_table = - find_unwind_table(user_pid, unwind_state->ip); + u64 offset = 0; + shard_info_t *shard = NULL; + enum find_unwind_table_return unwind_table_result = find_unwind_table(&shard, user_pid, unwind_state->ip, &offset); - if (unwind_table == NULL) { + if (unwind_table_result == FIND_UNWIND_JITTED) { + bpf_printk("JIT section, stopping"); + return 1; + } else if (unwind_table_result == FIND_UNWIND_SPECIAL) { + bpf_printk("special section, stopping"); + return 1; + } else if (shard == NULL) { + // improve reached_bottom_of_stack = true; break; } - u64 table_idx = find_offset_for_pc(unwind_table, unwind_state->ip); + stack_unwind_table_t *unwind_table = bpf_map_lookup_elem(&unwind_tables, &shard->shard_index); + if (unwind_table == NULL) { + bpf_printk("unwind table is null :( for shard %llu", shard->shard_index); + return 0; + } - if (table_idx == BINARY_SEARCH_NOT_FOUND || - table_idx == BINARY_SEARCH_SHOULD_NEVER_HAPPEN || + bpf_printk("le offset: %llx", offset); + u64 left = shard->low_index; + u64 right = shard->high_index; + bpf_printk("========== left %llu right %llu", left, right); + u64 table_idx = find_offset_for_pc(unwind_table, unwind_state->ip - offset, left, right); + + if (table_idx == BINARY_SEARCH_NOT_FOUND || table_idx == BINARY_SEARCH_SHOULD_NEVER_HAPPEN || table_idx == BINARY_SEARCH_EXHAUSTED_ITERATIONS) { bpf_printk("[error] binary search failed with %llx", table_idx); return 1; } bpf_printk("\t=> table_index: %d", table_idx); + bpf_printk("\t=> adjusted pc: %llx", unwind_state->ip - offset); // 
Appease the verifier. if (table_idx < 0 || table_idx >= MAX_UNWIND_TABLE_SIZE) { @@ -508,13 +622,10 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { s16 found_cfa_offset = unwind_table->rows[table_idx].cfa_offset; s16 found_rbp_offset = unwind_table->rows[table_idx].rbp_offset; - bpf_printk("\tcfa type: %d, offset: %d (row pc: %llx)", found_cfa_type, - found_cfa_offset, found_pc); + bpf_printk("\tcfa type: %d, offset: %d (row pc: %llx)", found_cfa_type, found_cfa_offset, found_pc); - if (found_rbp_type == RBP_TYPE_REGISTER || - found_rbp_type == RBP_TYPE_EXPRESSION) { - bpf_printk("\t!!!! frame pointer is %d (register or exp), bailing out", - found_rbp_type); + if (found_rbp_type == RBP_TYPE_REGISTER || found_rbp_type == RBP_TYPE_EXPRESSION) { + bpf_printk("\t[error] frame pointer is %d (register or exp), bailing out", found_rbp_type); BUMP_UNWIND_CATCHALL_ERROR(); return 1; } @@ -545,11 +656,9 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { return 1; } - previous_rsp = unwind_state->sp + 8 + - ((((unwind_state->ip & 15) >= threshold)) << 3); + previous_rsp = unwind_state->sp + 8 + ((((unwind_state->ip & 15) >= threshold)) << 3); } else { - bpf_printk("\t[error] register %d not valid (expected $rbp or $rsp)", - found_cfa_type); + bpf_printk("\t[error] register %d not valid (expected $rbp or $rsp)", found_cfa_type); BUMP_UNWIND_CATCHALL_ERROR(); return 1; } @@ -565,16 +674,24 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { // HACK(javierhonduco): This is an architectural shortcut we can take. As we // only support x86_64 at the minute, we can assume that the return address // is *always* 8 bytes ahead of the previous stack pointer. - u64 previous_rip_addr = - previous_rsp - 8; // the saved return address is 8 bytes ahead of the - // previous stack pointer + u64 previous_rip_addr = previous_rsp - 8; // the saved return address is 8 bytes ahead of the previous stack pointer u64 previous_rip = 0; - int err = bpf_probe_read_user( - &previous_rip, 8, - (void *)(previous_rip_addr)); // 8 bytes, a whole word - // in a 64 bits machine + int err = bpf_probe_read_user(&previous_rip, 8, (void *)(previous_rip_addr)); // 8 bytes, a whole word in a 64 bits machine if (previous_rip == 0) { + int user_pid = pid_tgid; + process_info_t *proc_info = bpf_map_lookup_elem(&process_info, &user_pid); + if (proc_info == NULL) { + bpf_printk("[error] should never happen"); + return 1; + } + + if (proc_info->is_jit_compiler) { + bpf_printk("[info] rip=0, Section not added, yet"); + BUMP_UNWIND_JIT_ERRORS(); + return 1; + } + bpf_printk("[error] previous_rip should not be zero. This can mean that " "the read failed, ret=%d while reading @ %llx.", err, previous_rip_addr); @@ -588,12 +705,10 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { previous_rbp = unwind_state->bp; } else { u64 previous_rbp_addr = previous_rsp + found_rbp_offset; - bpf_printk("\t(bp_offset: %d, bp value stored at %llx)", found_rbp_offset, - previous_rbp_addr); - int ret = bpf_probe_read_user( - &previous_rbp, 8, - (void *)(previous_rbp_addr)); // 8 bytes, a whole word in a 64 bits - // machine + bpf_printk("\t(bp_offset: %d, bp value stored at %llx)", found_rbp_offset, previous_rbp_addr); + int ret = bpf_probe_read_user(&previous_rbp, 8, + (void *)(previous_rbp_addr)); // 8 bytes, a whole word in a 64 bits + // machine if (ret != 0) { bpf_printk("[error] previous_rbp should not be zero. 
This can mean " @@ -630,23 +745,30 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { // https://refspecs.linuxbase.org/elf/x86_64-abi-0.99.pdf if (unwind_state->bp == 0) { bpf_printk("======= reached main! ======="); - add_stacks(ctx, pid_tgid, STACK_WALKING_METHOD_DWARF, unwind_state); + add_stack(ctx, pid_tgid, STACK_WALKING_METHOD_DWARF, unwind_state); BUMP_UNWIND_SUCCESS(); bpf_printk("yesssss :)"); } else { - // TODO(javierhonduco): The current code doesn't have good support for - // JIT'ed code, this is something that will be worked on in future - // iterations. - bpf_printk("[error] Could not find unwind table and rbp != 0 (%llx). " - "JIT'ed / bug?", - unwind_state->bp); + + int user_pid = pid_tgid; + process_info_t *proc_info = bpf_map_lookup_elem(&process_info, &user_pid); + if (proc_info == NULL) { + bpf_printk("[error] should never happen"); + return 1; + } + + if (proc_info->is_jit_compiler) { + bpf_printk("[info] Section not added, yet"); + BUMP_UNWIND_JIT_ERRORS(); + return 1; + } + + bpf_printk("[error] Could not find unwind table and rbp != 0 (%llx) bug?", unwind_state->bp); BUMP_UNWIND_SHOULD_NEVER_HAPPEN_ERROR(); } return 0; - } else if (unwind_state->stack.len < MAX_STACK_DEPTH && - unwind_state->tail_calls < MAX_TAIL_CALLS) { - bpf_printk("Continuing walking the stack in a tail call, current tail %d", - unwind_state->tail_calls); + } else if (unwind_state->stack.len < MAX_STACK_DEPTH && unwind_state->tail_calls < MAX_TAIL_CALLS) { + bpf_printk("Continuing walking the stack in a tail call, current tail %d", unwind_state->tail_calls); unwind_state->tail_calls++; bpf_tail_call(ctx, &programs, 0); } @@ -676,8 +798,7 @@ static __always_inline void set_initial_state(bpf_user_pt_regs_t *regs) { unwind_state->tail_calls = 0; } -static __always_inline int -walk_user_stacktrace(struct bpf_perf_event_data *ctx) { +static __always_inline int walk_user_stacktrace(struct bpf_perf_event_data *ctx) { bump_samples(); @@ -700,7 +821,8 @@ int profile_cpu(struct bpf_perf_event_data *ctx) { return 0; if (config.debug) { - bpf_printk("debug mode enabled, make sure you specified process name"); + // very noisy + // bpf_printk("debug mode enabled, make sure you specified process name"); if (!is_debug_enabled_for_pid(user_tgid)) return 0; } @@ -709,33 +831,16 @@ int profile_cpu(struct bpf_perf_event_data *ctx) { // Check if the process is eligible for the unwind table or frame pointer // unwinders. if (!has_unwind_info) { - add_stacks(ctx, pid_tgid, STACK_WALKING_METHOD_FP, NULL); + add_stack(ctx, pid_tgid, STACK_WALKING_METHOD_FP, NULL); } else { - stack_unwind_table_t *unwind_table = - find_unwind_table(user_pid, ctx->regs.ip); - if (unwind_table == NULL) { - bpf_printk("IP not covered. In kernel space / bug? IP %llx)", - ctx->regs.ip); + shard_info_t *shard = NULL; + find_unwind_table(&shard, user_pid, ctx->regs.ip, NULL); + if (shard == NULL) { + bpf_printk("IP not covered. In kernel space / bug? IP %llx)", ctx->regs.ip); BUMP_UNWIND_PC_NOT_COVERED_ERROR(); return 0; } - u64 last_idx = unwind_table->table_len - 1; - // Appease the verifier. - if (last_idx < 0 || last_idx >= MAX_UNWIND_TABLE_SIZE) { - bpf_printk("\t[error] this should never happen"); - BUMP_UNWIND_SHOULD_NEVER_HAPPEN_ERROR(); - return 0; - } - - // javierhonduco: Debug output to ensure that the maps are correctly - // populated by comparing it with the data - // we are writing. Remove later on. 
- show_row(unwind_table, 0); - show_row(unwind_table, 1); - show_row(unwind_table, 2); - show_row(unwind_table, last_idx); - bpf_printk("pid %d tgid %d", user_pid, user_tgid); walk_user_stacktrace(ctx); } @@ -744,7 +849,6 @@ int profile_cpu(struct bpf_perf_event_data *ctx) { } #define KBUILD_MODNAME "parca-agent" -volatile const char bpf_metadata_name[] SEC(".rodata") = - "parca-agent (https://github.com/parca-dev/parca-agent)"; +volatile const char bpf_metadata_name[] SEC(".rodata") = "parca-agent (https://github.com/parca-dev/parca-agent)"; unsigned int VERSION SEC("version") = 1; char LICENSE[] SEC("license") = "GPL"; diff --git a/go.mod b/go.mod index 8ebba72221..cf4735fcec 100644 --- a/go.mod +++ b/go.mod @@ -34,6 +34,7 @@ require ( github.com/rzajac/flexbuf v0.14.0 github.com/stretchr/testify v1.8.1 github.com/xyproto/ainur v1.3.0 + golang.org/x/exp v0.0.0-20221212164502-fae10dda9338 golang.org/x/sync v0.1.0 golang.org/x/sys v0.4.0 google.golang.org/grpc v1.52.0 @@ -140,7 +141,6 @@ require ( go.uber.org/atomic v1.10.0 // indirect go.uber.org/goleak v1.2.0 // indirect golang.org/x/crypto v0.1.0 // indirect - golang.org/x/exp v0.0.0-20221212164502-fae10dda9338 // indirect golang.org/x/net v0.4.0 // indirect golang.org/x/oauth2 v0.3.0 // indirect golang.org/x/term v0.3.0 // indirect diff --git a/pkg/profiler/cpu/cpu.go b/pkg/profiler/cpu/cpu.go index e5ac03b6eb..ec96083504 100644 --- a/pkg/profiler/cpu/cpu.go +++ b/pkg/profiler/cpu/cpu.go @@ -23,6 +23,7 @@ import ( "encoding/binary" "errors" "fmt" + "os" "regexp" "runtime" "strings" @@ -82,9 +83,8 @@ type CPU struct { debuginfoManager profiler.DebugInfoManager labelsManager profiler.LabelsManager - psMapCache profiler.ProcessMapCache - objFileCache profiler.ObjectFileCache - unwindTableBuilder *unwind.UnwindTableBuilder + psMapCache profiler.ProcessMapCache + objFileCache profiler.ObjectFileCache metrics *metrics @@ -131,9 +131,8 @@ func NewCPUProfiler( processMappings: process.NewMapping(psMapCache), // Shared caches between all profilers. - psMapCache: psMapCache, - objFileCache: objFileCache, - unwindTableBuilder: unwind.NewUnwindTableBuilder(logger), + psMapCache: psMapCache, + objFileCache: objFileCache, profilingDuration: profilingDuration, profilingSamplingFrequency: profilingSamplingFrequency, @@ -189,6 +188,10 @@ func bpfCheck() error { return result.ErrorOrNil() } +func (p *CPU) debugProcesses() bool { + return len(p.debugProcessNames) > 0 +} + func (p *CPU) Run(ctx context.Context) error { level.Debug(p.logger).Log("msg", "starting cpu profiler") @@ -214,7 +217,7 @@ func (p *CPU) Run(ctx context.Context) error { level.Debug(p.logger).Log("msg", "actual memory locked rlimit", "cur", profiler.HumanizeRLimit(rLimit.Cur), "max", profiler.HumanizeRLimit(rLimit.Max)) var matchers []*regexp.Regexp - if len(p.debugProcessNames) > 0 { + if p.debugProcesses() { level.Info(p.logger).Log("msg", "process names specified, debugging processes", "matchers", strings.Join(p.debugProcessNames, ", ")) for _, exp := range p.debugProcessNames { regex, err := regexp.Compile(exp) @@ -315,17 +318,15 @@ func (p *CPU) Run(ctx context.Context) error { return fmt.Errorf("failed to create maps: %w", err) } - if debugEnabled { - pfs, err := procfs.NewDefaultFS() - if err != nil { - return fmt.Errorf("failed to create procfs: %w", err) - } - - level.Debug(p.logger).Log("msg", "debug process matchers found, starting process watcher") - // Update the debug pids map. 
- go p.watchProcesses(ctx, pfs, matchers) + pfs, err := procfs.NewDefaultFS() + if err != nil { + return fmt.Errorf("failed to create procfs: %w", err) } + level.Debug(p.logger).Log("msg", "debug process matchers found, starting process watcher") + // Update the debug pids map. + go p.watchProcesses(ctx, pfs, matchers) + ticker := time.NewTicker(p.profilingDuration) defer ticker.Stop() @@ -407,7 +408,8 @@ func (p *CPU) watchProcesses(ctx context.Context, pfs procfs.FS, matchers []*reg ticker := time.NewTicker(5 * time.Second) defer ticker.Stop() - unwindTableCache := cache.New(cache.WithExpireAfterWrite(20 * time.Minute)) + // @nocommit: cache on start_at + unwindTableCache := cache.New() for { select { @@ -415,72 +417,163 @@ func (p *CPU) watchProcesses(ctx context.Context, pfs procfs.FS, matchers []*reg return case <-ticker.C: } - - procs, err := pfs.AllProcs() + allProcs, err := pfs.AllProcs() if err != nil { level.Error(p.logger).Log("msg", "failed to list processes", "err", err) - continue + return } pids := []int{} - for _, proc := range procs { - comm, err := proc.Comm() - if err != nil { - level.Error(p.logger).Log("msg", "failed to get process name", "err", err) - continue - } + if p.debugProcesses() { + for _, proc := range allProcs { + comm, err := proc.Comm() + if err != nil { + level.Error(p.logger).Log("msg", "failed to get process name", "err", err) + continue + } - if comm == "" { - continue - } + if comm == "" { + continue + } - for _, m := range matchers { - if m.MatchString(comm) { - level.Info(p.logger).Log("msg", "match found; debugging process", "pid", proc.PID, "comm", comm) - pids = append(pids, proc.PID) + for _, m := range matchers { + if m.MatchString(comm) { + level.Info(p.logger).Log("msg", "match found; debugging process", "pid", proc.PID, "comm", comm) + pids = append(pids, proc.PID) + } } } - } - if len(pids) > 0 { - level.Debug(p.logger).Log("msg", "updating debug pids map", "pids", fmt.Sprintf("%v", pids)) - // Only meant to be used for debugging, it is not safe to use in production. - if err := p.bpfMaps.setDebugPIDs(pids); err != nil { - level.Warn(p.logger).Log("msg", "failed to update debug pids map", "err", err) + if len(pids) > 0 { + level.Debug(p.logger).Log("msg", "updating debug pids map", "pids", fmt.Sprintf("%v", pids)) + // Only meant to be used for debugging, it is not safe to use in production. + if err := p.bpfMaps.setDebugPIDs(pids); err != nil { + level.Error(p.logger).Log("msg", "failed to update debug pids map", "err", err) + } + } else { + level.Debug(p.logger).Log("msg", "no processes matched the provided regex") + if err := p.bpfMaps.setDebugPIDs(nil); err != nil { + level.Error(p.logger).Log("msg", "failed to update debug pids map", "err", err) + } } } else { - level.Debug(p.logger).Log("msg", "no processes matched the provided regex") - if err := p.bpfMaps.setDebugPIDs(nil); err != nil { - level.Warn(p.logger).Log("msg", "failed to update debug pids map", "err", err) + for _, proc := range allProcs { + pids = append(pids, proc.PID) } - continue } - // Can only be enabled when a debug process name is specified. + fmt.Println("=========== about to call enableDWARFUnwinding") + + count := 0 if p.enableDWARFUnwinding { // Update unwind tables for the given pids. for _, pid := range pids { if _, exists := unwindTableCache.GetIfPresent(pid); exists { + // TODO(javierhonduco): Expire cache on pid recycling or mappings changes. 
+ fmt.Println("already cached") continue } - level.Info(p.logger).Log("msg", "adding unwind tables", "pid", pid) - pt, err := p.unwindTableBuilder.UnwindTableForPid(pid) + executable := fmt.Sprintf("/proc/%d/exe", pid) + hasFramePointers, err := unwind.HasFramePointers(executable) if err != nil { - level.Warn(p.logger).Log("msg", "failed to build unwind table", "pid", pid, "err", err) + // It may not exist as reading procfs is racy. + if !errors.Is(err, os.ErrNotExist) { + level.Error(p.logger).Log("msg", "frame pointer detection failed", "executable", executable, "err", err) + continue + } + fmt.Println("HasFramePointers failed") + } + + if hasFramePointers { + fmt.Println("skipping", executable, "has fp") continue } - if err := p.bpfMaps.setUnwindTable(pid, pt); err != nil { - level.Warn(p.logger).Log("msg", "failed to update unwind tables", "pid", pid, "err", err) + level.Info(p.logger).Log("msg", "adding unwind tables", "pid", pid) + + err = p.addUnwindTableForProcess(pid) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + level.Debug(p.logger).Log("msg", "failed to add unwind table", "pid", pid, "err", err) + } else { + level.Error(p.logger).Log("msg", "failed to add unwind table", "pid", pid, "err", err) + } continue } + unwindTableCache.Put(pid, struct{}{}) + count++ + } + + // Must be called after calling `addUnwindTableForProcess`, as it's possible + // that the current in-memory unwind table shard hasn't been written to the + // map. + // TODO: have a dirty flag. + err := p.bpfMaps.PersistUnwindTable() + if err != nil { + panic(err) } } } } +// 1. Find executable sections +// 2. For each section, generate compact table +// 3. Add table to maps +// 4. Add map metadata to process +// +// @nocommit: later on, table caching +func (p *CPU) addUnwindTableForProcess(pid int) error { + proc, err := procfs.NewProc(pid) + if err != nil { + return err + } + + mappings, err := proc.ProcMaps() + if err != nil { + return err + } + + executableMappings := unwind.ListExecutableMappings(mappings) + procInfoBuf := new(bytes.Buffer) + // Important: this has to be called before addUnwindTableForProcessMapping + // .is_jit_compiler + var isJitCompiler uint64 + if executableMappings.HasJitted() { + isJitCompiler = 1 + } + if err := binary.Write(procInfoBuf, p.bpfMaps.byteOrder, isJitCompiler); err != nil { // @nocommit + panic(fmt.Errorf("write proc_info .is_jit_compiler bytes: %w", err)) + } + + // .len + if err := binary.Write(procInfoBuf, p.bpfMaps.byteOrder, uint64(len(executableMappings))); err != nil { // @nocommit + panic(fmt.Errorf("write proc_info .len bytes: %w", err)) + } + + for _, executableMapping := range executableMappings { + err = p.addUnwindTableForProcessMapping(pid, executableMapping, procInfoBuf) + if err != nil { + panic(fmt.Errorf("calling addUnwindTableForProcessMapping: %w", err)) + } + } + + if err := p.bpfMaps.processInfo.Update(unsafe.Pointer(&pid), unsafe.Pointer(&procInfoBuf.Bytes()[0])); err != nil { + panic(fmt.Errorf("update processInfo: %w", err)) + } + + return nil +} + +func (p *CPU) addUnwindTableForProcessMapping(pid int, executableMappings *unwind.ExecutableMapping, procInfoBuf *bytes.Buffer) error { + if err := p.bpfMaps.setUnwindTable(pid, executableMappings, procInfoBuf); err != nil { + panic(fmt.Errorf("setUnwindTable: %w", err)) + } + + return nil +} + func (p *CPU) report(lastError error, processLastErrors map[int]error) { p.mtx.Lock() defer p.mtx.Unlock() diff --git a/pkg/profiler/cpu/maps.go b/pkg/profiler/cpu/maps.go index 75e4398ce0..f410017f2b 
100644 --- a/pkg/profiler/cpu/maps.go +++ b/pkg/profiler/cpu/maps.go @@ -18,13 +18,20 @@ import "C" import ( "bytes" + "debug/elf" "encoding/binary" "errors" "fmt" + "os" + "path" + "sort" + "time" "unsafe" - "github.com/parca-dev/parca-agent/internal/dwarf/frame" + "github.com/parca-dev/parca-agent/pkg/buildid" + "github.com/parca-dev/parca-agent/pkg/executable" "github.com/parca-dev/parca-agent/pkg/stack/unwind" + "golang.org/x/exp/constraints" bpf "github.com/aquasecurity/libbpfgo" ) @@ -33,33 +40,17 @@ const ( debugPIDsMapName = "debug_pids" stackCountsMapName = "stack_counts" stackTracesMapName = "stack_traces" + unwindShardsMapName = "unwind_shards" dwarfStackTracesMapName = "dwarf_stack_traces" unwindTablesMapName = "unwind_tables" + processInfoMapName = "process_info" programsMapName = "programs" - // With the current row structure, the max items we can store is 262k per map. - unwindTableMaxEntries = 100 + // With the current row structure, the max items we can store is 262k per map, we rounded + // it down to 250k. + unwindTableMaxEntries = 50 // How many shards we have. maxUnwindTableSize = 250 * 1000 // Always needs to be sync with MAX_UNWIND_TABLE_SIZE in the BPF program. - unwindTableShardCount = 6 // Always needs to be sync with MAX_SHARDS in the BPF program. - maxUnwindSize = maxUnwindTableSize * unwindTableShardCount -) - -type BpfCfaType uint16 - -const ( - CfaRegisterUndefined BpfCfaType = iota - CfaRegisterRbp - CfaRegisterRsp - CfaRegisterExpression -) - -type BpfRbpType uint16 - -const ( - RbpRuleOffsetUnchanged BpfRbpType = iota - RbpRuleOffset - RbpRuleRegister - RbpRegisterExpression + maxUnwindSize = maxUnwindTableSize * unwindTableMaxEntries ) var ( @@ -77,9 +68,33 @@ type bpfMaps struct { stackCounts *bpf.BPFMap stackTraces *bpf.BPFMap dwarfStackTraces *bpf.BPFMap + processInfo *bpf.BPFMap + unwindShards *bpf.BPFMap unwindTables *bpf.BPFMap programs *bpf.BPFMap + + // unwind stuff 🔬 + buildIdMapping map[string]uint64 + // globalView []{shard_id:, [all the ranges it contains]} + // which shard we are on + shardIndex uint64 + executableId uint64 + unwindInfoBuf *bytes.Buffer + // Account where we are within a shard + lowIndex int + highIndex int + // Other stats + totalEntries uint64 + uniqueMappings uint64 + referencedMappings uint64 +} + +func min[T constraints.Ordered](a, b T) T { + if a < b { + return a + } + return b } func initializeMaps(m *bpf.Module, byteOrder binary.ByteOrder) (*bpfMaps, error) { @@ -87,9 +102,13 @@ func initializeMaps(m *bpf.Module, byteOrder binary.ByteOrder) (*bpfMaps, error) return nil, fmt.Errorf("nil module") } + unwindInfoArray := make([]byte, 0, maxUnwindTableSize) + maps := &bpfMaps{ - module: m, - byteOrder: byteOrder, + module: m, + byteOrder: byteOrder, + unwindInfoBuf: bytes.NewBuffer(unwindInfoArray), + buildIdMapping: make(map[string]uint64), } return maps, nil @@ -132,6 +151,11 @@ func (m *bpfMaps) create() error { return fmt.Errorf("get stack traces map: %w", err) } + unwindShards, err := m.module.GetMap(unwindShardsMapName) + if err != nil { + return fmt.Errorf("get unwind shards map: %w", err) + } + unwindTables, err := m.module.GetMap(unwindTablesMapName) if err != nil { return fmt.Errorf("get unwind tables map: %w", err) @@ -142,11 +166,19 @@ func (m *bpfMaps) create() error { return fmt.Errorf("get dwarf stack traces map: %w", err) } + processInfo, err := m.module.GetMap(processInfoMapName) + if err != nil { + return fmt.Errorf("get process info map: %w", err) + } + m.debugPIDs = debugPIDs m.stackCounts = stackCounts 
m.stackTraces = stackTraces + m.unwindShards = unwindShards m.unwindTables = unwindTables m.dwarfStackTraces = dwarfStackTraces + m.processInfo = processInfo + return nil } @@ -222,6 +254,16 @@ func (m *bpfMaps) readUserStackWithDwarf(userStackID int32, stack *combinedStack return fmt.Errorf("read user stack bytes, %s: %w", err, errUnrecoverable) } + /* userStack := stack[:stackDepth] + for i := 0; i < stackDepth; i++ { + if i < int(dwarfStack.Len) { + userStack[i] = dwarfStack.Addrs[i] + fmt.Printf("frame: %x\n", dwarfStack.Addrs[i]) + } else { + userStack[i] = 0 + } + } */ + userStack := stack[:stackDepth] for i, addr := range dwarfStack.Addrs { if i >= stackDepth || i >= int(dwarfStack.Len) || addr == 0 { @@ -310,147 +352,382 @@ func (m *bpfMaps) clean() error { return nil } -// setUnwindTable updates the unwind tables with the given unwind table. -func (m *bpfMaps) setUnwindTable(pid int, ut unwind.UnwindTable) error { - buf := new(bytes.Buffer) +func (m *bpfMaps) generateCompactUnwindTable(fullExecutablePath string, mapping *unwind.ExecutableMapping) (unwind.CompactUnwindTable, uint64, uint64, error) { + var minCoveredPc uint64 + var maxCoveredPc uint64 + var ut unwind.CompactUnwindTable + + // 1. Get FDEs + fdes, err := unwind.ReadFDEs(fullExecutablePath) // @nocommit: this should accept an ELF file perhaps. + if err != nil { + return ut, 0, 0, err + } + + sort.Sort(fdes) // hope this helps with efficiency, too + minCoveredPc = fdes[0].Begin() + maxCoveredPc = fdes[len(fdes)-1].End() + + // 2. Build unwind table + // 3. Get the compact, BPF-friendly representation + ut, err = unwind.BuildCompactUnwindTable(fdes) + if err != nil { + return ut, 0, 0, err + } + sort.Sort(ut) // 2.5 Sort @nocommit: perhaps sorting the BPF friendly one will be faster + + // now we have a full compact unwind table that we have to split across different BPF maps. + fmt.Println("=> found", len(ut), "unwind entries for", mapping.Executable, "low pc", fmt.Sprintf("%x", minCoveredPc), "high pc", fmt.Sprintf("%x", maxCoveredPc)) // @nocommit: remove + + return ut, minCoveredPc, maxCoveredPc, nil +} + +// writeUnwindTableRow writes a compact unwind table row to the provided buffer. +// +// Note: we write field by field to avoid extra allocations and to skip the +// reflection code paths in `binary.Write`, which also saves CPU time.
+func (m *bpfMaps) writeUnwindTableRow(buffer *bytes.Buffer, row unwind.CompactUnwindTableRow) error { + // .pc + if err := binary.Write(buffer, m.byteOrder, row.Pc()); err != nil { + return fmt.Errorf("write unwind table .pc bytes: %w", err) + } + + // .__reserved_do_not_use + if err := binary.Write(buffer, m.byteOrder, row.ReservedDoNotUse()); err != nil { + return fmt.Errorf("write unwind table __reserved_do_not_use bytes: %w", err) + } + + // .cfa_type + if err := binary.Write(buffer, m.byteOrder, row.CfaType()); err != nil { + return fmt.Errorf("write unwind table cfa_type bytes: %w", err) + } + + // .rbp_type + if err := binary.Write(buffer, m.byteOrder, row.RbpType()); err != nil { + return fmt.Errorf("write unwind table rbp_type bytes: %w", err) + } + + // .cfa_offset + if err := binary.Write(buffer, m.byteOrder, row.CfaOffset()); err != nil { + return fmt.Errorf("write unwind table cfa_offset bytes: %w", err) + } + + // .rbp_offset + if err := binary.Write(buffer, m.byteOrder, row.RbpOffset()); err != nil { + return fmt.Errorf("write unwind table rbp_offset bytes: %w", err) + } + + return nil +} + +// writeMapping writes the memory mapping information to the provided buffer. +// +// Note: we write field by field to avoid extra allocations and to skip the +// reflection code paths in `binary.Write`, which also saves CPU time. +func (m *bpfMaps) writeMapping(procInfoBuf *bytes.Buffer, loadAddress uint64, startAddr uint64, endAddr uint64, executableId uint64, type_ uint64) error { + // .load_address + if err := binary.Write(procInfoBuf, m.byteOrder, loadAddress); err != nil { + return fmt.Errorf("write mappings .load_address bytes: %w", err) + } + // .begin + if err := binary.Write(procInfoBuf, m.byteOrder, startAddr); err != nil { + return fmt.Errorf("write mappings .begin bytes: %w", err) + } + // .end + if err := binary.Write(procInfoBuf, m.byteOrder, endAddr); err != nil { + return fmt.Errorf("write mappings .end bytes: %w", err) + } + // .executable_id + if err := binary.Write(procInfoBuf, m.byteOrder, executableId); err != nil { + return fmt.Errorf("write proc info .executable_id bytes: %w", err) + } + // .type + if err := binary.Write(procInfoBuf, m.byteOrder, type_); err != nil { + return fmt.Errorf("write proc info .type bytes: %w", err) + } + + return nil +} - if len(ut) >= maxUnwindSize { - return fmt.Errorf("maximum unwind table size reached. Table size %d, but max size is %d", len(ut), maxUnwindSize) +// mappingId returns the internal identifier for a memory mapping. +// It will either return the already produced ID or generate a new +// one while indicating whether it was already seen or not. +// +// This allows us to reuse the unwind tables for the mappings we +// are dealing with. +func (m *bpfMaps) mappingId(buildId string) (uint64, bool) { + _, alreadySeenMapping := m.buildIdMapping[buildId] + if alreadySeenMapping { + fmt.Println("-> caching - seen this mapping before") + m.referencedMappings += 1 + } else { + fmt.Println("-> caching - new mapping") + + m.buildIdMapping[buildId] = m.executableId } - // Range-partition the unwind table in the different shards. - shardIndex := 0 - for i := 0; i < len(ut); i += maxUnwindTableSize { - upTo := i + maxUnwindTableSize - if upTo > len(ut) { - upTo = len(ut) + return m.buildIdMapping[buildId], alreadySeenMapping +} + +// PersistUnwindTable writes the current in-flight, writable shard +// to the corresponding BPF map's shard.
+// +// Note: as of now, this must be called in two situations: +// - In the callsite, once we are done with generating the unwind +// tables. +// - Whenever the current in-flight shard is full, before we wipe +// it and start reusing it. +func (m *bpfMaps) PersistUnwindTable() error { + totalRows := m.unwindInfoBuf.Len() / 16 + fmt.Println("unwind rows", totalRows) + shardIndex := uint64(m.shardIndex) + var err error + for i := 0; i < 100; i++ { + err = m.unwindTables.Update(unsafe.Pointer(&shardIndex), unsafe.Pointer(&m.unwindInfoBuf.Bytes()[0])) + if err == nil { + fmt.Println("~~ worked, rows:", totalRows, "try:", i) + return nil + } else { + fmt.Println("~~ failed:", err, "rows:", totalRows, "try:", i) + time.Sleep(100 * time.Millisecond) } + } + + return fmt.Errorf("update unwind tables: %w", err) +} - chunk := ut[i:upTo] +// availableEntries returns how many entries we have left +// in the in-flight shard. +func (m *bpfMaps) availableEntries() int { + return maxUnwindTableSize - m.highIndex +} - // Write `.low_pc` - if err := binary.Write(buf, m.byteOrder, chunk[0].Loc); err != nil { - return fmt.Errorf("write the number of rows: %w", err) +// assertInvariants checks that some invariants that should +// always be true during the execution of the program are held. +func (m *bpfMaps) assertInvariants() { + if m.lowIndex < 0 { + panic("m.lowIndex < 0, this should never happen") + } + if m.highIndex > maxUnwindTableSize { + panic("m.highIndex > 250k, this should never happen") + } +} + +// setUnwindTable sets all the necessary metadata and unwind tables, if needed +// to make DWARF unwinding work, such as: +// +// - Continue appending information to the executable mapping information for a process +// - Add mapping information +// - If unwind table is already present, we are done here +// - Otherwise, we generate the unwind table for this executable +func (m *bpfMaps) setUnwindTable(pid int, mapping *unwind.ExecutableMapping, procInfoBuf *bytes.Buffer) error { + fmt.Println("========================================================================================") + fmt.Println("setUnwindTable called (total shards:", m.shardIndex, ", total entries:", m.totalEntries, ")") + fmt.Println("========================================================================================") + + // Deal with mappings that are not file-backed. They don't have unwind + // information. + if mapping.IsNotFileBacked() { + var type_ uint64 + if mapping.IsJitted() { + fmt.Println("JIT section") + type_ = 1 } - // Write `.high_pc`. - if err := binary.Write(buf, m.byteOrder, chunk[len(chunk)-1].Loc); err != nil { - return fmt.Errorf("write the number of rows: %w", err) + if mapping.IsSpecial() { + fmt.Println("Special section") + type_ = 2 } - // Write number of rows `.table_len`. - if err := binary.Write(buf, m.byteOrder, uint64(len(chunk))); err != nil { - return fmt.Errorf("write the number of rows: %w", err) + + err := m.writeMapping(procInfoBuf, mapping.LoadAddr, mapping.StartAddr, mapping.EndAddr, uint64(0), type_) + if err != nil { + return fmt.Errorf("writing mappings failed with %w", err) + } + return nil + } + + // Deal with mappings that are backed by a file and might contain unwind + // information. + fullExecutablePath := path.Join("/proc/", fmt.Sprintf("%d", pid), "/root/", mapping.Executable) + + elfFile, err := elf.Open(fullExecutablePath) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil } - // Write padding.
- if err := binary.Write(buf, m.byteOrder, uint64(0)); err != nil { - return fmt.Errorf("write the number of rows: %w", err) + return fmt.Errorf("elf.Open failed: %w", err) + } + buildId, err := buildid.BuildID(&buildid.ElfFile{File: elfFile, Path: fullExecutablePath}) + if err != nil { + return fmt.Errorf("BuildID failed %s: %w", fullExecutablePath, err) + } + + // Find the adjusted load address. + aslrElegible := executable.IsASLRElegibleElf(elfFile) + + adjustedLoadAddress := uint64(0) + if mapping.IsMainObject() { + fmt.Println("!!!!!!! main object", mapping) + if aslrElegible { + adjustedLoadAddress = mapping.LoadAddr } - for _, row := range chunk { - // Right now we only support x86_64, where the return address position - // is specified in the ABI, so we don't write it. + } else { + adjustedLoadAddress = mapping.LoadAddr + } + + fmt.Println("[info] adding memory mappings for executable with ID", m.executableId, "buildId", buildId, "exec", mapping.Executable) + + // Add the memory mapping information. + foundExecutableId, mappingAlreadySeen := m.mappingId(buildId) + + err = m.writeMapping(procInfoBuf, adjustedLoadAddress, mapping.StartAddr, mapping.EndAddr, uint64(foundExecutableId), uint64(0)) + if err != nil { + return fmt.Errorf("writing mappings failed with %w", err) + } + + // Generate and add the unwind table, if needed. + if !mappingAlreadySeen { + + unwindShardsKeyBuf := new(bytes.Buffer) + unwindShardsValBuf := new(bytes.Buffer) - // Write Program Counter (PC). - if err := binary.Write(buf, m.byteOrder, row.Loc); err != nil { - return fmt.Errorf("write the program counter: %w", err) + chunkIndex := 0 + + // ==================================== generate unwind table + + ut, minCoveredPc, maxCoveredPc, err := m.generateCompactUnwindTable(fullExecutablePath, mapping) + if err != nil { + if err == unwind.ErrNoFDEsFound { + // is it ok to return here? + return nil + } + if err == unwind.ErrEhFrameSectionNotFound { + // is it ok to return here? + return nil } + return nil + } + + threshold := min(len(ut), m.availableEntries()) + currentChunk := ut[:threshold] + restChunks := ut[threshold:] - // Write __reserved_do_not_use. - if err := binary.Write(buf, m.byteOrder, uint16(0)); err != nil { - return fmt.Errorf("write CFA register bytes: %w", err) + numShards := 1 + len(restChunks)/maxUnwindTableSize // @nocommit: verify this + + // .len + if err := binary.Write(unwindShardsValBuf, m.byteOrder, uint64(numShards)); err != nil { + return fmt.Errorf("write shards .len bytes: %w", err) + } + + for { + m.assertInvariants() + + fmt.Println("- current chunk size", len(currentChunk)) + fmt.Println("- rest of chunk size", len(restChunks)) + + m.totalEntries += uint64(len(currentChunk)) + + if len(currentChunk) == 0 { + fmt.Println("!! done with the last chunk") + break } - var CfaRegister uint8 - var RbpRegister uint8 - var CfaOffset int16 - var RbpOffset int16 - - // CFA.
- switch row.CFA.Rule { - case frame.RuleCFA: - if row.CFA.Reg == frame.X86_64FramePointer { - CfaRegister = uint8(CfaRegisterRbp) - } else if row.CFA.Reg == frame.X86_64StackPointer { - CfaRegister = uint8(CfaRegisterRsp) - } - CfaOffset = int16(row.CFA.Offset) - case frame.RuleExpression: - CfaRegister = uint8(CfaRegisterExpression) - CfaOffset = int16(unwind.ExpressionIdentifier(row.CFA.Expression)) + m.highIndex += len(currentChunk) + fmt.Println("- lowindex [", m.lowIndex, ":", m.highIndex, "] highIndex") + + // ======================== shard info =============================== + // Set (executable ID) -> unwind table shards info + // basically have the info - default: - return fmt.Errorf("CFA rule is not valid. This should never happen") + fmt.Println("- executable", m.executableId, "mapping", mapping.Executable, "shard", chunkIndex) + if err := binary.Write(unwindShardsKeyBuf, m.byteOrder, uint64(m.executableId)); err != nil { + return fmt.Errorf("write shards key bytes: %w", err) } - // Frame pointer. - switch row.RBP.Rule { - case frame.RuleUndefined: - case frame.RuleOffset: - RbpRegister = uint8(RbpRuleOffset) - RbpOffset = int16(row.RBP.Offset) - case frame.RuleRegister: - RbpRegister = uint8(RbpRuleRegister) - case frame.RuleExpression: - RbpRegister = uint8(RbpRegisterExpression) + // note this might not be correct if using the unwind table info for the first or last items + minPc := currentChunk[0].Pc() + if chunkIndex == 0 { + minPc = uint64(minCoveredPc) + } + // .low_pc + if err := binary.Write(unwindShardsValBuf, m.byteOrder, minPc); err != nil { + return fmt.Errorf("write shards .low_pc bytes: %w", err) } - // Write CFA type (.cfa_type). - if err := binary.Write(buf, m.byteOrder, CfaRegister); err != nil { - return fmt.Errorf("write CFA register bytes: %w", err) + // note this might not be correct if using the unwind table info for the first or last items + maxPc := currentChunk[len(currentChunk)-1].Pc() + if chunkIndex == numShards { + maxPc = uint64(maxCoveredPc) + } + // .high_pc + if err := binary.Write(unwindShardsValBuf, m.byteOrder, maxPc); err != nil { + return fmt.Errorf("write shards .high_pc bytes: %w", err) } - // Write frame pointer type (.rbp_type). - if err := binary.Write(buf, m.byteOrder, RbpRegister); err != nil { - return fmt.Errorf("write CFA register bytes: %w", err) + // .shard_index + if err := binary.Write(unwindShardsValBuf, m.byteOrder, uint64(m.shardIndex)); err != nil { + return fmt.Errorf("write shards .shard_index bytes: %w", err) } - // Write CFA offset (.cfa_offset). - if err := binary.Write(buf, m.byteOrder, CfaOffset); err != nil { - return fmt.Errorf("write CFA offset bytes: %w", err) + // .low_index + if err := binary.Write(unwindShardsValBuf, m.byteOrder, uint64(m.lowIndex)); err != nil { + return fmt.Errorf("write shards .low_index bytes: %w", err) + } + // .high_index + if err := binary.Write(unwindShardsValBuf, m.byteOrder, uint64(m.highIndex)); err != nil { + return fmt.Errorf("write shards .high_index bytes: %w", err) } - // Write frame pointer offset (.rbp_offset). - if err := binary.Write(buf, m.byteOrder, RbpOffset); err != nil { - return fmt.Errorf("write RBP offset bytes: %w", err) + m.lowIndex = m.highIndex // @nocommit this is wrong??? 
+ + // ====================== Write unwind table ===================== + for _, row := range currentChunk { + if err := m.writeUnwindTableRow(m.unwindInfoBuf, row); err != nil { + return fmt.Errorf("writing unwind table row: %w", err) + } } - } - // Set (PID, shard ID) -> unwind table for each shard. - keyBuf := new(bytes.Buffer) - if err := binary.Write(keyBuf, m.byteOrder, int32(pid)); err != nil { - return fmt.Errorf("write RBP offset bytes: %w", err) - } - if err := binary.Write(keyBuf, m.byteOrder, int32(shardIndex)); err != nil { - return fmt.Errorf("write RBP offset bytes: %w", err) - } + // Need a new shard? + if m.availableEntries() == 0 { + fmt.Println("run out of space in the 'live' shard, creating a new one") + err := m.PersistUnwindTable() + if err != nil { + return fmt.Errorf("failed to write unwind table: %w", err) + } + m.shardIndex++ + m.unwindInfoBuf.Reset() // @nocommit is it stored?? + m.lowIndex = 0 + m.highIndex = 0 + + if m.shardIndex == unwindTableMaxEntries { + fmt.Println(m.buildIdMapping) + fmt.Println("Not enough shards - this is not implemented but we should deal with this") + } + } + + // Recalculate for next iteration + threshold := min(len(restChunks), m.availableEntries()) + currentChunk = restChunks[:threshold] + restChunks = restChunks[threshold:] - if err := m.unwindTables.Update(unsafe.Pointer(&keyBuf.Bytes()[0]), unsafe.Pointer(&buf.Bytes()[0])); err != nil { - return fmt.Errorf("update unwind tables: %w", err) + chunkIndex++ } - shardIndex++ - buf.Reset() - } - - // HACK(javierhonduco): remove this. - // Debug stuff to compare this with the BPF program's view of the world. - /* printRow := func(w io.Writer, pt unwind.UnwindTable, index int) { - cfaInfo := "" - switch ut[index].CFA.Rule { - case frame.RuleCFA: - cfaInfo = fmt.Sprintf("CFA Reg: %d Offset:%d", ut[index].CFA.Reg, ut[index].CFA.Offset) - case frame.RuleExpression: - cfaInfo = "CFA exp" - default: - panic("CFA rule is not valid. This should never happen.") + + if err := m.unwindShards.Update( + unsafe.Pointer(&unwindShardsKeyBuf.Bytes()[0]), + unsafe.Pointer(&unwindShardsValBuf.Bytes()[0])); err != nil { + return fmt.Errorf("failed to update unwind shard: %w", err) } - fmt.Fprintf(w, "\trow[%d]. Loc: %x, %s, $rbp: %d\n", index, pt[index].Loc, cfaInfo, pt[index].RBP.Offset) + m.executableId++ + m.uniqueMappings++ } - fmt.Fprintf(os.Stdout, "\t- Total entries %d\n\n", len(ut)) - printRow(os.Stdout, ut, 0) - printRow(os.Stdout, ut, 1) - printRow(os.Stdout, ut, 2) - printRow(os.Stdout, ut, 6) - printRow(os.Stdout, ut, len(ut)-1) */ + m.assertInvariants() + + // @nocommit NO SPACE LEFT + if m.availableEntries() == 0 { + panic("no space left, this should never happen") + } + // @nocommit TODO: check if we are full and flush if that's the case return nil } diff --git a/pkg/stack/unwind/unwind_table.go b/pkg/stack/unwind/unwind_table.go index 4b1b27e1b5..42941dcbbb 100644 --- a/pkg/stack/unwind/unwind_table.go +++ b/pkg/stack/unwind/unwind_table.go @@ -16,26 +16,22 @@ package unwind import ( "debug/elf" + "errors" "fmt" "io" - "path" - "sort" - "strings" "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/hashicorp/go-multierror" - "github.com/prometheus/procfs" "github.com/parca-dev/parca-agent/internal/dwarf/frame" - "github.com/parca-dev/parca-agent/pkg/executable" ) -// UnwindTableBuilder helps to build UnwindTable for a given PID. -// -// javierhonduco(note): Caching on PID alone will result in hard to debug issues as -// PIDs are reused. 
Right now we will parse the CIEs and FDEs over and over. Caching -// will be added later on. +var ( + ErrNoFDEsFound = errors.New("no FDEs found") + ErrEhFrameSectionNotFound = errors.New("failed to find .eh_frame section") +) + type UnwindTableBuilder struct { logger log.Logger } @@ -44,126 +40,6 @@ func NewUnwindTableBuilder(logger log.Logger) *UnwindTableBuilder { return &UnwindTableBuilder{logger: logger} } -type UnwindTable []UnwindTableRow - -func (t UnwindTable) Len() int { return len(t) } -func (t UnwindTable) Less(i, j int) bool { return t[i].Loc < t[j].Loc } -func (t UnwindTable) Swap(i, j int) { t[i], t[j] = t[j], t[i] } - -// TODO(kakkoyun): Unify with existing process maps mechanisms. -// - pkg/process/mappings.go -// The rest of the code base share a cache for process maps. - -// processMaps returns a map of file-backed memory mappings for a given -// process which contains at least one executable section. The value of -// mapping contains the metadata for the first mapping for each file, no -// matter if it's executable or not. -// -// This is needed as typically the first mapped section for a dynamic library -// is not executable, as it may contain only data, such as the `.bss` or the -// `.rodata` section. -func processMaps(pid int) (map[string]*procfs.ProcMap, string, error) { - p, err := procfs.NewProc(pid) - if err != nil { - return nil, "", fmt.Errorf("could not get process: %w", err) - } - maps, err := p.ProcMaps() - if err != nil { - return nil, "", fmt.Errorf("could not get maps: %w", err) - } - - // Find the file-backed memory mappings that contain at least one - // executable section. - filesWithSomeExecutable := make(map[string]bool) - for _, map_ := range maps { - if map_.Pathname != "" && map_.Perms.Execute { - filesWithSomeExecutable[map_.Pathname] = true - } - } - - dynamicExecutables := make(map[string]*procfs.ProcMap) - mainExecutable := "" - - // Find all the dynamically loaded libraries. We need to make sure - // that we skip the files that do not have a single executable mapping - // as these are just data. - for _, map_ := range maps { - path := map_.Pathname - if path == "" { - continue - } - if !strings.HasPrefix(path, "/") { - continue - } - // The first entry should be the "main" executable, and not - // a dynamic library. - if mainExecutable == "" { - mainExecutable = map_.Pathname - } - _, ok := dynamicExecutables[path] - if ok { - continue - } - - _, ok = filesWithSomeExecutable[path] - if ok { - dynamicExecutables[path] = map_ - } - } - - return dynamicExecutables, mainExecutable, nil -} - -func (ptb *UnwindTableBuilder) UnwindTableForPid(pid int) (UnwindTable, error) { - mappedFiles, mainExec, err := processMaps(pid) - if err != nil { - return nil, fmt.Errorf("error opening the maps %w", err) - } - - ut := UnwindTable{} - for _, m := range mappedFiles { - executablePath := path.Join(fmt.Sprintf("/proc/%d/root", pid), m.Pathname) - - level.Info(ptb.logger).Log("msg", "finding tables for mapped executable", "path", executablePath, "starting address", fmt.Sprintf("%x", m.StartAddr)) - fdes, err := ptb.readFDEs(executablePath) - // TODO(javierhonduco): Add markers in between executable sections. 
- if err != nil { - level.Error(ptb.logger).Log("msg", "failed to read frame description entries", "obj", executablePath, "err", err) - continue - } - - rows := ptb.buildUnwindTable(fdes) - if len(rows) == 0 { - level.Error(ptb.logger).Log("msg", "unwind table empty for", "obj", executablePath) - continue - } - - level.Info(ptb.logger).Log("msg", "adding tables for mapped executable", "path", executablePath, "rows", len(rows), "low pc", fmt.Sprintf("%x", rows[0].Loc), "high pc", fmt.Sprintf("%x", rows[len(rows)-1].Loc)) - - aslrElegible, err := executable.IsASLRElegible(executablePath) - if err != nil { - return nil, fmt.Errorf("ASLR check failed with with: %w", err) - } - - if strings.Contains(executablePath, mainExec) { - if aslrElegible { - for i := range rows { - rows[i].Loc += uint64(m.StartAddr) - } - } - } else { - for i := range rows { - rows[i].Loc += uint64(m.StartAddr) - } - } - ut = append(ut, rows...) - } - - // Sort the entries so we can binary search over them. - sort.Sort(ut) - return ut, nil -} - func x64RegisterToString(reg uint64) string { // TODO(javierhonduco): // - add source for this table. @@ -183,7 +59,7 @@ func x64RegisterToString(reg uint64) string { // PrintTable is a debugging helper that prints the unwinding table to the given io.Writer. func (ptb *UnwindTableBuilder) PrintTable(writer io.Writer, path string, compact bool) error { - fdes, err := ptb.readFDEs(path) + fdes, err := ReadFDEs(path) if err != nil { return err } @@ -261,7 +137,7 @@ func (ptb *UnwindTableBuilder) PrintTable(writer io.Writer, path string, compact return nil } -func (ptb *UnwindTableBuilder) readFDEs(path string) (frame.FrameDescriptionEntries, error) { +func ReadFDEs(path string) (frame.FrameDescriptionEntries, error) { obj, err := elf.Open(path) if err != nil { return nil, fmt.Errorf("failed to open elf: %w", err) @@ -270,7 +146,7 @@ func (ptb *UnwindTableBuilder) readFDEs(path string) (frame.FrameDescriptionEntr sec := obj.Section(".eh_frame") if sec == nil { - return nil, fmt.Errorf("failed to find .eh_frame section") + return nil, ErrEhFrameSectionNotFound } // TODO(kakkoyun): Consider using the debug_frame section as a fallback. @@ -286,18 +162,22 @@ func (ptb *UnwindTableBuilder) readFDEs(path string) (frame.FrameDescriptionEntr return nil, fmt.Errorf("failed to parse frame data: %w", err) } + if len(fdes) == 0 { + return nil, ErrNoFDEsFound + } + return fdes, nil } -func (ptb *UnwindTableBuilder) buildUnwindTable(fdes frame.FrameDescriptionEntries) UnwindTable { +func BuildUnwindTable(fdes frame.FrameDescriptionEntries) UnwindTable { // The frame package can raise in case of malformed unwind data. 
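+ // A panic while evaluating the DWARF programs of a single executable should not
+ // bring down the whole agent, hence the recover below.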
+ table := make(UnwindTable, 0, 4*len(fdes)) // heuristic
 defer func() {
 if r := recover(); r != nil {
- level.Info(ptb.logger).Log("msg", "recovered a panic in buildUnwindTable", "stack", r)
+ // TODO: log the recovered panic (previously via level.Info) once a logger is available here.
 }
 }()

- table := make(UnwindTable, 0)
 for _, fde := range fdes {
 frameContext := frame.ExecuteDwarfProgram(fde, nil)
 for insCtx := frameContext.Next(); frameContext.HasNext(); insCtx = frameContext.Next() {
@@ -323,6 +203,12 @@ type UnwindTableRow struct {
 RA frame.DWRule
 }

+type UnwindTable []UnwindTableRow
+
+func (t UnwindTable) Len() int { return len(t) }
+func (t UnwindTable) Less(i, j int) bool { return t[i].Loc < t[j].Loc }
+func (t UnwindTable) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
+
 func unwindTableRow(instructionContext *frame.InstructionContext) *UnwindTableRow {
 if instructionContext == nil {
 return nil
diff --git a/pkg/stack/unwind/unwind_table_test.go b/pkg/stack/unwind/unwind_table_test.go
index 80fb513b44..4a1a8f064d 100644
--- a/pkg/stack/unwind/unwind_table_test.go
+++ b/pkg/stack/unwind/unwind_table_test.go
@@ -17,20 +17,16 @@ package unwind
 import (
 "testing"

- "github.com/go-kit/log"
 "github.com/stretchr/testify/require"

 "github.com/parca-dev/parca-agent/internal/dwarf/frame"
 )

 func TestBuildUnwindTable(t *testing.T) {
- logger := log.NewNopLogger()
- utb := NewUnwindTableBuilder(logger)
-
- fdes, err := utb.readFDEs("../../../testdata/out/basic-cpp")
+ fdes, err := ReadFDEs("../../../testdata/out/basic-cpp")
 require.NoError(t, err)

- unwindTable := utb.buildUnwindTable(fdes)
+ unwindTable := BuildUnwindTable(fdes)
 require.Equal(t, 38, len(unwindTable))

 require.Equal(t, uint64(0x401020), unwindTable[0].Loc)
@@ -47,12 +43,10 @@ func benchmarkParsingDwarfUnwindInformation(b *testing.B, executable string) {
 b.Helper()
 b.ReportAllocs()

- logger := log.NewNopLogger()
 var rbpOffset int64
- utb := NewUnwindTableBuilder(logger)

 for n := 0; n < b.N; n++ {
- fdes, err := utb.readFDEs(executable)
+ fdes, err := ReadFDEs(executable)
 if err != nil {
 panic("could not read FDEs")
 }
diff --git a/things_to_do_next.txt b/things_to_do_next.txt
new file mode 100644
index 0000000000..d6cd4ab1ea
--- /dev/null
+++ b/things_to_do_next.txt
@@ -0,0 +1,36 @@
+1. Unwind table generation doesn't have to happen if the mappings are cached
+2. Clean up the code
+3. Add JIT checks?
+4. Refresh PIDs from time to time
+5. Debugging endpoint
+6. Re-do unwind info if shards are full
+
+================
+later
+- don't write all the time
+- reuse buffers
+
+================
+
+1. Read the code
+2. Make it better
+3. Try to improve it
+4. Add WebUI
+5. Add JIT detection?
+6. Do lots of testing (edge cases, etc.)
+7. Add caching
+
+
+
+
+== bugs
+
+- nginx fails sometimes
+- make && sudo dist/parca-agent --node=test --remote-store-insecure --remote-store-address=127.0.0.1:7070 --experimental-enable-dwarf-unwinding --debug-process-names="(systemd|python|ruby|irb|bash)" fails
+
+perhaps an edge case?
+
+- left 226562 right 250000
+
+
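Not part of the patch: a minimal, hypothetical sketch of how the newly exported helpers fit together, mirroring the updated test. The testdata path and the printed fields are only illustrative.

package main

import (
	"errors"
	"fmt"

	"github.com/parca-dev/parca-agent/pkg/stack/unwind"
)

func main() {
	// Path taken from the repository's test data; any ELF executable with a .eh_frame section works.
	fdes, err := unwind.ReadFDEs("testdata/out/basic-cpp")
	// The new sentinel errors make "no unwind information" distinguishable from real failures.
	if errors.Is(err, unwind.ErrEhFrameSectionNotFound) || errors.Is(err, unwind.ErrNoFDEsFound) {
		fmt.Println("no usable unwind information:", err)
		return
	}
	if err != nil {
		panic(err)
	}

	// BuildUnwindTable is now a free function: no UnwindTableBuilder or logger required.
	table := unwind.BuildUnwindTable(fdes)
	if len(table) > 0 {
		fmt.Printf("rows: %d, first pc: %x\n", len(table), table[0].Loc)
	}
}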