diff --git a/.gitignore b/.gitignore index a55413249c..a92f284140 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ TODO.md minikube-* /data +bpf_logs_*.txt + # Snap Packaging Artifacts *.snap snap/local/parca-agent diff --git a/bpf/.clang-format b/bpf/.clang-format new file mode 100644 index 0000000000..1dbb17ab05 --- /dev/null +++ b/bpf/.clang-format @@ -0,0 +1,5 @@ +--- +BasedOnStyle: LLVM +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +ColumnLimit: 120 diff --git a/bpf/Makefile b/bpf/Makefile index f9b2624423..1c36353418 100644 --- a/bpf/Makefile +++ b/bpf/Makefile @@ -11,7 +11,7 @@ format: c/fmt .PHONY: c/fmt c/fmt: - clang-format -i --style=LLVM $(BPF_SRC) $(BPF_HEADERS) + clang-format -i --style=file $(BPF_SRC) $(BPF_HEADERS) .PHONY: format-check format-check: diff --git a/bpf/cpu/cpu.bpf.c b/bpf/cpu/cpu.bpf.c index 81c9083039..852caa3a85 100644 --- a/bpf/cpu/cpu.bpf.c +++ b/bpf/cpu/cpu.bpf.c @@ -12,6 +12,10 @@ #include "../common.h" #include "hash.h" +//#include +enum { + BPF_F_NO_PREALLOC = (1U << 0), +}; #include #include #include @@ -22,19 +26,31 @@ // Number of frames to walk per tail call iteration. #define MAX_STACK_DEPTH_PER_PROGRAM 15 // Number of BPF tail calls that will be attempted. +// +// invariant: `MAX_TAIL_CALLS * MAX_STACK_DEPTH_PER_PROGRAM` >= +// `MAX_STACK_DEPTH` #define MAX_TAIL_CALLS 10 -// Number of frames to walk in total. +// Maximum number of frames. #define MAX_STACK_DEPTH 127 -// Number of stacks. -#define MAX_STACK_TRACES 1024 +// Number of unique stacks. +#define MAX_STACK_TRACES_ENTRIES 1024 // Number of items in the stack counts aggregation map. #define MAX_STACK_COUNTS_ENTRIES 10240 +// Maximum number of processes we are willing to track. +#define MAX_PROCESSES 1500 // Binary search iterations for dwarf based stack walking. -// 2^20 can bisect ~1_048_576 entries. -#define MAX_BINARY_SEARCH_DEPTH 20 +// 2^19 can bisect ~524_288 entries. +// +// invariant: `2^MAX_BINARY_SEARCH_DEPTH >= MAX_UNWIND_TABLE_SIZE` +#define MAX_BINARY_SEARCH_DEPTH 19 // Size of the unwind table. +// 250k * sizeof(stack_unwind_row_t) = ~4MB #define MAX_UNWIND_TABLE_SIZE 250 * 1000 -#define MAX_SHARDS 6 +// Unwind tables that can't fit in the remaining space of the +// current shard are broken up into chunks of up to `MAX_UNWIND_TABLE_SIZE` entries. +#define MAX_UNWIND_TABLE_CHUNKS 30 +// Maximum memory mappings per process. +#define MAX_MAPPINGS_PER_PROCESS 120 // Values for dwarf expressions.
#define DWARF_EXPRESSION_UNKNOWN 0 @@ -71,34 +87,57 @@ const volatile struct config_t config = {}; /*============================== MACROS =====================================*/ -#define BPF_MAP(_name, _type, _key_type, _value_type, _max_entries) \ - struct { \ - __uint(type, _type); \ - __uint(max_entries, _max_entries); \ - __type(key, _key_type); \ - __type(value, _value_type); \ +#define BPF_MAP(_name, _type, _key_type, _value_type, _max_entries) \ + struct { \ + __uint(type, _type); \ + __uint(max_entries, _max_entries); \ + __type(key, _key_type); \ + __type(value, _value_type); \ } _name SEC(".maps"); // Stack Traces are slightly different // in that the value is 1 big byte array // of the stack addresses typedef __u64 stack_trace_type[MAX_STACK_DEPTH]; -#define BPF_STACK_TRACE(_name, _max_entries) \ +#define BPF_STACK_TRACE(_name, _max_entries) \ BPF_MAP(_name, BPF_MAP_TYPE_STACK_TRACE, u32, stack_trace_type, _max_entries); -#define BPF_HASH(_name, _key_type, _value_type, _max_entries) \ +#define BPF_HASH(_name, _key_type, _value_type, _max_entries) \ BPF_MAP(_name, BPF_MAP_TYPE_HASH, _key_type, _value_type, _max_entries); -#define DEFINE_COUNTER(__func__name) \ - static void BUMP_##__func__name() { \ - u32 *c = bpf_map_lookup_elem(&percpu_stats, &__func__name); \ - if (c != NULL) { \ - *c += 1; \ - } \ +#define DEFINE_COUNTER(__func__name) \ + static void BUMP_##__func__name() { \ + u32 *c = bpf_map_lookup_elem(&percpu_stats, &__func__name); \ + if (c != NULL) { \ + *c += 1; \ + } \ } /*============================= INTERNAL STRUCTS ============================*/ +// cheat: +// +// pid -> mapping_id +// mapping_id -> executable_id +// executable_id -> table_shards +// +// now we can find the shard + +// Unwind table shard. +typedef struct shard_info { + u64 low_pc; + u64 high_pc; + u64 shard_index; + u64 low_index; + u64 high_index; +} shard_info_t; + +// Unwind table shards for an executable mapping. +typedef struct stack_unwind_table_shards { + u64 len; + shard_info_t shards[MAX_UNWIND_TABLE_CHUNKS]; +} stack_unwind_table_shards_t; + // The addresses of a native stack trace. typedef struct stack_trace_t { u64 len; @@ -113,11 +152,24 @@ typedef struct stack_count_key { int user_stack_id_dwarf; } stack_count_key_t; -typedef struct unwind_tables_key { - int pid; - int shard; -} unwind_tables_key_t; +// Represents an executable mapping. +typedef struct mapping { + u64 load_address; + u64 begin; + u64 end; + u64 executable_id; + u64 type; +} mapping_t; + +// Executable mappings for a process. +typedef struct { + u64 is_jit_compiler; + u64 len; + mapping_t mappings[MAX_MAPPINGS_PER_PROCESS]; +} process_info_t; +// State of unwinder such as the registers as well +// as internal data. typedef struct unwind_state { u64 ip; u64 sp; @@ -127,17 +179,6 @@ typedef struct unwind_state { } unwind_state_t; // A row in the stack unwinding table. -// PERF(javierhonduco): in the future, split this struct from a buffer of -// `stack_unwind_row` to multiple buffers containing each field. That way we -// would be able to not only have more entries, but we would increase -// performance as more data will be able to fit in the CPU cache. -// -// This is particularly important for the program counter => map + -// map. the second map can be split further if we decide to do -// so. -// -// This is at the cost of code readability, so should only be done if -// experiments confirm this theory. 
typedef struct stack_unwind_row { u64 pc; u16 __reserved_do_not_use; @@ -148,11 +189,7 @@ typedef struct stack_unwind_row { } stack_unwind_row_t; // Unwinding table representation. -typedef struct stack_unwind_table_t { - u64 low_pc; - u64 high_pc; - u64 table_len; // items of the table, as the max size is static. - u64 __explicit_padding; +typedef struct stack_unwind_table { stack_unwind_row_t rows[MAX_UNWIND_TABLE_SIZE]; } stack_unwind_table_t; @@ -173,15 +210,22 @@ u32 UNWIND_SHOULD_NEVER_HAPPEN_ERROR = 5; u32 UNWIND_PC_NOT_COVERED_ERROR = 6; // Keep track of total samples. u32 UNWIND_SAMPLES_COUNT = 7; +u32 UNWIND_JIT_ERRORS = 8; /*================================ MAPS =====================================*/ -BPF_HASH(debug_pids, int, u8, 32); +BPF_HASH(debug_pids, int, u8, MAX_PROCESSES); +BPF_HASH(process_info, int, process_info_t, MAX_PROCESSES); + +BPF_STACK_TRACE(stack_traces, MAX_STACK_TRACES_ENTRIES); +BPF_HASH(dwarf_stack_traces, int, stack_trace_t, MAX_STACK_TRACES_ENTRIES); BPF_HASH(stack_counts, stack_count_key_t, u64, MAX_STACK_COUNTS_ENTRIES); -BPF_STACK_TRACE(stack_traces, MAX_STACK_TRACES); -BPF_HASH(dwarf_stack_traces, int, stack_trace_t, MAX_STACK_TRACES); -BPF_HASH(unwind_tables, unwind_tables_key_t, stack_unwind_table_t, - 2); // Table size will be updated in userspace. + +// executable_chunks? +BPF_HASH(unwind_shards, u64, stack_unwind_table_shards_t, + 5 * 1000); // @nocommit: update +BPF_HASH(unwind_tables, u64, stack_unwind_table_t, + 5); // Table size will be updated in userspace. struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); @@ -212,6 +256,7 @@ DEFINE_COUNTER(UNWIND_UNSUPPORTED_EXPRESSION); DEFINE_COUNTER(UNWIND_SHOULD_NEVER_HAPPEN_ERROR); DEFINE_COUNTER(UNWIND_CATCHALL_ERROR); DEFINE_COUNTER(UNWIND_PC_NOT_COVERED_ERROR); +DEFINE_COUNTER(UNWIND_JIT_ERRORS); static void unwind_print_stats() { u32 *success_counter = bpf_map_lookup_elem(&percpu_stats, &UNWIND_SUCCESS); @@ -219,48 +264,48 @@ static void unwind_print_stats() { return; } - u32 *total_counter = - bpf_map_lookup_elem(&percpu_stats, &UNWIND_SAMPLES_COUNT); + u32 *total_counter = bpf_map_lookup_elem(&percpu_stats, &UNWIND_SAMPLES_COUNT); if (total_counter == NULL) { return; } - u32 *truncated_counter = - bpf_map_lookup_elem(&percpu_stats, &UNWIND_TRUNCATED); + u32 *truncated_counter = bpf_map_lookup_elem(&percpu_stats, &UNWIND_TRUNCATED); if (truncated_counter == NULL) { return; } - u32 *unsup_expression = - bpf_map_lookup_elem(&percpu_stats, &UNWIND_UNSUPPORTED_EXPRESSION); + u32 *unsup_expression = bpf_map_lookup_elem(&percpu_stats, &UNWIND_UNSUPPORTED_EXPRESSION); if (unsup_expression == NULL) { return; } - u32 *not_covered_count = - bpf_map_lookup_elem(&percpu_stats, &UNWIND_PC_NOT_COVERED_ERROR); + u32 *not_covered_count = bpf_map_lookup_elem(&percpu_stats, &UNWIND_PC_NOT_COVERED_ERROR); if (not_covered_count == NULL) { return; } - u32 *catchall_count = - bpf_map_lookup_elem(&percpu_stats, &UNWIND_CATCHALL_ERROR); + u32 *catchall_count = bpf_map_lookup_elem(&percpu_stats, &UNWIND_CATCHALL_ERROR); if (catchall_count == NULL) { return; } - u32 *never = - bpf_map_lookup_elem(&percpu_stats, &UNWIND_SHOULD_NEVER_HAPPEN_ERROR); + u32 *never = bpf_map_lookup_elem(&percpu_stats, &UNWIND_SHOULD_NEVER_HAPPEN_ERROR); if (never == NULL) { return; } + u32 *jit_errors = bpf_map_lookup_elem(&percpu_stats, &UNWIND_JIT_ERRORS); + if (jit_errors == NULL) { + return; + } + bpf_printk("[[ stats for cpu %d ]]", (int)bpf_get_smp_processor_id()); bpf_printk("success=%lu", *success_counter); 
bpf_printk("unsup_expression=%lu", *unsup_expression); bpf_printk("truncated=%lu", *truncated_counter); bpf_printk("catchall=%lu", *catchall_count); bpf_printk("never=%lu", *never); + bpf_printk("jit_failure=%lu", *jit_errors); bpf_printk("total_counter=%lu", *total_counter); bpf_printk("(not_covered=%lu)", *not_covered_count); @@ -276,8 +321,7 @@ static void bump_samples() { } } -static __always_inline void * -bpf_map_lookup_or_try_init(void *map, const void *key, const void *init) { +static __always_inline void *bpf_map_lookup_or_try_init(void *map, const void *key, const void *init) { void *val; long err; @@ -297,9 +341,7 @@ bpf_map_lookup_or_try_init(void *map, const void *key, const void *init) { // Binary search the unwind table to find the row index containing the unwind // information for a given program counter (pc). -static u64 find_offset_for_pc(stack_unwind_table_t *table, u64 pc) { - u64 left = 0; - u64 right = table->table_len; +static u64 find_offset_for_pc(stack_unwind_table_t *table, u64 pc, u64 left, u64 right) { u64 found = BINARY_SEARCH_NOT_FOUND; for (int i = 0; i < MAX_BINARY_SEARCH_DEPTH; i++) { @@ -314,7 +356,7 @@ static u64 find_offset_for_pc(stack_unwind_table_t *table, u64 pc) { // Appease the verifier. if (mid < 0 || mid >= MAX_UNWIND_TABLE_SIZE) { - bpf_printk("\t.should never happen"); + bpf_printk("\t.should never happen, mid: %lu, max: %lu", mid, MAX_UNWIND_TABLE_SIZE); BUMP_UNWIND_SHOULD_NEVER_HAPPEN_ERROR(); return BINARY_SEARCH_SHOULD_NEVER_HAPPEN; } @@ -338,26 +380,11 @@ static u64 find_offset_for_pc(stack_unwind_table_t *table, u64 pc) { return BINARY_SEARCH_EXHAUSTED_ITERATIONS; } -// Print an unwinding table row for debugging. -static __always_inline void show_row(stack_unwind_table_t *unwind_table, - int index) { - /* - u64 pc = unwind_table->rows[index].pc; - u16 cfa_type = unwind_table->rows[index].cfa_type; - s16 cfa_offset = unwind_table->rows[index].cfa_offset; - s16 rbp_offset = unwind_table->rows[index].rbp_offset; - - bpf_printk("~ %d entry. Loc: %llx, CFA reg: %d Offset: %d, $rbp %d", index, - pc, cfa_type, cfa_offset, rbp_offset); */ -} - // Finds whether a process should be unwound using the unwind // tables. static __always_inline bool has_unwind_information(pid_t pid) { - unwind_tables_key_t key = {.pid = pid, .shard = 0}; - - stack_unwind_table_t *shard1 = bpf_map_lookup_elem(&unwind_tables, &key); - if (shard1) { + process_info_t *proc_info = bpf_map_lookup_elem(&process_info, &pid); + if (proc_info) { return true; } return false; @@ -371,35 +398,105 @@ static __always_inline bool is_debug_enabled_for_pid(int pid) { return false; } -// Finds the unwind table for a given pid and program counter. -// Returns NULL if it can't be found, so this function can't be used to detect -// how should we unwind the native stack for a process. See -// `has_unwind_information()`. 
-static __always_inline stack_unwind_table_t *find_unwind_table(pid_t pid, - u64 pc) { - unwind_tables_key_t key = {.pid = pid, .shard = 0}; - - for (int i = 0; i < MAX_SHARDS; i++) { - key.shard = i; - stack_unwind_table_t *shard = bpf_map_lookup_elem(&unwind_tables, &key); - if (shard) { - if (shard->low_pc <= pc && pc <= shard->high_pc) { - bpf_printk("\t Shard %d", i); - return shard; - } +enum find_unwind_table_return { + FIND_UNWIND_SUCCESS = 1, + + FIND_UNWIND_MAPPING_SHOULD_NEVER_HAPPEN = 2, + FIND_UNWIND_MAPPING_EXHAUSTED_SEARCH = 3, + FIND_UNWIND_MAPPING_NOT_FOUND = 4, + FIND_UNWIND_SHARD_UNSET = 5, + FIND_UNWIND_SHARD_EXHAUSTED_SEARCH = 6, + FIND_UNWIND_SHARD_NOT_FOUND = 7, + + FIND_UNWIND_JITTED = 100, + FIND_UNWIND_SPECIAL = 200, +}; + +// Finds the shard information for a given pid and program counter. Optionally, +// an offset can be passed that will be filled in with the mapping's load +// address. +static __always_inline enum find_unwind_table_return find_unwind_table(shard_info_t **shard_info, pid_t pid, u64 pc, + u64 *offset) { + process_info_t *proc_info = bpf_map_lookup_elem(&process_info, &pid); + // Appease the verifier. + if (proc_info == NULL) { + bpf_printk("[error] should never happen"); + return FIND_UNWIND_MAPPING_SHOULD_NEVER_HAPPEN; + } + + bool found = false; + u64 executable_id = 0; + u64 load_address = 0; + u64 type = 0; + + // Find the mapping. + for (int i = 0; i < MAX_MAPPINGS_PER_PROCESS; i++) { + if (i > proc_info->len) { + bpf_printk("[info] mapping not found, i (%d) > proc_info->len (%d) pc: %llx", i, proc_info->len, pc); + return FIND_UNWIND_MAPPING_EXHAUSTED_SEARCH; + } + + // Appease the verifier. + if (i < 0 || i > MAX_MAPPINGS_PER_PROCESS) { + bpf_printk("[error] should never happen, verifier"); + return FIND_UNWIND_MAPPING_SHOULD_NEVER_HAPPEN; + } + + if (proc_info->mappings[i].begin <= pc && pc <= proc_info->mappings[i].end) { + found = true; + executable_id = proc_info->mappings[i].executable_id; + load_address = proc_info->mappings[i].load_address; + type = proc_info->mappings[i].type; + break; + } + } + + if (found) { + if (offset != NULL) { + *offset = load_address; + } + if (type == 1) { + return FIND_UNWIND_JITTED; + } + if (type == 2) { + return FIND_UNWIND_SPECIAL; } + } else { + bpf_printk("[warn] :((( no mapping for ip=%llx", pc); + return FIND_UNWIND_MAPPING_NOT_FOUND; } - bpf_printk("[warn] no unwind table contains PC=%llx", pc); - return NULL; + bpf_printk("~about to check shards found=%d", found); + bpf_printk("~checking shards now"); + + // Find the shard where this unwind table lives. + stack_unwind_table_shards_t *shards = bpf_map_lookup_elem(&unwind_shards, &executable_id); + if (shards == NULL) { + bpf_printk("[info] shards is null for executable %llu", executable_id); + return FIND_UNWIND_SHARD_NOT_FOUND; + } + + for (int i = 0; i < MAX_UNWIND_TABLE_CHUNKS; i++) { + if (i > shards->len) { + return FIND_UNWIND_SHARD_EXHAUSTED_SEARCH; + } + + if (shards->shards[i].low_pc <= pc - load_address && pc - load_address <= shards->shards[i].high_pc) { + bpf_printk("[info] found shard"); + *shard_info = &shards->shards[i]; + return FIND_UNWIND_SUCCESS; + } + } + + bpf_printk("[error] could not find the right shard..."); + return FIND_UNWIND_SHARD_NOT_FOUND; } -static __always_inline void add_stacks(struct bpf_perf_event_data *ctx, - u64 pid_tgid, - enum stack_walking_method method, - unwind_state_t *unwind_state) { +// Aggregate the given stacktrace.
+static __always_inline void add_stack(struct bpf_perf_event_data *ctx, u64 pid_tgid, enum stack_walking_method method, + unwind_state_t *unwind_state) { u64 zero = 0; - stack_count_key_t stack_key = {}; + stack_count_key_t stack_key = {0}; // The `bpf_get_current_pid_tgid` helpers returns // `current_task->tgid << 32 | current_task->pid`, the naming can be @@ -420,21 +517,20 @@ static __always_inline void add_stacks(struct bpf_perf_event_data *ctx, } if (method == STACK_WALKING_METHOD_DWARF) { - int stack_hash = - MurmurHash2((u32 *)unwind_state->stack.addresses, - MAX_STACK_DEPTH * sizeof(u64) / sizeof(u32), 0); + int stack_hash = MurmurHash2((u32 *)unwind_state->stack.addresses, MAX_STACK_DEPTH * sizeof(u64) / sizeof(u32), 0); bpf_printk("stack hash %d", stack_hash); stack_key.user_stack_id_dwarf = stack_hash; stack_key.user_stack_id = 0; // Insert stack. - bpf_map_update_elem(&dwarf_stack_traces, &stack_hash, &unwind_state->stack, - BPF_ANY); + bpf_map_update_elem(&dwarf_stack_traces, &stack_hash, &unwind_state->stack, BPF_ANY); } else if (method == STACK_WALKING_METHOD_FP) { int stack_id = bpf_get_stackid(ctx, &stack_traces, BPF_F_USER_STACK); if (stack_id >= 0) { stack_key.user_stack_id = stack_id; stack_key.user_stack_id_dwarf = 0; + } else { + // bpf_printk("fp failed\n"); } } @@ -445,6 +541,7 @@ static __always_inline void add_stacks(struct bpf_perf_event_data *ctx, } } +// The unwinding machinery lives here. SEC("perf_event") int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { u64 pid_tgid = bpf_get_current_pid_tgid(); @@ -459,7 +556,6 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { return 1; } - // #pragma clang loop unroll(full) for (int i = 0; i < MAX_STACK_DEPTH_PER_PROGRAM; i++) { bpf_printk("## frame: %d", i); @@ -467,24 +563,42 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { bpf_printk("\tcurrent sp: %llx", unwind_state->sp); bpf_printk("\tcurrent bp: %llx", unwind_state->bp); - stack_unwind_table_t *unwind_table = - find_unwind_table(user_pid, unwind_state->ip); + u64 offset = 0; + shard_info_t *shard = NULL; + enum find_unwind_table_return unwind_table_result = find_unwind_table(&shard, user_pid, unwind_state->ip, &offset); - if (unwind_table == NULL) { + if (unwind_table_result == FIND_UNWIND_JITTED) { + bpf_printk("JIT section, stopping"); + return 1; + } else if (unwind_table_result == FIND_UNWIND_SPECIAL) { + bpf_printk("special section, stopping"); + return 1; + } else if (shard == NULL) { + // improve reached_bottom_of_stack = true; break; } - u64 table_idx = find_offset_for_pc(unwind_table, unwind_state->ip); + stack_unwind_table_t *unwind_table = bpf_map_lookup_elem(&unwind_tables, &shard->shard_index); + if (unwind_table == NULL) { + bpf_printk("unwind table is null :( for shard %llu", shard->shard_index); + return 0; + } - if (table_idx == BINARY_SEARCH_NOT_FOUND || - table_idx == BINARY_SEARCH_SHOULD_NEVER_HAPPEN || + bpf_printk("le offset: %llx", offset); + u64 left = shard->low_index; + u64 right = shard->high_index; + bpf_printk("========== left %llu right %llu", left, right); + u64 table_idx = find_offset_for_pc(unwind_table, unwind_state->ip - offset, left, right); + + if (table_idx == BINARY_SEARCH_NOT_FOUND || table_idx == BINARY_SEARCH_SHOULD_NEVER_HAPPEN || table_idx == BINARY_SEARCH_EXHAUSTED_ITERATIONS) { bpf_printk("[error] binary search failed with %llx", table_idx); return 1; } bpf_printk("\t=> table_index: %d", table_idx); + bpf_printk("\t=> adjusted pc: %llx", unwind_state->ip - offset); // 
Appease the verifier. if (table_idx < 0 || table_idx >= MAX_UNWIND_TABLE_SIZE) { @@ -508,13 +622,10 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { s16 found_cfa_offset = unwind_table->rows[table_idx].cfa_offset; s16 found_rbp_offset = unwind_table->rows[table_idx].rbp_offset; - bpf_printk("\tcfa type: %d, offset: %d (row pc: %llx)", found_cfa_type, - found_cfa_offset, found_pc); + bpf_printk("\tcfa type: %d, offset: %d (row pc: %llx)", found_cfa_type, found_cfa_offset, found_pc); - if (found_rbp_type == RBP_TYPE_REGISTER || - found_rbp_type == RBP_TYPE_EXPRESSION) { - bpf_printk("\t!!!! frame pointer is %d (register or exp), bailing out", - found_rbp_type); + if (found_rbp_type == RBP_TYPE_REGISTER || found_rbp_type == RBP_TYPE_EXPRESSION) { + bpf_printk("\t[error] frame pointer is %d (register or exp), bailing out", found_rbp_type); BUMP_UNWIND_CATCHALL_ERROR(); return 1; } @@ -545,11 +656,9 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { return 1; } - previous_rsp = unwind_state->sp + 8 + - ((((unwind_state->ip & 15) >= threshold)) << 3); + previous_rsp = unwind_state->sp + 8 + ((((unwind_state->ip & 15) >= threshold)) << 3); } else { - bpf_printk("\t[error] register %d not valid (expected $rbp or $rsp)", - found_cfa_type); + bpf_printk("\t[error] register %d not valid (expected $rbp or $rsp)", found_cfa_type); BUMP_UNWIND_CATCHALL_ERROR(); return 1; } @@ -565,16 +674,24 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { // HACK(javierhonduco): This is an architectural shortcut we can take. As we // only support x86_64 at the minute, we can assume that the return address // is *always* 8 bytes ahead of the previous stack pointer. - u64 previous_rip_addr = - previous_rsp - 8; // the saved return address is 8 bytes ahead of the - // previous stack pointer + u64 previous_rip_addr = previous_rsp - 8; // the saved return address is 8 bytes ahead of the previous stack pointer u64 previous_rip = 0; - int err = bpf_probe_read_user( - &previous_rip, 8, - (void *)(previous_rip_addr)); // 8 bytes, a whole word - // in a 64 bits machine + int err = bpf_probe_read_user(&previous_rip, 8, (void *)(previous_rip_addr)); // 8 bytes, a whole word in a 64 bits machine if (previous_rip == 0) { + int user_pid = pid_tgid; + process_info_t *proc_info = bpf_map_lookup_elem(&process_info, &user_pid); + if (proc_info == NULL) { + bpf_printk("[error] should never happen"); + return 1; + } + + if (proc_info->is_jit_compiler) { + bpf_printk("[info] rip=0, Section not added, yet"); + BUMP_UNWIND_JIT_ERRORS(); + return 1; + } + bpf_printk("[error] previous_rip should not be zero. This can mean that " "the read failed, ret=%d while reading @ %llx.", err, previous_rip_addr); @@ -588,12 +705,10 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { previous_rbp = unwind_state->bp; } else { u64 previous_rbp_addr = previous_rsp + found_rbp_offset; - bpf_printk("\t(bp_offset: %d, bp value stored at %llx)", found_rbp_offset, - previous_rbp_addr); - int ret = bpf_probe_read_user( - &previous_rbp, 8, - (void *)(previous_rbp_addr)); // 8 bytes, a whole word in a 64 bits - // machine + bpf_printk("\t(bp_offset: %d, bp value stored at %llx)", found_rbp_offset, previous_rbp_addr); + int ret = bpf_probe_read_user(&previous_rbp, 8, + (void *)(previous_rbp_addr)); // 8 bytes, a whole word in a 64 bits + // machine if (ret != 0) { bpf_printk("[error] previous_rbp should not be zero. 
This can mean " @@ -630,23 +745,30 @@ int walk_user_stacktrace_impl(struct bpf_perf_event_data *ctx) { // https://refspecs.linuxbase.org/elf/x86_64-abi-0.99.pdf if (unwind_state->bp == 0) { bpf_printk("======= reached main! ======="); - add_stacks(ctx, pid_tgid, STACK_WALKING_METHOD_DWARF, unwind_state); + add_stack(ctx, pid_tgid, STACK_WALKING_METHOD_DWARF, unwind_state); BUMP_UNWIND_SUCCESS(); bpf_printk("yesssss :)"); } else { - // TODO(javierhonduco): The current code doesn't have good support for - // JIT'ed code, this is something that will be worked on in future - // iterations. - bpf_printk("[error] Could not find unwind table and rbp != 0 (%llx). " - "JIT'ed / bug?", - unwind_state->bp); + + int user_pid = pid_tgid; + process_info_t *proc_info = bpf_map_lookup_elem(&process_info, &user_pid); + if (proc_info == NULL) { + bpf_printk("[error] should never happen"); + return 1; + } + + if (proc_info->is_jit_compiler) { + bpf_printk("[info] Section not added, yet"); + BUMP_UNWIND_JIT_ERRORS(); + return 1; + } + + bpf_printk("[error] Could not find unwind table and rbp != 0 (%llx) bug?", unwind_state->bp); BUMP_UNWIND_SHOULD_NEVER_HAPPEN_ERROR(); } return 0; - } else if (unwind_state->stack.len < MAX_STACK_DEPTH && - unwind_state->tail_calls < MAX_TAIL_CALLS) { - bpf_printk("Continuing walking the stack in a tail call, current tail %d", - unwind_state->tail_calls); + } else if (unwind_state->stack.len < MAX_STACK_DEPTH && unwind_state->tail_calls < MAX_TAIL_CALLS) { + bpf_printk("Continuing walking the stack in a tail call, current tail %d", unwind_state->tail_calls); unwind_state->tail_calls++; bpf_tail_call(ctx, &programs, 0); } @@ -676,8 +798,7 @@ static __always_inline void set_initial_state(bpf_user_pt_regs_t *regs) { unwind_state->tail_calls = 0; } -static __always_inline int -walk_user_stacktrace(struct bpf_perf_event_data *ctx) { +static __always_inline int walk_user_stacktrace(struct bpf_perf_event_data *ctx) { bump_samples(); @@ -700,7 +821,8 @@ int profile_cpu(struct bpf_perf_event_data *ctx) { return 0; if (config.debug) { - bpf_printk("debug mode enabled, make sure you specified process name"); + // very noisy + // bpf_printk("debug mode enabled, make sure you specified process name"); if (!is_debug_enabled_for_pid(user_tgid)) return 0; } @@ -709,33 +831,16 @@ int profile_cpu(struct bpf_perf_event_data *ctx) { // Check if the process is eligible for the unwind table or frame pointer // unwinders. if (!has_unwind_info) { - add_stacks(ctx, pid_tgid, STACK_WALKING_METHOD_FP, NULL); + add_stack(ctx, pid_tgid, STACK_WALKING_METHOD_FP, NULL); } else { - stack_unwind_table_t *unwind_table = - find_unwind_table(user_pid, ctx->regs.ip); - if (unwind_table == NULL) { - bpf_printk("IP not covered. In kernel space / bug? IP %llx)", - ctx->regs.ip); + shard_info_t *shard = NULL; + find_unwind_table(&shard, user_pid, ctx->regs.ip, NULL); + if (shard == NULL) { + bpf_printk("IP not covered. In kernel space / bug? IP %llx)", ctx->regs.ip); BUMP_UNWIND_PC_NOT_COVERED_ERROR(); return 0; } - u64 last_idx = unwind_table->table_len - 1; - // Appease the verifier. - if (last_idx < 0 || last_idx >= MAX_UNWIND_TABLE_SIZE) { - bpf_printk("\t[error] this should never happen"); - BUMP_UNWIND_SHOULD_NEVER_HAPPEN_ERROR(); - return 0; - } - - // javierhonduco: Debug output to ensure that the maps are correctly - // populated by comparing it with the data - // we are writing. Remove later on. 
- show_row(unwind_table, 0); - show_row(unwind_table, 1); - show_row(unwind_table, 2); - show_row(unwind_table, last_idx); - bpf_printk("pid %d tgid %d", user_pid, user_tgid); walk_user_stacktrace(ctx); } @@ -744,7 +849,6 @@ int profile_cpu(struct bpf_perf_event_data *ctx) { } #define KBUILD_MODNAME "parca-agent" -volatile const char bpf_metadata_name[] SEC(".rodata") = - "parca-agent (https://github.com/parca-dev/parca-agent)"; +volatile const char bpf_metadata_name[] SEC(".rodata") = "parca-agent (https://github.com/parca-dev/parca-agent)"; unsigned int VERSION SEC("version") = 1; char LICENSE[] SEC("license") = "GPL"; diff --git a/go.mod b/go.mod index 8ebba72221..cf4735fcec 100644 --- a/go.mod +++ b/go.mod @@ -34,6 +34,7 @@ require ( github.com/rzajac/flexbuf v0.14.0 github.com/stretchr/testify v1.8.1 github.com/xyproto/ainur v1.3.0 + golang.org/x/exp v0.0.0-20221212164502-fae10dda9338 golang.org/x/sync v0.1.0 golang.org/x/sys v0.4.0 google.golang.org/grpc v1.52.0 @@ -140,7 +141,6 @@ require ( go.uber.org/atomic v1.10.0 // indirect go.uber.org/goleak v1.2.0 // indirect golang.org/x/crypto v0.1.0 // indirect - golang.org/x/exp v0.0.0-20221212164502-fae10dda9338 // indirect golang.org/x/net v0.4.0 // indirect golang.org/x/oauth2 v0.3.0 // indirect golang.org/x/term v0.3.0 // indirect diff --git a/pkg/profiler/cpu/cpu.go b/pkg/profiler/cpu/cpu.go index e5ac03b6eb..ec96083504 100644 --- a/pkg/profiler/cpu/cpu.go +++ b/pkg/profiler/cpu/cpu.go @@ -23,6 +23,7 @@ import ( "encoding/binary" "errors" "fmt" + "os" "regexp" "runtime" "strings" @@ -82,9 +83,8 @@ type CPU struct { debuginfoManager profiler.DebugInfoManager labelsManager profiler.LabelsManager - psMapCache profiler.ProcessMapCache - objFileCache profiler.ObjectFileCache - unwindTableBuilder *unwind.UnwindTableBuilder + psMapCache profiler.ProcessMapCache + objFileCache profiler.ObjectFileCache metrics *metrics @@ -131,9 +131,8 @@ func NewCPUProfiler( processMappings: process.NewMapping(psMapCache), // Shared caches between all profilers. - psMapCache: psMapCache, - objFileCache: objFileCache, - unwindTableBuilder: unwind.NewUnwindTableBuilder(logger), + psMapCache: psMapCache, + objFileCache: objFileCache, profilingDuration: profilingDuration, profilingSamplingFrequency: profilingSamplingFrequency, @@ -189,6 +188,10 @@ func bpfCheck() error { return result.ErrorOrNil() } +func (p *CPU) debugProcesses() bool { + return len(p.debugProcessNames) > 0 +} + func (p *CPU) Run(ctx context.Context) error { level.Debug(p.logger).Log("msg", "starting cpu profiler") @@ -214,7 +217,7 @@ func (p *CPU) Run(ctx context.Context) error { level.Debug(p.logger).Log("msg", "actual memory locked rlimit", "cur", profiler.HumanizeRLimit(rLimit.Cur), "max", profiler.HumanizeRLimit(rLimit.Max)) var matchers []*regexp.Regexp - if len(p.debugProcessNames) > 0 { + if p.debugProcesses() { level.Info(p.logger).Log("msg", "process names specified, debugging processes", "matchers", strings.Join(p.debugProcessNames, ", ")) for _, exp := range p.debugProcessNames { regex, err := regexp.Compile(exp) @@ -315,17 +318,15 @@ func (p *CPU) Run(ctx context.Context) error { return fmt.Errorf("failed to create maps: %w", err) } - if debugEnabled { - pfs, err := procfs.NewDefaultFS() - if err != nil { - return fmt.Errorf("failed to create procfs: %w", err) - } - - level.Debug(p.logger).Log("msg", "debug process matchers found, starting process watcher") - // Update the debug pids map. 
- go p.watchProcesses(ctx, pfs, matchers) + pfs, err := procfs.NewDefaultFS() + if err != nil { + return fmt.Errorf("failed to create procfs: %w", err) } + level.Debug(p.logger).Log("msg", "debug process matchers found, starting process watcher") + // Update the debug pids map. + go p.watchProcesses(ctx, pfs, matchers) + ticker := time.NewTicker(p.profilingDuration) defer ticker.Stop() @@ -407,7 +408,8 @@ func (p *CPU) watchProcesses(ctx context.Context, pfs procfs.FS, matchers []*reg ticker := time.NewTicker(5 * time.Second) defer ticker.Stop() - unwindTableCache := cache.New(cache.WithExpireAfterWrite(20 * time.Minute)) + // @nocommit: cache on start_at + unwindTableCache := cache.New() for { select { @@ -415,72 +417,163 @@ func (p *CPU) watchProcesses(ctx context.Context, pfs procfs.FS, matchers []*reg return case <-ticker.C: } - - procs, err := pfs.AllProcs() + allProcs, err := pfs.AllProcs() if err != nil { level.Error(p.logger).Log("msg", "failed to list processes", "err", err) - continue + return } pids := []int{} - for _, proc := range procs { - comm, err := proc.Comm() - if err != nil { - level.Error(p.logger).Log("msg", "failed to get process name", "err", err) - continue - } + if p.debugProcesses() { + for _, proc := range allProcs { + comm, err := proc.Comm() + if err != nil { + level.Error(p.logger).Log("msg", "failed to get process name", "err", err) + continue + } - if comm == "" { - continue - } + if comm == "" { + continue + } - for _, m := range matchers { - if m.MatchString(comm) { - level.Info(p.logger).Log("msg", "match found; debugging process", "pid", proc.PID, "comm", comm) - pids = append(pids, proc.PID) + for _, m := range matchers { + if m.MatchString(comm) { + level.Info(p.logger).Log("msg", "match found; debugging process", "pid", proc.PID, "comm", comm) + pids = append(pids, proc.PID) + } } } - } - if len(pids) > 0 { - level.Debug(p.logger).Log("msg", "updating debug pids map", "pids", fmt.Sprintf("%v", pids)) - // Only meant to be used for debugging, it is not safe to use in production. - if err := p.bpfMaps.setDebugPIDs(pids); err != nil { - level.Warn(p.logger).Log("msg", "failed to update debug pids map", "err", err) + if len(pids) > 0 { + level.Debug(p.logger).Log("msg", "updating debug pids map", "pids", fmt.Sprintf("%v", pids)) + // Only meant to be used for debugging, it is not safe to use in production. + if err := p.bpfMaps.setDebugPIDs(pids); err != nil { + level.Error(p.logger).Log("msg", "failed to update debug pids map", "err", err) + } + } else { + level.Debug(p.logger).Log("msg", "no processes matched the provided regex") + if err := p.bpfMaps.setDebugPIDs(nil); err != nil { + level.Error(p.logger).Log("msg", "failed to update debug pids map", "err", err) + } } } else { - level.Debug(p.logger).Log("msg", "no processes matched the provided regex") - if err := p.bpfMaps.setDebugPIDs(nil); err != nil { - level.Warn(p.logger).Log("msg", "failed to update debug pids map", "err", err) + for _, proc := range allProcs { + pids = append(pids, proc.PID) } - continue } - // Can only be enabled when a debug process name is specified. + fmt.Println("=========== about to call enableDWARFUnwinding") + + count := 0 if p.enableDWARFUnwinding { // Update unwind tables for the given pids. for _, pid := range pids { if _, exists := unwindTableCache.GetIfPresent(pid); exists { + // TODO(javierhonduco): Expire cache on pid recycling or mappings changes. 
+ fmt.Println("already cached") continue } - level.Info(p.logger).Log("msg", "adding unwind tables", "pid", pid) - pt, err := p.unwindTableBuilder.UnwindTableForPid(pid) + executable := fmt.Sprintf("/proc/%d/exe", pid) + hasFramePointers, err := unwind.HasFramePointers(executable) if err != nil { - level.Warn(p.logger).Log("msg", "failed to build unwind table", "pid", pid, "err", err) + // It may not exist as reading procfs is racy. + if !errors.Is(err, os.ErrNotExist) { + level.Error(p.logger).Log("msg", "frame pointer detection failed", "executable", executable, "err", err) + continue + } + fmt.Println("HasFramePointers failed") + } + + if hasFramePointers { + fmt.Println("skipping", executable, "has fp") continue } - if err := p.bpfMaps.setUnwindTable(pid, pt); err != nil { - level.Warn(p.logger).Log("msg", "failed to update unwind tables", "pid", pid, "err", err) + level.Info(p.logger).Log("msg", "adding unwind tables", "pid", pid) + + err = p.addUnwindTableForProcess(pid) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + level.Debug(p.logger).Log("msg", "failed to add unwind table", "pid", pid, "err", err) + } else { + level.Error(p.logger).Log("msg", "failed to add unwind table", "pid", pid, "err", err) + } continue } + unwindTableCache.Put(pid, struct{}{}) + count++ + } + + // Must be called after calling `addUnwindTableForProcess`, as it's possible + // that the current in-memory unwind table shard hasn't been written to the + // map. + // TODO: have a dirty flag. + err := p.bpfMaps.PersistUnwindTable() + if err != nil { + panic(err) } } } } +// 1. Find executable sections +// 2. For each section, generate compact table +// 3. Add table to maps +// 4. Add map metadata to process +// +// @nocommit: later on, table caching +func (p *CPU) addUnwindTableForProcess(pid int) error { + proc, err := procfs.NewProc(pid) + if err != nil { + return err + } + + mappings, err := proc.ProcMaps() + if err != nil { + return err + } + + executableMappings := unwind.ListExecutableMappings(mappings) + procInfoBuf := new(bytes.Buffer) + // Important: this has to be called before addUnwindTableForProcessMapping + // .is_jit_compiler + var isJitCompiler uint64 + if executableMappings.HasJitted() { + isJitCompiler = 1 + } + if err := binary.Write(procInfoBuf, p.bpfMaps.byteOrder, isJitCompiler); err != nil { // @nocommit + panic(fmt.Errorf("write proc_info .is_jit_compiler bytes: %w", err)) + } + + // .len + if err := binary.Write(procInfoBuf, p.bpfMaps.byteOrder, uint64(len(executableMappings))); err != nil { // @nocommit + panic(fmt.Errorf("write proc_info .len bytes: %w", err)) + } + + for _, executableMapping := range executableMappings { + err = p.addUnwindTableForProcessMapping(pid, executableMapping, procInfoBuf) + if err != nil { + panic(fmt.Errorf("calling addUnwindTableForProcessMapping: %w", err)) + } + } + + if err := p.bpfMaps.processInfo.Update(unsafe.Pointer(&pid), unsafe.Pointer(&procInfoBuf.Bytes()[0])); err != nil { + panic(fmt.Errorf("update processInfo: %w", err)) + } + + return nil +} + +func (p *CPU) addUnwindTableForProcessMapping(pid int, executableMappings *unwind.ExecutableMapping, procInfoBuf *bytes.Buffer) error { + if err := p.bpfMaps.setUnwindTable(pid, executableMappings, procInfoBuf); err != nil { + panic(fmt.Errorf("setUnwindTable: %w", err)) + } + + return nil +} + func (p *CPU) report(lastError error, processLastErrors map[int]error) { p.mtx.Lock() defer p.mtx.Unlock() diff --git a/pkg/profiler/cpu/maps.go b/pkg/profiler/cpu/maps.go index 75e4398ce0..f410017f2b 
100644 --- a/pkg/profiler/cpu/maps.go +++ b/pkg/profiler/cpu/maps.go @@ -18,13 +18,20 @@ import "C" import ( "bytes" + "debug/elf" "encoding/binary" "errors" "fmt" + "os" + "path" + "sort" + "time" "unsafe" - "github.com/parca-dev/parca-agent/internal/dwarf/frame" + "github.com/parca-dev/parca-agent/pkg/buildid" + "github.com/parca-dev/parca-agent/pkg/executable" "github.com/parca-dev/parca-agent/pkg/stack/unwind" + "golang.org/x/exp/constraints" bpf "github.com/aquasecurity/libbpfgo" ) @@ -33,33 +40,17 @@ const ( debugPIDsMapName = "debug_pids" stackCountsMapName = "stack_counts" stackTracesMapName = "stack_traces" + unwindShardsMapName = "unwind_shards" dwarfStackTracesMapName = "dwarf_stack_traces" unwindTablesMapName = "unwind_tables" + processInfoMapName = "process_info" programsMapName = "programs" - // With the current row structure, the max items we can store is 262k per map. - unwindTableMaxEntries = 100 + // With the current row structure, the max items we can store is 262k per map, we rounded + // it down to 250k. + unwindTableMaxEntries = 50 // How many shards we have. maxUnwindTableSize = 250 * 1000 // Always needs to be sync with MAX_UNWIND_TABLE_SIZE in the BPF program. - unwindTableShardCount = 6 // Always needs to be sync with MAX_SHARDS in the BPF program. - maxUnwindSize = maxUnwindTableSize * unwindTableShardCount -) - -type BpfCfaType uint16 - -const ( - CfaRegisterUndefined BpfCfaType = iota - CfaRegisterRbp - CfaRegisterRsp - CfaRegisterExpression -) - -type BpfRbpType uint16 - -const ( - RbpRuleOffsetUnchanged BpfRbpType = iota - RbpRuleOffset - RbpRuleRegister - RbpRegisterExpression + maxUnwindSize = maxUnwindTableSize * unwindTableMaxEntries ) var ( @@ -77,9 +68,33 @@ type bpfMaps struct { stackCounts *bpf.BPFMap stackTraces *bpf.BPFMap dwarfStackTraces *bpf.BPFMap + processInfo *bpf.BPFMap + unwindShards *bpf.BPFMap unwindTables *bpf.BPFMap programs *bpf.BPFMap + + // unwind stuff 🔬 + buildIdMapping map[string]uint64 + // globalView []{shard_id:, [all the ranges it contains]} + // which shard we are on + shardIndex uint64 + executableId uint64 + unwindInfoBuf *bytes.Buffer + // Account where we are within a shard + lowIndex int + highIndex int + // Other stats + totalEntries uint64 + uniqueMappings uint64 + referencedMappings uint64 +} + +func min[T constraints.Ordered](a, b T) T { + if a < b { + return a + } + return b } func initializeMaps(m *bpf.Module, byteOrder binary.ByteOrder) (*bpfMaps, error) { @@ -87,9 +102,13 @@ func initializeMaps(m *bpf.Module, byteOrder binary.ByteOrder) (*bpfMaps, error) return nil, fmt.Errorf("nil module") } + unwindInfoArray := make([]byte, 0, maxUnwindTableSize) + maps := &bpfMaps{ - module: m, - byteOrder: byteOrder, + module: m, + byteOrder: byteOrder, + unwindInfoBuf: bytes.NewBuffer(unwindInfoArray), + buildIdMapping: make(map[string]uint64), } return maps, nil @@ -132,6 +151,11 @@ func (m *bpfMaps) create() error { return fmt.Errorf("get stack traces map: %w", err) } + unwindShards, err := m.module.GetMap(unwindShardsMapName) + if err != nil { + return fmt.Errorf("get unwind shards map: %w", err) + } + unwindTables, err := m.module.GetMap(unwindTablesMapName) if err != nil { return fmt.Errorf("get unwind tables map: %w", err) @@ -142,11 +166,19 @@ func (m *bpfMaps) create() error { return fmt.Errorf("get dwarf stack traces map: %w", err) } + processInfo, err := m.module.GetMap(processInfoMapName) + if err != nil { + return fmt.Errorf("get process info map: %w", err) + } + m.debugPIDs = debugPIDs m.stackCounts = stackCounts 
m.stackTraces = stackTraces + m.unwindShards = unwindShards m.unwindTables = unwindTables m.dwarfStackTraces = dwarfStackTraces + m.processInfo = processInfo + return nil } @@ -222,6 +254,16 @@ func (m *bpfMaps) readUserStackWithDwarf(userStackID int32, stack *combinedStack return fmt.Errorf("read user stack bytes, %s: %w", err, errUnrecoverable) } + /* userStack := stack[:stackDepth] + for i := 0; i < stackDepth; i++ { + if i < int(dwarfStack.Len) { + userStack[i] = dwarfStack.Addrs[i] + fmt.Printf("frame: %x\n", dwarfStack.Addrs[i]) + } else { + userStack[i] = 0 + } + } */ + userStack := stack[:stackDepth] for i, addr := range dwarfStack.Addrs { if i >= stackDepth || i >= int(dwarfStack.Len) || addr == 0 { @@ -310,147 +352,382 @@ func (m *bpfMaps) clean() error { return nil } -// setUnwindTable updates the unwind tables with the given unwind table. -func (m *bpfMaps) setUnwindTable(pid int, ut unwind.UnwindTable) error { - buf := new(bytes.Buffer) +func (m *bpfMaps) generateCompactUnwindTable(fullExecutablePath string, mapping *unwind.ExecutableMapping) (unwind.CompactUnwindTable, uint64, uint64, error) { + var minCoveredPc uint64 + var maxCoveredPc uint64 + var ut unwind.CompactUnwindTable + + // 1. Get FDEs + fdes, err := unwind.ReadFDEs(fullExecutablePath) // @nocommit: this should accept an ELF file perhaps. + if err != nil { + return ut, 0, 0, err + } + + sort.Sort(fdes) // hope this helps with efficiency, too + minCoveredPc = fdes[0].Begin() + maxCoveredPc = fdes[len(fdes)-1].End() + + // 2. Build unwind table + // 3. Get the compact, BPF-friendly representation + ut, err = unwind.BuildCompactUnwindTable(fdes) + if err != nil { + return ut, 0, 0, err + } + sort.Sort(ut) // 2.5 Sort @nocommit: perhaps sorting the BPF friendly one will be faster + + // now we have a full compact unwind table that we have to split across different BPF maps. + fmt.Println("=> found", len(ut), "unwind entries for", mapping.Executable, "low pc", fmt.Sprintf("%x", minCoveredPc), "high pc", fmt.Sprintf("%x", maxCoveredPc)) // @nocommit: remove + + return ut, minCoveredPc, maxCoveredPc, nil +} + +// writeUnwindTableRow writes a compact unwind table row to the provided buffer. +// +// Note: we write field by field to avoid extra allocations and to skip the +// reflection code paths in `binary.Write`, which also saves CPU time.
+func (m *bpfMaps) writeUnwindTableRow(buffer *bytes.Buffer, row unwind.CompactUnwindTableRow) error { + // .pc + if err := binary.Write(buffer, m.byteOrder, row.Pc()); err != nil { + return fmt.Errorf("write unwind table .pc bytes: %w", err) + } + + // .__reserved_do_not_use + if err := binary.Write(buffer, m.byteOrder, row.ReservedDoNotUse()); err != nil { + return fmt.Errorf("write unwind table __reserved_do_not_use bytes: %w", err) + } + + // .cfa_type + if err := binary.Write(buffer, m.byteOrder, row.CfaType()); err != nil { + return fmt.Errorf("write unwind table cfa_type bytes: %w", err) + } + + // .rbp_type + if err := binary.Write(buffer, m.byteOrder, row.RbpType()); err != nil { + return fmt.Errorf("write unwind table rbp_type bytes: %w", err) + } + + // .cfa_offset + if err := binary.Write(buffer, m.byteOrder, row.CfaOffset()); err != nil { + return fmt.Errorf("write unwind table cfa_offset bytes: %w", err) + } + + // .rbp_offset + if err := binary.Write(buffer, m.byteOrder, row.RbpOffset()); err != nil { + return fmt.Errorf("write unwind table rbp_offset bytes: %w", err) + } + + return nil +} + +// writeMapping writes the memory mapping information to the provided buffer. +// +// Note: we write field by field to avoid extra allocations and to skip the +// reflection code paths in `binary.Write`, which also saves CPU time. +func (m *bpfMaps) writeMapping(procInfoBuf *bytes.Buffer, loadAddress uint64, startAddr uint64, endAddr uint64, executableId uint64, type_ uint64) error { + // .load_address + if err := binary.Write(procInfoBuf, m.byteOrder, loadAddress); err != nil { + return fmt.Errorf("write mappings .load_address bytes: %w", err) + } + // .begin + if err := binary.Write(procInfoBuf, m.byteOrder, startAddr); err != nil { + return fmt.Errorf("write mappings .begin bytes: %w", err) + } + // .end + if err := binary.Write(procInfoBuf, m.byteOrder, endAddr); err != nil { + return fmt.Errorf("write mappings .end bytes: %w", err) + } + // .executable_id + if err := binary.Write(procInfoBuf, m.byteOrder, executableId); err != nil { + return fmt.Errorf("write proc info .executable_id bytes: %w", err) + } + // .type + if err := binary.Write(procInfoBuf, m.byteOrder, type_); err != nil { + return fmt.Errorf("write proc info .type bytes: %w", err) + } + + return nil +} - if len(ut) >= maxUnwindSize { - return fmt.Errorf("maximum unwind table size reached. Table size %d, but max size is %d", len(ut), maxUnwindSize) +// mappingId returns the internal identifier for a memory mapping. +// It will either return the already produced ID or generate a new +// one while indicating whether it was already seen or not. +// +// This allows us to reuse the unwind tables for the mappings we +// are dealing with. +func (m *bpfMaps) mappingId(buildId string) (uint64, bool) { + _, alreadySeenMapping := m.buildIdMapping[buildId] + if alreadySeenMapping { + fmt.Println("-> caching - seen this mapping before") + m.referencedMappings += 1 + } else { + fmt.Println("-> caching - new mapping") + + m.buildIdMapping[buildId] = m.executableId } - // Range-partition the unwind table in the different shards. - shardIndex := 0 - for i := 0; i < len(ut); i += maxUnwindTableSize { - upTo := i + maxUnwindTableSize - if upTo > len(ut) { - upTo = len(ut) + return m.buildIdMapping[buildId], alreadySeenMapping +} + +// PersistUnwindTable writes the current in-flight, writable shard +// to the corresponding BPF map's shard.
+// +// Note: as of now, this must be called in two situations: +// - In the callsite, once we are done with generating the unwind +// tables. +// - Whenever the current in-flight shard is full, before we wipe +// it and start reusing it. +func (m *bpfMaps) PersistUnwindTable() error { + totalRows := m.unwindInfoBuf.Len() / 16 + fmt.Println("unwind rows", totalRows) + shardIndex := uint64(m.shardIndex) + var err error + for i := 0; i < 100; i++ { + err = m.unwindTables.Update(unsafe.Pointer(&shardIndex), unsafe.Pointer(&m.unwindInfoBuf.Bytes()[0])) + if err == nil { + fmt.Println("~~ worked, rows:", totalRows, "try:", i) + return nil + } else { + fmt.Println("~~ failed:", err, "rows:", totalRows, "try:", i) + time.Sleep(100 * time.Millisecond) } + } + + return fmt.Errorf("update unwind tables: %w", err) +} - chunk := ut[i:upTo] +// availableEntries returns how many entries we have left +// in the in-flight shard. +func (m *bpfMaps) availableEntries() int { + return maxUnwindTableSize - m.highIndex +} - // Write `.low_pc` - if err := binary.Write(buf, m.byteOrder, chunk[0].Loc); err != nil { - return fmt.Errorf("write the number of rows: %w", err) +// assertInvariants checks that some invariants that should +// always be true during the execution of the program are held. +func (m *bpfMaps) assertInvariants() { + if m.lowIndex < 0 { + panic("m.lowIndex < 0, this should never happen") + } + if m.highIndex > maxUnwindTableSize { + panic("m.highIndex > 250k, this should never happen") + } +} + +// setUnwindTable sets all the necessary metadata and unwind tables, if needed +// to make DWARF unwinding work, such as: +// +// - Continue appending information to the executable mapping information for a process +// - Add mapping information +// - If unwind table is already present, we are done here +// - Otherwise, we generate the unwind table for this executable +func (m *bpfMaps) setUnwindTable(pid int, mapping *unwind.ExecutableMapping, procInfoBuf *bytes.Buffer) error { + fmt.Println("========================================================================================") + fmt.Println("setUnwindTable called (total shards:", m.shardIndex, ", total entries:", m.totalEntries, ")") + fmt.Println("========================================================================================") + + // Deal with mappings that are not file-backed. They don't have unwind + // information. + if mapping.IsNotFileBacked() { + var type_ uint64 + if mapping.IsJitted() { + fmt.Println("JIT section") + type_ = 1 } - // Write `.high_pc`. - if err := binary.Write(buf, m.byteOrder, chunk[len(chunk)-1].Loc); err != nil { - return fmt.Errorf("write the number of rows: %w", err) + if mapping.IsSpecial() { + fmt.Println("Special section") + type_ = 2 } - // Write number of rows `.table_len`. - if err := binary.Write(buf, m.byteOrder, uint64(len(chunk))); err != nil { - return fmt.Errorf("write the number of rows: %w", err) + + err := m.writeMapping(procInfoBuf, mapping.LoadAddr, mapping.StartAddr, mapping.EndAddr, uint64(0), type_) + if err != nil { + return fmt.Errorf("writing mappings failed with %w", err) + } + return nil + } + + // Deal with mappings that are backed by a file and might contain unwind + // information. + fullExecutablePath := path.Join("/proc/", fmt.Sprintf("%d", pid), "/root/", mapping.Executable) + + elfFile, err := elf.Open(fullExecutablePath) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil } - // Write padding.
- if err := binary.Write(buf, m.byteOrder, uint64(0)); err != nil { - return fmt.Errorf("write the number of rows: %w", err) + return fmt.Errorf("elf.Open failed: %w", err) + } + buildId, err := buildid.BuildID(&buildid.ElfFile{File: elfFile, Path: fullExecutablePath}) + if err != nil { + return fmt.Errorf("BuildID failed %s: %w", fullExecutablePath, err) + } + + // Find the adjusted load address. + aslrElegible := executable.IsASLRElegibleElf(elfFile) + + adjustedLoadAddress := uint64(0) + if mapping.IsMainObject() { + fmt.Println("!!!!!!! main object", mapping) + if aslrElegible { + adjustedLoadAddress = mapping.LoadAddr } - for _, row := range chunk { - // Right now we only support x86_64, where the return address position - // is specified in the ABI, so we don't write it. + } else { + adjustedLoadAddress = mapping.LoadAddr + } + + fmt.Println("[info] adding memory mappings for executable with ID", m.executableId, "buildId", buildId, "exec", mapping.Executable) + + // Add the memory mapping information. + foundExecutableId, mappingAlreadySeen := m.mappingId(buildId) + + err = m.writeMapping(procInfoBuf, adjustedLoadAddress, mapping.StartAddr, mapping.EndAddr, uint64(foundExecutableId), uint64(0)) + if err != nil { + return fmt.Errorf("writing mappings failed with %w", err) + } + + // Generate and add the unwind table, if needed. + if !mappingAlreadySeen { + + unwindShardsKeyBuf := new(bytes.Buffer) + unwindShardsValBuf := new(bytes.Buffer) - // Write Program Counter (PC). - if err := binary.Write(buf, m.byteOrder, row.Loc); err != nil { - return fmt.Errorf("write the program counter: %w", err) + chunkIndex := 0 + + // ==================================== generate unwind table + + ut, minCoveredPc, maxCoveredPc, err := m.generateCompactUnwindTable(fullExecutablePath, mapping) + if err != nil { + if err == unwind.ErrNoFDEsFound { + // is it ok to return here? + return nil + } + if err == unwind.ErrEhFrameSectionNotFound { + // is it ok to return here? + return nil } + return nil + } + + threshold := min(len(ut), m.availableEntries()) + currentChunk := ut[:threshold] + restChunks := ut[threshold:] - // Write __reserved_do_not_use. - if err := binary.Write(buf, m.byteOrder, uint16(0)); err != nil { - return fmt.Errorf("write CFA register bytes: %w", err) + numShards := 1 + len(restChunks)/maxUnwindTableSize // @nocommit: verify this + + // .len + if err := binary.Write(unwindShardsValBuf, m.byteOrder, uint64(numShards)); err != nil { + return fmt.Errorf("write shards .len bytes: %w", err) + } + + for { + m.assertInvariants() + + fmt.Println("- current chunk size", len(currentChunk)) + fmt.Println("- rest of chunk size", len(restChunks)) + + m.totalEntries += uint64(len(currentChunk)) + + if len(currentChunk) == 0 { + fmt.Println("!! done with the last chunk") + break } - var CfaRegister uint8 - var RbpRegister uint8 - var CfaOffset int16 - var RbpOffset int16 - - // CFA.
- switch row.CFA.Rule { - case frame.RuleCFA: - if row.CFA.Reg == frame.X86_64FramePointer { - CfaRegister = uint8(CfaRegisterRbp) - } else if row.CFA.Reg == frame.X86_64StackPointer { - CfaRegister = uint8(CfaRegisterRsp) - } - CfaOffset = int16(row.CFA.Offset) - case frame.RuleExpression: - CfaRegister = uint8(CfaRegisterExpression) - CfaOffset = int16(unwind.ExpressionIdentifier(row.CFA.Expression)) + m.highIndex += len(currentChunk) + fmt.Println("- lowindex [", m.lowIndex, ":", m.highIndex, "] highIndex") + + // ======================== shard info =============================== + // Set (executable ID) -> unwind table shards info + // basically have the info - default: - return fmt.Errorf("CFA rule is not valid. This should never happen") + fmt.Println("- executable", m.executableId, "mapping", mapping.Executable, "shard", chunkIndex) + if err := binary.Write(unwindShardsKeyBuf, m.byteOrder, uint64(m.executableId)); err != nil { + return fmt.Errorf("write shards key bytes: %w", err) } - // Frame pointer. - switch row.RBP.Rule { - case frame.RuleUndefined: - case frame.RuleOffset: - RbpRegister = uint8(RbpRuleOffset) - RbpOffset = int16(row.RBP.Offset) - case frame.RuleRegister: - RbpRegister = uint8(RbpRuleRegister) - case frame.RuleExpression: - RbpRegister = uint8(RbpRegisterExpression) + // note this might not be correct if using the unwind table info for the first or last items + minPc := currentChunk[0].Pc() + if chunkIndex == 0 { + minPc = uint64(minCoveredPc) + } + // .low_pc + if err := binary.Write(unwindShardsValBuf, m.byteOrder, minPc); err != nil { + return fmt.Errorf("write shards .low_pc bytes: %w", err) } - // Write CFA type (.cfa_type). - if err := binary.Write(buf, m.byteOrder, CfaRegister); err != nil { - return fmt.Errorf("write CFA register bytes: %w", err) + // note this might not be correct if using the unwind table info for the first or last items + maxPc := currentChunk[len(currentChunk)-1].Pc() + if chunkIndex == numShards { + maxPc = uint64(maxCoveredPc) + } + // .high_pc + if err := binary.Write(unwindShardsValBuf, m.byteOrder, maxPc); err != nil { + return fmt.Errorf("write shards .high_pc bytes: %w", err) } - // Write frame pointer type (.rbp_type). - if err := binary.Write(buf, m.byteOrder, RbpRegister); err != nil { - return fmt.Errorf("write CFA register bytes: %w", err) + // .shard_index + if err := binary.Write(unwindShardsValBuf, m.byteOrder, uint64(m.shardIndex)); err != nil { + return fmt.Errorf("write shards .shard_index bytes: %w", err) } - // Write CFA offset (.cfa_offset). - if err := binary.Write(buf, m.byteOrder, CfaOffset); err != nil { - return fmt.Errorf("write CFA offset bytes: %w", err) + // .low_index + if err := binary.Write(unwindShardsValBuf, m.byteOrder, uint64(m.lowIndex)); err != nil { + return fmt.Errorf("write shards .low_index bytes: %w", err) + } + // .high_index + if err := binary.Write(unwindShardsValBuf, m.byteOrder, uint64(m.highIndex)); err != nil { + return fmt.Errorf("write shards .high_index bytes: %w", err) } - // Write frame pointer offset (.rbp_offset). - if err := binary.Write(buf, m.byteOrder, RbpOffset); err != nil { - return fmt.Errorf("write RBP offset bytes: %w", err) + m.lowIndex = m.highIndex // @nocommit this is wrong??? 
+ + // ====================== Write unwind table ===================== + for _, row := range currentChunk { + if err := m.writeUnwindTableRow(m.unwindInfoBuf, row); err != nil { + return fmt.Errorf("writing unwind table row: %w", err) + } } - } - // Set (PID, shard ID) -> unwind table for each shard. - keyBuf := new(bytes.Buffer) - if err := binary.Write(keyBuf, m.byteOrder, int32(pid)); err != nil { - return fmt.Errorf("write RBP offset bytes: %w", err) - } - if err := binary.Write(keyBuf, m.byteOrder, int32(shardIndex)); err != nil { - return fmt.Errorf("write RBP offset bytes: %w", err) - } + // Need a new shard? + if m.availableEntries() == 0 { + fmt.Println("run out of space in the 'live' shard, creating a new one") + err := m.PersistUnwindTable() + if err != nil { + return fmt.Errorf("failed to write unwind table: %w", err) + } + m.shardIndex++ + m.unwindInfoBuf.Reset() // @nocommit is it stored?? + m.lowIndex = 0 + m.highIndex = 0 + + if m.shardIndex == unwindTableMaxEntries { + fmt.Println(m.buildIdMapping) + fmt.Println("Not enough shards - this is not implemented but we should deal with this") + } + } + + // Recalculate for next iteration + threshold := min(len(restChunks), m.availableEntries()) + currentChunk = restChunks[:threshold] + restChunks = restChunks[threshold:] - if err := m.unwindTables.Update(unsafe.Pointer(&keyBuf.Bytes()[0]), unsafe.Pointer(&buf.Bytes()[0])); err != nil { - return fmt.Errorf("update unwind tables: %w", err) + chunkIndex++ } - shardIndex++ - buf.Reset() - } - - // HACK(javierhonduco): remove this. - // Debug stuff to compare this with the BPF program's view of the world. - /* printRow := func(w io.Writer, pt unwind.UnwindTable, index int) { - cfaInfo := "" - switch ut[index].CFA.Rule { - case frame.RuleCFA: - cfaInfo = fmt.Sprintf("CFA Reg: %d Offset:%d", ut[index].CFA.Reg, ut[index].CFA.Offset) - case frame.RuleExpression: - cfaInfo = "CFA exp" - default: - panic("CFA rule is not valid. This should never happen.") + + if err := m.unwindShards.Update( + unsafe.Pointer(&unwindShardsKeyBuf.Bytes()[0]), + unsafe.Pointer(&unwindShardsValBuf.Bytes()[0])); err != nil { + return fmt.Errorf("failed to update unwind shard: %w", err) } - fmt.Fprintf(w, "\trow[%d]. Loc: %x, %s, $rbp: %d\n", index, pt[index].Loc, cfaInfo, pt[index].RBP.Offset) + m.executableId++ + m.uniqueMappings++ } - fmt.Fprintf(os.Stdout, "\t- Total entries %d\n\n", len(ut)) - printRow(os.Stdout, ut, 0) - printRow(os.Stdout, ut, 1) - printRow(os.Stdout, ut, 2) - printRow(os.Stdout, ut, 6) - printRow(os.Stdout, ut, len(ut)-1) */ + m.assertInvariants() + + // @nocommit NO SPACE LEFT + if m.availableEntries() == 0 { + panic("no space left, this should never happen") + } + // @nocommit TODO: check if we are full and flush if that's the case return nil } diff --git a/pkg/stack/unwind/unwind_table.go b/pkg/stack/unwind/unwind_table.go index 4b1b27e1b5..42941dcbbb 100644 --- a/pkg/stack/unwind/unwind_table.go +++ b/pkg/stack/unwind/unwind_table.go @@ -16,26 +16,22 @@ package unwind import ( "debug/elf" + "errors" "fmt" "io" - "path" - "sort" - "strings" "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/hashicorp/go-multierror" - "github.com/prometheus/procfs" "github.com/parca-dev/parca-agent/internal/dwarf/frame" - "github.com/parca-dev/parca-agent/pkg/executable" ) -// UnwindTableBuilder helps to build UnwindTable for a given PID. -// -// javierhonduco(note): Caching on PID alone will result in hard to debug issues as -// PIDs are reused. 
Right now we will parse the CIEs and FDEs over and over. Caching -// will be added later on. +var ( + ErrNoFDEsFound = errors.New("no FDEs found") + ErrEhFrameSectionNotFound = errors.New("failed to find .eh_frame section") +) + type UnwindTableBuilder struct { logger log.Logger } @@ -44,126 +40,6 @@ func NewUnwindTableBuilder(logger log.Logger) *UnwindTableBuilder { return &UnwindTableBuilder{logger: logger} } -type UnwindTable []UnwindTableRow - -func (t UnwindTable) Len() int { return len(t) } -func (t UnwindTable) Less(i, j int) bool { return t[i].Loc < t[j].Loc } -func (t UnwindTable) Swap(i, j int) { t[i], t[j] = t[j], t[i] } - -// TODO(kakkoyun): Unify with existing process maps mechanisms. -// - pkg/process/mappings.go -// The rest of the code base share a cache for process maps. - -// processMaps returns a map of file-backed memory mappings for a given -// process which contains at least one executable section. The value of -// mapping contains the metadata for the first mapping for each file, no -// matter if it's executable or not. -// -// This is needed as typically the first mapped section for a dynamic library -// is not executable, as it may contain only data, such as the `.bss` or the -// `.rodata` section. -func processMaps(pid int) (map[string]*procfs.ProcMap, string, error) { - p, err := procfs.NewProc(pid) - if err != nil { - return nil, "", fmt.Errorf("could not get process: %w", err) - } - maps, err := p.ProcMaps() - if err != nil { - return nil, "", fmt.Errorf("could not get maps: %w", err) - } - - // Find the file-backed memory mappings that contain at least one - // executable section. - filesWithSomeExecutable := make(map[string]bool) - for _, map_ := range maps { - if map_.Pathname != "" && map_.Perms.Execute { - filesWithSomeExecutable[map_.Pathname] = true - } - } - - dynamicExecutables := make(map[string]*procfs.ProcMap) - mainExecutable := "" - - // Find all the dynamically loaded libraries. We need to make sure - // that we skip the files that do not have a single executable mapping - // as these are just data. - for _, map_ := range maps { - path := map_.Pathname - if path == "" { - continue - } - if !strings.HasPrefix(path, "/") { - continue - } - // The first entry should be the "main" executable, and not - // a dynamic library. - if mainExecutable == "" { - mainExecutable = map_.Pathname - } - _, ok := dynamicExecutables[path] - if ok { - continue - } - - _, ok = filesWithSomeExecutable[path] - if ok { - dynamicExecutables[path] = map_ - } - } - - return dynamicExecutables, mainExecutable, nil -} - -func (ptb *UnwindTableBuilder) UnwindTableForPid(pid int) (UnwindTable, error) { - mappedFiles, mainExec, err := processMaps(pid) - if err != nil { - return nil, fmt.Errorf("error opening the maps %w", err) - } - - ut := UnwindTable{} - for _, m := range mappedFiles { - executablePath := path.Join(fmt.Sprintf("/proc/%d/root", pid), m.Pathname) - - level.Info(ptb.logger).Log("msg", "finding tables for mapped executable", "path", executablePath, "starting address", fmt.Sprintf("%x", m.StartAddr)) - fdes, err := ptb.readFDEs(executablePath) - // TODO(javierhonduco): Add markers in between executable sections. 
- if err != nil { - level.Error(ptb.logger).Log("msg", "failed to read frame description entries", "obj", executablePath, "err", err) - continue - } - - rows := ptb.buildUnwindTable(fdes) - if len(rows) == 0 { - level.Error(ptb.logger).Log("msg", "unwind table empty for", "obj", executablePath) - continue - } - - level.Info(ptb.logger).Log("msg", "adding tables for mapped executable", "path", executablePath, "rows", len(rows), "low pc", fmt.Sprintf("%x", rows[0].Loc), "high pc", fmt.Sprintf("%x", rows[len(rows)-1].Loc)) - - aslrElegible, err := executable.IsASLRElegible(executablePath) - if err != nil { - return nil, fmt.Errorf("ASLR check failed with with: %w", err) - } - - if strings.Contains(executablePath, mainExec) { - if aslrElegible { - for i := range rows { - rows[i].Loc += uint64(m.StartAddr) - } - } - } else { - for i := range rows { - rows[i].Loc += uint64(m.StartAddr) - } - } - ut = append(ut, rows...) - } - - // Sort the entries so we can binary search over them. - sort.Sort(ut) - return ut, nil -} - func x64RegisterToString(reg uint64) string { // TODO(javierhonduco): // - add source for this table. @@ -183,7 +59,7 @@ func x64RegisterToString(reg uint64) string { // PrintTable is a debugging helper that prints the unwinding table to the given io.Writer. func (ptb *UnwindTableBuilder) PrintTable(writer io.Writer, path string, compact bool) error { - fdes, err := ptb.readFDEs(path) + fdes, err := ReadFDEs(path) if err != nil { return err } @@ -261,7 +137,7 @@ func (ptb *UnwindTableBuilder) PrintTable(writer io.Writer, path string, compact return nil } -func (ptb *UnwindTableBuilder) readFDEs(path string) (frame.FrameDescriptionEntries, error) { +func ReadFDEs(path string) (frame.FrameDescriptionEntries, error) { obj, err := elf.Open(path) if err != nil { return nil, fmt.Errorf("failed to open elf: %w", err) @@ -270,7 +146,7 @@ func (ptb *UnwindTableBuilder) readFDEs(path string) (frame.FrameDescriptionEntr sec := obj.Section(".eh_frame") if sec == nil { - return nil, fmt.Errorf("failed to find .eh_frame section") + return nil, ErrEhFrameSectionNotFound } // TODO(kakkoyun): Consider using the debug_frame section as a fallback. @@ -286,18 +162,22 @@ func (ptb *UnwindTableBuilder) readFDEs(path string) (frame.FrameDescriptionEntr return nil, fmt.Errorf("failed to parse frame data: %w", err) } + if len(fdes) == 0 { + return nil, ErrNoFDEsFound + } + return fdes, nil } -func (ptb *UnwindTableBuilder) buildUnwindTable(fdes frame.FrameDescriptionEntries) UnwindTable { +func BuildUnwindTable(fdes frame.FrameDescriptionEntries) UnwindTable { // The frame package can raise in case of malformed unwind data. 
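+ // A panic while evaluating the DWARF programs of a single executable should not
+ // bring down the whole agent, hence the recover below.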
+ table := make(UnwindTable, 0, 4*len(fdes)) // heuristic
 defer func() {
 if r := recover(); r != nil {
- level.Info(ptb.logger).Log("msg", "recovered a panic in buildUnwindTable", "stack", r)
+ // TODO: log the recovered panic (previously via level.Info) once a logger is available here.
 }
 }()

- table := make(UnwindTable, 0)
 for _, fde := range fdes {
 frameContext := frame.ExecuteDwarfProgram(fde, nil)
 for insCtx := frameContext.Next(); frameContext.HasNext(); insCtx = frameContext.Next() {
@@ -323,6 +203,12 @@ type UnwindTableRow struct {
 RA frame.DWRule
 }

+type UnwindTable []UnwindTableRow
+
+func (t UnwindTable) Len() int { return len(t) }
+func (t UnwindTable) Less(i, j int) bool { return t[i].Loc < t[j].Loc }
+func (t UnwindTable) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
+
 func unwindTableRow(instructionContext *frame.InstructionContext) *UnwindTableRow {
 if instructionContext == nil {
 return nil
diff --git a/pkg/stack/unwind/unwind_table_test.go b/pkg/stack/unwind/unwind_table_test.go
index 80fb513b44..4a1a8f064d 100644
--- a/pkg/stack/unwind/unwind_table_test.go
+++ b/pkg/stack/unwind/unwind_table_test.go
@@ -17,20 +17,16 @@ package unwind
 import (
 "testing"

- "github.com/go-kit/log"
 "github.com/stretchr/testify/require"

 "github.com/parca-dev/parca-agent/internal/dwarf/frame"
 )

 func TestBuildUnwindTable(t *testing.T) {
- logger := log.NewNopLogger()
- utb := NewUnwindTableBuilder(logger)
-
- fdes, err := utb.readFDEs("../../../testdata/out/basic-cpp")
+ fdes, err := ReadFDEs("../../../testdata/out/basic-cpp")
 require.NoError(t, err)

- unwindTable := utb.buildUnwindTable(fdes)
+ unwindTable := BuildUnwindTable(fdes)
 require.Equal(t, 38, len(unwindTable))

 require.Equal(t, uint64(0x401020), unwindTable[0].Loc)
@@ -47,12 +43,10 @@ func benchmarkParsingDwarfUnwindInformation(b *testing.B, executable string) {
 b.Helper()
 b.ReportAllocs()

- logger := log.NewNopLogger()
 var rbpOffset int64
- utb := NewUnwindTableBuilder(logger)

 for n := 0; n < b.N; n++ {
- fdes, err := utb.readFDEs(executable)
+ fdes, err := ReadFDEs(executable)
 if err != nil {
 panic("could not read FDEs")
 }
diff --git a/things_to_do_next.txt b/things_to_do_next.txt
new file mode 100644
index 0000000000..d6cd4ab1ea
--- /dev/null
+++ b/things_to_do_next.txt
@@ -0,0 +1,36 @@
+1. Unwind table generation doesn't have to happen if the mappings are cached
+2. Clean up the code
+3. Add JIT checks?
+4. Refresh PIDs from time to time
+5. Debugging endpoint
+6. Re-do unwind info if shards are full
+
+================
+later
+- don't write all the time
+- reuse buffers
+
+================
+
+1. Read the code
+2. Make it better
+3. Try to improve it
+4. Add WebUI
+5. Add JIT detection?
+6. Do lots of testing (edge cases, etc.)
+7. Add caching
+
+
+
+
+== bugs
+
+- nginx fails sometimes
+- make && sudo dist/parca-agent --node=test --remote-store-insecure --remote-store-address=127.0.0.1:7070 --experimental-enable-dwarf-unwinding --debug-process-names="(systemd|python|ruby|irb|bash)" fails
+
+perhaps an edge case?
+
+- left 226562 right 250000
+
+
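Not part of the patch: a minimal, hypothetical sketch of how the newly exported helpers fit together, mirroring the updated test. The testdata path and the printed fields are only illustrative.

package main

import (
	"errors"
	"fmt"

	"github.com/parca-dev/parca-agent/pkg/stack/unwind"
)

func main() {
	// Path taken from the repository's test data; any ELF executable with a .eh_frame section works.
	fdes, err := unwind.ReadFDEs("testdata/out/basic-cpp")
	// The new sentinel errors make "no unwind information" distinguishable from real failures.
	if errors.Is(err, unwind.ErrEhFrameSectionNotFound) || errors.Is(err, unwind.ErrNoFDEsFound) {
		fmt.Println("no usable unwind information:", err)
		return
	}
	if err != nil {
		panic(err)
	}

	// BuildUnwindTable is now a free function: no UnwindTableBuilder or logger required.
	table := unwind.BuildUnwindTable(fdes)
	if len(table) > 0 {
		fmt.Printf("rows: %d, first pc: %x\n", len(table), table[0].Loc)
	}
}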