diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
index 2acbad38f..a1dcc253d 100644
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -25,7 +25,8 @@ char _license[] SEC("license") = "GPL";
 
 extern unsigned CONFIG_HZ __kconfig;
 
-const volatile u32 debug = 0;
+const volatile u32 debug;
+const volatile s32 layered_tgid;
 const volatile u64 slice_ns;
 const volatile u64 max_exec_ns;
 const volatile u32 nr_possible_cpus = 1;
@@ -1038,13 +1039,13 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 		vtime = layer->vtime_now - slice_ns;
 
 	/*
-	 * Special-case per-cpu kthreads which aren't in a preempting layer so
-	 * that they run between preempting and non-preempting layers. This is
-	 * to give reasonable boost to per-cpu kthreads by default as they are
-	 * usually important for system performance and responsiveness.
+	 * Special-case per-cpu kthreads and scx_layered userspace so that they
+	 * run before preempting layers. This is to guarantee timely execution
+	 * of layered userspace code and give boost to per-cpu kthreads as they
+	 * are usually important for system performance and responsiveness.
 	 */
-	if (!layer->preempt &&
-	    (p->flags & PF_KTHREAD) && p->nr_cpus_allowed < nr_possible_cpus) {
+	if (((p->flags & PF_KTHREAD) && p->nr_cpus_allowed < nr_possible_cpus) ||
+	    p->tgid == layered_tgid) {
 		struct cpumask *layer_cpumask;
 
 		if (layer->kind == LAYER_KIND_CONFINED &&
@@ -1837,52 +1838,67 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
 {
 	struct cpu_ctx *cpuc;
 	struct task_ctx *taskc;
-	struct layer *layer;
+	struct layer *task_layer, *cpu_layer = NULL;
 	u64 now = bpf_ktime_get_ns();
-	s32 lid;
-	u64 used;
+	bool is_fallback;
+	s32 task_lid, target_ppk;
+	u64 used, cpu_slice;
 
 	if (!(cpuc = lookup_cpu_ctx(-1)) || !(taskc = lookup_task_ctx(p)))
 		return;
+	is_fallback = cpuc->cpu == fallback_cpu;
 
-	lid = taskc->layer_id;
-	if (!(layer = lookup_layer(lid)))
+	task_lid = taskc->layer_id;
+	if (!(task_layer = lookup_layer(task_lid)))
 		return;
 
-	used = now - taskc->running_at;
+	if (cpuc->layer_id != MAX_LAYERS &&
+	    !(cpu_layer = lookup_layer(cpuc->layer_id)))
+		return;
 
-	u64 slice_ns = layer_slice_ns(layer);
+	used = now - taskc->running_at;
 
 	if (cpuc->running_owned) {
-		cpuc->layer_usages[lid][LAYER_USAGE_OWNED] += used;
+		cpuc->layer_usages[task_lid][LAYER_USAGE_OWNED] += used;
 		if (cpuc->protect_owned)
-			cpuc->layer_usages[lid][LAYER_USAGE_PROTECTED] += used;
+			cpuc->layer_usages[task_lid][LAYER_USAGE_PROTECTED] += used;
 		cpuc->owned_usage += used;
 	} else {
-		cpuc->layer_usages[lid][LAYER_USAGE_OPEN] += used;
+		cpuc->layer_usages[task_lid][LAYER_USAGE_OPEN] += used;
 		cpuc->open_usage += used;
 	}
 
+	/*
+	 * Owned execution protection.
+	 */
+	if (cpu_layer) {
+		target_ppk = cpu_layer->owned_usage_target_ppk;
+		cpu_slice = layer_slice_ns(cpu_layer);
+	} else {
+		target_ppk = 0;
+		cpu_slice = slice_ns;
+	}
+
+	/*
+	 * For the fallback CPU, execution for layers without any CPU counts as
+	 * owned. Guarantee that at least half of the fallback CPU is used for
+	 * empty execution so that empty layers can easily ramp up even when
+	 * there are saturating preempt layers. Note that a fallback DSQ may
+	 * belong to a layer under saturation. In such cases, tasks from both
+	 * the owner and empty layers would count as owned with empty layers
+	 * being prioritized.
+	 */
+	if (is_fallback && target_ppk < 512)
+		target_ppk = 512;
+
 	/*
 	 * Apply owned protection iff the CPU stayed saturated for longer than
-	 * twice the slice.
+	 * twice the default slice.
 	 */
-	if (layer->owned_usage_target_ppk &&
-	    (cpuc->owned_usage + cpuc->open_usage) - cpuc->usage_at_idle > 2 * slice_ns) {
+	if (target_ppk &&
+	    (cpuc->owned_usage + cpuc->open_usage) - cpuc->usage_at_idle > 2 * cpu_slice) {
 		u64 owned = cpuc->owned_usage - cpuc->prev_owned_usage[0];
 		u64 open = cpuc->open_usage - cpuc->prev_open_usage[0];
-		u32 target_ppk;
-
-		/*
-		 * For the fallback CPU, execution for layers without any CPU
-		 * counts as owned. Guarantee that at least half of the fallback
-		 * CPU is used for that so that empty layers can easily ramp up
-		 * even when there are saturating preempt layers.
-		 */
-		if (cpuc->cpu == fallback_cpu)
-			target_ppk = 512;
-		else
-			target_ppk = layer->owned_usage_target_ppk;
 
 		cpuc->protect_owned = 1024 * owned / (owned + open) <= target_ppk;
 	} else {
@@ -1897,10 +1913,10 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
 	 * Apply min_exec_us, scale the execution time by the inverse of the
 	 * weight and charge.
 	 */
-	if (used < layer->min_exec_ns) {
-		lstat_inc(LSTAT_MIN_EXEC, layer, cpuc);
-		lstat_add(LSTAT_MIN_EXEC_NS, layer, cpuc, layer->min_exec_ns - used);
-		used = layer->min_exec_ns;
+	if (used < task_layer->min_exec_ns) {
+		lstat_inc(LSTAT_MIN_EXEC, task_layer, cpuc);
+		lstat_add(LSTAT_MIN_EXEC_NS, task_layer, cpuc, task_layer->min_exec_ns - used);
+		used = task_layer->min_exec_ns;
 	}
 
 	if (cpuc->yielding && used < slice_ns)
@@ -1979,6 +1995,7 @@ void BPF_STRUCT_OPS(layered_update_idle, s32 cpu, bool idle)
 	if (!idle || !(cpuc = lookup_cpu_ctx(cpu)))
 		return;
 
+	cpuc->protect_owned = false;
 	cpuc->usage_at_idle = cpuc->owned_usage + cpuc->open_usage;
 }
 
diff --git a/scheds/rust/scx_layered/src/main.rs b/scheds/rust/scx_layered/src/main.rs
index 4aae3b6ad..fe4f849af 100644
--- a/scheds/rust/scx_layered/src/main.rs
+++ b/scheds/rust/scx_layered/src/main.rs
@@ -1584,6 +1584,9 @@ impl<'a> Scheduler<'a> {
         skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
 
         skel.maps.rodata_data.debug = opts.verbose as u32;
+        // Running scx_layered inside a PID namespace would break the
+        // following.
+        skel.maps.rodata_data.layered_tgid = std::process::id() as i32;
         skel.maps.rodata_data.slice_ns = opts.slice_us * 1000;
         skel.maps.rodata_data.max_exec_ns = if opts.max_exec_us > 0 {
             opts.max_exec_us * 1000
diff --git a/scheds/rust/scx_layered/src/stats.rs b/scheds/rust/scx_layered/src/stats.rs
index c51b5c6c6..15bd79609 100644
--- a/scheds/rust/scx_layered/src/stats.rs
+++ b/scheds/rust/scx_layered/src/stats.rs
@@ -56,10 +56,10 @@ pub struct LayerStats {
     pub index: usize,
     #[stat(desc = "Total CPU utilization (100% means one full CPU)")]
     pub util: f64,
-    #[stat(desc = "Protected CPU utilization %")]
-    pub util_protected: f64,
     #[stat(desc = "Open CPU utilization %")]
-    pub util_open: f64,
+    pub util_open_frac: f64,
+    #[stat(desc = "Protected CPU utilization %")]
+    pub util_protected_frac: f64,
     #[stat(desc = "fraction of total CPU utilization")]
     pub util_frac: f64,
     #[stat(desc = "sum of weight * duty_cycle for tasks")]
@@ -188,13 +188,13 @@ impl LayerStats {
         Self {
             index: lidx,
             util: util_sum * 100.0,
-            util_protected: if util_sum != 0.0 {
-                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED] / util_sum * 100.0
+            util_open_frac: if util_sum != 0.0 {
+                stats.layer_utils[lidx][LAYER_USAGE_OPEN] / util_sum * 100.0
             } else {
                 0.0
             },
-            util_open: if util_sum != 0.0 {
-                stats.layer_utils[lidx][LAYER_USAGE_OPEN] / util_sum * 100.0
+            util_protected_frac: if util_sum != 0.0 {
+                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED] / util_sum * 100.0
             } else {
                 0.0
             },
@@ -259,11 +259,11 @@ impl LayerStats {
     pub fn format<W: Write>(&self, w: &mut W, name: &str, header_width: usize) -> Result<()> {
         writeln!(
             w,
-            " {:
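Note on the owned-protection arithmetic above (reviewer sketch, not part of the patch): layered_stopping() decides cpuc->protect_owned with a parts-per-1024 ("ppk") comparison, 1024 * owned / (owned + open) <= target_ppk, where target_ppk now comes from the CPU's layer and is floored at 512 on the fallback CPU so that at least half of it stays available for "owned" (including empty-layer) execution. The standalone C harness below only reproduces that arithmetic under assumed inputs; should_protect_owned() and its parameters are hypothetical names, and the saturation gate from the BPF code (usage since the last idle point exceeding twice the CPU slice) is deliberately omitted.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Decide whether owned execution should be protected on a CPU. */
static bool should_protect_owned(uint64_t owned_ns, uint64_t open_ns,
				 uint32_t target_ppk, bool is_fallback)
{
	/* The fallback CPU reserves at least half (512/1024) for owned time. */
	if (is_fallback && target_ppk < 512)
		target_ppk = 512;

	if (!target_ppk || owned_ns + open_ns == 0)
		return false;

	/* Protect while the owned share is at or below the target share. */
	return 1024 * owned_ns / (owned_ns + open_ns) <= target_ppk;
}

int main(void)
{
	/* 3ms owned vs 7ms open since the last idle point: ~30% owned. */
	printf("%d\n", should_protect_owned(3000000, 7000000, 400, false)); /* 1: 307 <= 400 */
	printf("%d\n", should_protect_owned(6000000, 4000000, 400, false)); /* 0: 614 > 400 */
	printf("%d\n", should_protect_owned(3000000, 7000000, 0, true));    /* 1: floored to 512 */
	return 0;
}

Compiled with a plain cc invocation this prints 1, 0, 1 for the three cases, matching the intent of the hunk: protection stays on while the owned share is at or below its target and switches off once owned time dominates.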