Merge pull request #1008 from sched-ext/htejun/layered-updates
scx_layered: Prioritize sched userspace and fix owned execution protection
htejun authored Nov 28, 2024
2 parents 3a1e673 + 4a95873 commit 5b57cdf
Showing 3 changed files with 66 additions and 46 deletions.
89 changes: 53 additions & 36 deletions scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -25,7 +25,8 @@ char _license[] SEC("license") = "GPL";
 
 extern unsigned CONFIG_HZ __kconfig;
 
-const volatile u32 debug = 0;
+const volatile u32 debug;
+const volatile s32 layered_tgid;
 const volatile u64 slice_ns;
 const volatile u64 max_exec_ns;
 const volatile u32 nr_possible_cpus = 1;
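Note: like debug and slice_ns above, the new layered_tgid is a const volatile global in the BPF program's read-only data. The Rust loader fills it in through skel.maps.rodata_data before the skeleton is loaded (see the main.rs hunk further down), so the BPF program sees it as a load-time constant.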
@@ -1038,13 +1039,13 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 		vtime = layer->vtime_now - slice_ns;
 
 	/*
-	 * Special-case per-cpu kthreads which aren't in a preempting layer so
-	 * that they run between preempting and non-preempting layers. This is
-	 * to give reasonable boost to per-cpu kthreads by default as they are
-	 * usually important for system performance and responsiveness.
+	 * Special-case per-cpu kthreads and scx_layered userspace so that they
+	 * run before preempting layers. This is to guarantee timely execution
+	 * of layered userspace code and give boost to per-cpu kthreads as they
+	 * are usually important for system performance and responsiveness.
 	 */
-	if (!layer->preempt &&
-	    (p->flags & PF_KTHREAD) && p->nr_cpus_allowed < nr_possible_cpus) {
+	if (((p->flags & PF_KTHREAD) && p->nr_cpus_allowed < nr_possible_cpus) ||
+	    p->tgid == layered_tgid) {
 		struct cpumask *layer_cpumask;
 
 		if (layer->kind == LAYER_KIND_CONFINED &&
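For illustration only (not part of this commit), the new condition can be read as a single predicate over the enqueued task. The helper name below is hypothetical; PF_KTHREAD, nr_possible_cpus and layered_tgid are the identifiers used in the hunk above.

static __always_inline bool enq_runs_before_preempt(struct task_struct *p)
{
	/* pinned per-cpu kthreads keep their existing boost */
	bool pinned_kthread = (p->flags & PF_KTHREAD) &&
			      p->nr_cpus_allowed < nr_possible_cpus;

	/* any thread of the scx_layered userspace process now gets the same
	 * treatment, so the scheduler's own work is not starved by
	 * preempting layers
	 */
	return pinned_kthread || p->tgid == layered_tgid;
}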
@@ -1837,52 +1838,67 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
 {
 	struct cpu_ctx *cpuc;
 	struct task_ctx *taskc;
-	struct layer *layer;
+	struct layer *task_layer, *cpu_layer = NULL;
 	u64 now = bpf_ktime_get_ns();
-	s32 lid;
-	u64 used;
+	bool is_fallback;
+	s32 task_lid, target_ppk;
+	u64 used, cpu_slice;
 
 	if (!(cpuc = lookup_cpu_ctx(-1)) || !(taskc = lookup_task_ctx(p)))
 		return;
+	is_fallback = cpuc->cpu == fallback_cpu;
 
-	lid = taskc->layer_id;
-	if (!(layer = lookup_layer(lid)))
+	task_lid = taskc->layer_id;
+	if (!(task_layer = lookup_layer(task_lid)))
 		return;
 
-	used = now - taskc->running_at;
+	if (cpuc->layer_id != MAX_LAYERS &&
+	    !(cpu_layer = lookup_layer(cpuc->layer_id)))
+		return;
 
-	u64 slice_ns = layer_slice_ns(layer);
+	used = now - taskc->running_at;
 
 	if (cpuc->running_owned) {
-		cpuc->layer_usages[lid][LAYER_USAGE_OWNED] += used;
+		cpuc->layer_usages[task_lid][LAYER_USAGE_OWNED] += used;
 		if (cpuc->protect_owned)
-			cpuc->layer_usages[lid][LAYER_USAGE_PROTECTED] += used;
+			cpuc->layer_usages[task_lid][LAYER_USAGE_PROTECTED] += used;
 		cpuc->owned_usage += used;
 	} else {
-		cpuc->layer_usages[lid][LAYER_USAGE_OPEN] += used;
+		cpuc->layer_usages[task_lid][LAYER_USAGE_OPEN] += used;
 		cpuc->open_usage += used;
 	}
 
+	/*
+	 * Owned execution protection.
+	 */
+	if (cpu_layer) {
+		target_ppk = cpu_layer->owned_usage_target_ppk;
+		cpu_slice = layer_slice_ns(cpu_layer);
+	} else {
+		target_ppk = 0;
+		cpu_slice = slice_ns;
+	}
+
+	/*
+	 * For the fallback CPU, execution for layers without any CPU counts as
+	 * owned. Guarantee that at least half of the fallback CPU is used for
+	 * empty execution so that empty layers can easily ramp up even when
+	 * there are saturating preempt layers. Note that a fallback DSQ may
+	 * belong to a layer under saturation. In such cases, tasks from both
+	 * the owner and empty layers would count as owned with empty layers
+	 * being prioritized.
+	 */
+	if (is_fallback && target_ppk < 512)
+		target_ppk = 512;
+
 	/*
 	 * Apply owned protection iff the CPU stayed saturated for longer than
-	 * twice the slice.
+	 * twice the default slice.
 	 */
-	if (layer->owned_usage_target_ppk &&
-	    (cpuc->owned_usage + cpuc->open_usage) - cpuc->usage_at_idle > 2 * slice_ns) {
+	if (target_ppk &&
+	    (cpuc->owned_usage + cpuc->open_usage) - cpuc->usage_at_idle > 2 * cpu_slice) {
 		u64 owned = cpuc->owned_usage - cpuc->prev_owned_usage[0];
 		u64 open = cpuc->open_usage - cpuc->prev_open_usage[0];
-		u32 target_ppk;
-
-		/*
-		 * For the fallback CPU, execution for layers without any CPU
-		 * counts as owned. Guarantee that at least half of the fallback
-		 * CPU is used for that so that empty layers can easily ramp up
-		 * even when there are saturating preempt layers.
-		 */
-		if (cpuc->cpu == fallback_cpu)
-			target_ppk = 512;
-		else
-			target_ppk = layer->owned_usage_target_ppk;
 
 		cpuc->protect_owned = 1024 * owned / (owned + open) <= target_ppk;
 	} else {
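To make the new protection check concrete, here is a worked example with made-up numbers (illustrative only, not part of this commit); target_ppk is in parts-per-1024, mirroring owned_usage_target_ppk above.

/* Hypothetical tracking window on a non-fallback CPU whose layer sets
 * owned_usage_target_ppk = 800.
 */
u64 owned = 6000000;	/* 6ms of owned execution in the window */
u64 open = 2000000;	/* 2ms of open execution in the window */
u32 target_ppk = 800;

/* 1024 * 6 / (6 + 2) == 768 <= 800: owned execution has not yet reached
 * its target share, so it stays protected. With 7ms owned and 1ms open
 * the share would be 896 > 800 and protection would be dropped. On the
 * fallback CPU target_ppk is floored at 512, so owned execution (which
 * there includes empty-layer execution) remains protected until it
 * reaches roughly half of the CPU.
 */
bool protect = 1024 * owned / (owned + open) <= target_ppk;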
@@ -1897,10 +1913,10 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
 	 * Apply min_exec_us, scale the execution time by the inverse of the
 	 * weight and charge.
 	 */
-	if (used < layer->min_exec_ns) {
-		lstat_inc(LSTAT_MIN_EXEC, layer, cpuc);
-		lstat_add(LSTAT_MIN_EXEC_NS, layer, cpuc, layer->min_exec_ns - used);
-		used = layer->min_exec_ns;
+	if (used < task_layer->min_exec_ns) {
+		lstat_inc(LSTAT_MIN_EXEC, task_layer, cpuc);
+		lstat_add(LSTAT_MIN_EXEC_NS, task_layer, cpuc, task_layer->min_exec_ns - used);
+		used = task_layer->min_exec_ns;
 	}
 
 	if (cpuc->yielding && used < slice_ns)
@@ -1979,6 +1995,7 @@ void BPF_STRUCT_OPS(layered_update_idle, s32 cpu, bool idle)
 	if (!idle || !(cpuc = lookup_cpu_ctx(cpu)))
 		return;
 
+	cpuc->protect_owned = false;
 	cpuc->usage_at_idle = cpuc->owned_usage + cpuc->open_usage;
 }
 
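Note: the protect_owned reset above pairs with the saturation check in layered_stopping. Going idle means the CPU is no longer saturated, so the protection flag is cleared and usage_at_idle is re-based; protection can only engage again once more than twice the slice of busy time has accumulated since this idle point.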
3 changes: 3 additions & 0 deletions scheds/rust/scx_layered/src/main.rs
@@ -1584,6 +1584,9 @@ impl<'a> Scheduler<'a> {
         skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
 
         skel.maps.rodata_data.debug = opts.verbose as u32;
+        // Running scx_layered inside a PID namespace would break the
+        // following.
+        skel.maps.rodata_data.layered_tgid = std::process::id() as i32;
         skel.maps.rodata_data.slice_ns = opts.slice_us * 1000;
         skel.maps.rodata_data.max_exec_ns = if opts.max_exec_us > 0 {
             opts.max_exec_us * 1000
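Note: the PID-namespace caveat exists because std::process::id() returns the scheduler's PID as seen in its own namespace, while the p->tgid that the BPF side compares against is the TGID in the initial namespace. Run inside a PID namespace, the two would not match and the userspace-prioritization path in layered_enqueue would never trigger.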
20 changes: 10 additions & 10 deletions scheds/rust/scx_layered/src/stats.rs
@@ -56,10 +56,10 @@ pub struct LayerStats {
     pub index: usize,
     #[stat(desc = "Total CPU utilization (100% means one full CPU)")]
     pub util: f64,
-    #[stat(desc = "Protected CPU utilization %")]
-    pub util_protected: f64,
     #[stat(desc = "Open CPU utilization %")]
-    pub util_open: f64,
+    pub util_open_frac: f64,
+    #[stat(desc = "Protected CPU utilization %")]
+    pub util_protected_frac: f64,
     #[stat(desc = "fraction of total CPU utilization")]
     pub util_frac: f64,
     #[stat(desc = "sum of weight * duty_cycle for tasks")]
@@ -188,13 +188,13 @@ impl LayerStats {
         Self {
             index: lidx,
             util: util_sum * 100.0,
-            util_protected: if util_sum != 0.0 {
-                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED] / util_sum * 100.0
+            util_open_frac: if util_sum != 0.0 {
+                stats.layer_utils[lidx][LAYER_USAGE_OPEN] / util_sum * 100.0
             } else {
                 0.0
             },
-            util_open: if util_sum != 0.0 {
-                stats.layer_utils[lidx][LAYER_USAGE_OPEN] / util_sum * 100.0
+            util_protected_frac: if util_sum != 0.0 {
+                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED] / util_sum * 100.0
             } else {
                 0.0
             },
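Note: as the computation above shows, both *_frac fields are percentages of the layer's own utilization rather than of the whole machine. With made-up numbers, a layer whose util_sum is 2.5 CPUs, of which 0.5 CPU ran open and 1.0 CPU ran protected, reports util = 250.0, util_open_frac = 0.5 / 2.5 * 100 = 20.0, and util_protected_frac = 1.0 / 2.5 * 100 = 40.0.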
@@ -259,11 +259,11 @@ impl LayerStats {
     pub fn format<W: Write>(&self, w: &mut W, name: &str, header_width: usize) -> Result<()> {
         writeln!(
             w,
-            " {:<width$}: util/prot/open/frac={:6.1}/{}/{}/{:7.1} tasks={:6} load={:9.2}",
+            " {:<width$}: util/open/prot/frac={:6.1}/{}/{}/{:7.1} tasks={:6} load={:9.2}",
             name,
             self.util,
-            fmt_pct(self.util_protected),
-            fmt_pct(self.util_open),
+            fmt_pct(self.util_open_frac),
+            fmt_pct(self.util_protected_frac),
             self.util_frac,
             self.tasks,
             self.load,
