Merge pull request #1008 from sched-ext/htejun/layered-updates
scx_layered: Prioritize sched userspace and fix owned execution protection
htejun authored Nov 28, 2024
2 parents 3a1e673 + 4a95873 commit 5b57cdf
Showing 3 changed files with 66 additions and 46 deletions.
89 changes: 53 additions & 36 deletions scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -25,7 +25,8 @@ char _license[] SEC("license") = "GPL";
 
 extern unsigned CONFIG_HZ __kconfig;
 
-const volatile u32 debug = 0;
+const volatile u32 debug;
+const volatile s32 layered_tgid;
 const volatile u64 slice_ns;
 const volatile u64 max_exec_ns;
 const volatile u32 nr_possible_cpus = 1;
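Note: like debug and slice_ns above, the new layered_tgid is a const volatile global in the BPF program's read-only data. The Rust loader fills it in through skel.maps.rodata_data before the skeleton is loaded (see the main.rs hunk further down), so the BPF program sees it as a load-time constant.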
@@ -1038,13 +1039,13 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 		vtime = layer->vtime_now - slice_ns;
 
 	/*
-	 * Special-case per-cpu kthreads which aren't in a preempting layer so
-	 * that they run between preempting and non-preempting layers. This is
-	 * to give reasonable boost to per-cpu kthreads by default as they are
-	 * usually important for system performance and responsiveness.
+	 * Special-case per-cpu kthreads and scx_layered userspace so that they
+	 * run before preempting layers. This is to guarantee timely execution
+	 * of layered userspace code and give boost to per-cpu kthreads as they
+	 * are usually important for system performance and responsiveness.
 	 */
-	if (!layer->preempt &&
-	    (p->flags & PF_KTHREAD) && p->nr_cpus_allowed < nr_possible_cpus) {
+	if (((p->flags & PF_KTHREAD) && p->nr_cpus_allowed < nr_possible_cpus) ||
+	    p->tgid == layered_tgid) {
 		struct cpumask *layer_cpumask;
 
 		if (layer->kind == LAYER_KIND_CONFINED &&
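For illustration only (not part of this commit), the new condition can be read as a single predicate over the enqueued task. The helper name below is hypothetical; PF_KTHREAD, nr_possible_cpus and layered_tgid are the identifiers used in the hunk above.

static __always_inline bool enq_runs_before_preempt(struct task_struct *p)
{
	/* pinned per-cpu kthreads keep their existing boost */
	bool pinned_kthread = (p->flags & PF_KTHREAD) &&
			      p->nr_cpus_allowed < nr_possible_cpus;

	/* any thread of the scx_layered userspace process now gets the same
	 * treatment, so the scheduler's own work is not starved by
	 * preempting layers
	 */
	return pinned_kthread || p->tgid == layered_tgid;
}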
@@ -1837,52 +1838,67 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
 {
 	struct cpu_ctx *cpuc;
 	struct task_ctx *taskc;
-	struct layer *layer;
+	struct layer *task_layer, *cpu_layer = NULL;
 	u64 now = bpf_ktime_get_ns();
-	s32 lid;
-	u64 used;
+	bool is_fallback;
+	s32 task_lid, target_ppk;
+	u64 used, cpu_slice;
 
 	if (!(cpuc = lookup_cpu_ctx(-1)) || !(taskc = lookup_task_ctx(p)))
 		return;
+	is_fallback = cpuc->cpu == fallback_cpu;
 
-	lid = taskc->layer_id;
-	if (!(layer = lookup_layer(lid)))
+	task_lid = taskc->layer_id;
+	if (!(task_layer = lookup_layer(task_lid)))
 		return;
 
-	used = now - taskc->running_at;
+	if (cpuc->layer_id != MAX_LAYERS &&
+	    !(cpu_layer = lookup_layer(cpuc->layer_id)))
+		return;
 
-	u64 slice_ns = layer_slice_ns(layer);
+	used = now - taskc->running_at;
 
 	if (cpuc->running_owned) {
-		cpuc->layer_usages[lid][LAYER_USAGE_OWNED] += used;
+		cpuc->layer_usages[task_lid][LAYER_USAGE_OWNED] += used;
 		if (cpuc->protect_owned)
-			cpuc->layer_usages[lid][LAYER_USAGE_PROTECTED] += used;
+			cpuc->layer_usages[task_lid][LAYER_USAGE_PROTECTED] += used;
 		cpuc->owned_usage += used;
 	} else {
-		cpuc->layer_usages[lid][LAYER_USAGE_OPEN] += used;
+		cpuc->layer_usages[task_lid][LAYER_USAGE_OPEN] += used;
 		cpuc->open_usage += used;
 	}
 
+	/*
+	 * Owned execution protection.
+	 */
+	if (cpu_layer) {
+		target_ppk = cpu_layer->owned_usage_target_ppk;
+		cpu_slice = layer_slice_ns(cpu_layer);
+	} else {
+		target_ppk = 0;
+		cpu_slice = slice_ns;
+	}
+
+	/*
+	 * For the fallback CPU, execution for layers without any CPU counts as
+	 * owned. Guarantee that at least half of the fallback CPU is used for
+	 * empty execution so that empty layers can easily ramp up even when
+	 * there are saturating preempt layers. Note that a fallback DSQ may
+	 * belong to a layer under saturation. In such cases, tasks from both
+	 * the owner and empty layers would count as owned with empty layers
+	 * being prioritized.
+	 */
+	if (is_fallback && target_ppk < 512)
+		target_ppk = 512;
+
 	/*
 	 * Apply owned protection iff the CPU stayed saturated for longer than
-	 * twice the slice.
+	 * twice the default slice.
 	 */
-	if (layer->owned_usage_target_ppk &&
-	    (cpuc->owned_usage + cpuc->open_usage) - cpuc->usage_at_idle > 2 * slice_ns) {
+	if (target_ppk &&
+	    (cpuc->owned_usage + cpuc->open_usage) - cpuc->usage_at_idle > 2 * cpu_slice) {
 		u64 owned = cpuc->owned_usage - cpuc->prev_owned_usage[0];
 		u64 open = cpuc->open_usage - cpuc->prev_open_usage[0];
-		u32 target_ppk;
-
-		/*
-		 * For the fallback CPU, execution for layers without any CPU
-		 * counts as owned. Guarantee that at least half of the fallback
-		 * CPU is used for that so that empty layers can easily ramp up
-		 * even when there are saturating preempt layers.
-		 */
-		if (cpuc->cpu == fallback_cpu)
-			target_ppk = 512;
-		else
-			target_ppk = layer->owned_usage_target_ppk;
 
 		cpuc->protect_owned = 1024 * owned / (owned + open) <= target_ppk;
 	} else {
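To make the new protection check concrete, here is a worked example with made-up numbers (illustrative only, not part of this commit); target_ppk is in parts-per-1024, mirroring owned_usage_target_ppk above.

/* Hypothetical tracking window on a non-fallback CPU whose layer sets
 * owned_usage_target_ppk = 800.
 */
u64 owned = 6000000;	/* 6ms of owned execution in the window */
u64 open = 2000000;	/* 2ms of open execution in the window */
u32 target_ppk = 800;

/* 1024 * 6 / (6 + 2) == 768 <= 800: owned execution has not yet reached
 * its target share, so it stays protected. With 7ms owned and 1ms open
 * the share would be 896 > 800 and protection would be dropped. On the
 * fallback CPU target_ppk is floored at 512, so owned execution (which
 * there includes empty-layer execution) remains protected until it
 * reaches roughly half of the CPU.
 */
bool protect = 1024 * owned / (owned + open) <= target_ppk;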
@@ -1897,10 +1913,10 @@ void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
 	 * Apply min_exec_us, scale the execution time by the inverse of the
 	 * weight and charge.
 	 */
-	if (used < layer->min_exec_ns) {
-		lstat_inc(LSTAT_MIN_EXEC, layer, cpuc);
-		lstat_add(LSTAT_MIN_EXEC_NS, layer, cpuc, layer->min_exec_ns - used);
-		used = layer->min_exec_ns;
+	if (used < task_layer->min_exec_ns) {
+		lstat_inc(LSTAT_MIN_EXEC, task_layer, cpuc);
+		lstat_add(LSTAT_MIN_EXEC_NS, task_layer, cpuc, task_layer->min_exec_ns - used);
+		used = task_layer->min_exec_ns;
 	}
 
 	if (cpuc->yielding && used < slice_ns)
@@ -1979,6 +1995,7 @@ void BPF_STRUCT_OPS(layered_update_idle, s32 cpu, bool idle)
 	if (!idle || !(cpuc = lookup_cpu_ctx(cpu)))
 		return;
 
+	cpuc->protect_owned = false;
 	cpuc->usage_at_idle = cpuc->owned_usage + cpuc->open_usage;
 }
 
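Note: the protect_owned reset above pairs with the saturation check in layered_stopping. Going idle means the CPU is no longer saturated, so the protection flag is cleared and usage_at_idle is re-based; protection can only engage again once more than twice the slice of busy time has accumulated since this idle point.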
3 changes: 3 additions & 0 deletions scheds/rust/scx_layered/src/main.rs
@@ -1584,6 +1584,9 @@ impl<'a> Scheduler<'a> {
         skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
 
         skel.maps.rodata_data.debug = opts.verbose as u32;
+        // Running scx_layered inside a PID namespace would break the
+        // following.
+        skel.maps.rodata_data.layered_tgid = std::process::id() as i32;
         skel.maps.rodata_data.slice_ns = opts.slice_us * 1000;
         skel.maps.rodata_data.max_exec_ns = if opts.max_exec_us > 0 {
             opts.max_exec_us * 1000
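Note: the PID-namespace caveat exists because std::process::id() returns the scheduler's PID as seen in its own namespace, while the p->tgid that the BPF side compares against is the TGID in the initial namespace. Run inside a PID namespace, the two would not match and the userspace-prioritization path in layered_enqueue would never trigger.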
20 changes: 10 additions & 10 deletions scheds/rust/scx_layered/src/stats.rs
@@ -56,10 +56,10 @@ pub struct LayerStats {
     pub index: usize,
     #[stat(desc = "Total CPU utilization (100% means one full CPU)")]
     pub util: f64,
-    #[stat(desc = "Protected CPU utilization %")]
-    pub util_protected: f64,
     #[stat(desc = "Open CPU utilization %")]
-    pub util_open: f64,
+    pub util_open_frac: f64,
+    #[stat(desc = "Protected CPU utilization %")]
+    pub util_protected_frac: f64,
     #[stat(desc = "fraction of total CPU utilization")]
     pub util_frac: f64,
     #[stat(desc = "sum of weight * duty_cycle for tasks")]
@@ -188,13 +188,13 @@ impl LayerStats {
         Self {
             index: lidx,
             util: util_sum * 100.0,
-            util_protected: if util_sum != 0.0 {
-                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED] / util_sum * 100.0
+            util_open_frac: if util_sum != 0.0 {
+                stats.layer_utils[lidx][LAYER_USAGE_OPEN] / util_sum * 100.0
             } else {
                 0.0
             },
-            util_open: if util_sum != 0.0 {
-                stats.layer_utils[lidx][LAYER_USAGE_OPEN] / util_sum * 100.0
+            util_protected_frac: if util_sum != 0.0 {
+                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED] / util_sum * 100.0
             } else {
                 0.0
             },
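Note: as the computation above shows, both *_frac fields are percentages of the layer's own utilization rather than of the whole machine. With made-up numbers, a layer whose util_sum is 2.5 CPUs, of which 0.5 CPU ran open and 1.0 CPU ran protected, reports util = 250.0, util_open_frac = 0.5 / 2.5 * 100 = 20.0, and util_protected_frac = 1.0 / 2.5 * 100 = 40.0.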
@@ -259,11 +259,11 @@ impl LayerStats {
     pub fn format<W: Write>(&self, w: &mut W, name: &str, header_width: usize) -> Result<()> {
         writeln!(
             w,
-            " {:<width$}: util/prot/open/frac={:6.1}/{}/{}/{:7.1} tasks={:6} load={:9.2}",
+            " {:<width$}: util/open/prot/frac={:6.1}/{}/{}/{:7.1} tasks={:6} load={:9.2}",
             name,
             self.util,
-            fmt_pct(self.util_protected),
-            fmt_pct(self.util_open),
+            fmt_pct(self.util_open_frac),
+            fmt_pct(self.util_protected_frac),
             self.util_frac,
             self.tasks,
             self.load,
