
Commit f4dc571

Merge pull request raspberrypi#113 from sched-ext/htejun
scx: Sync schedulers from SCX v0.1.5 (74923c6cdbc3)
2 parents 8c7f9b2 + 88e7560 commit f4dc571

20 files changed (+392, -144 lines)

tools/sched_ext/include/scx/common.bpf.h

Lines changed: 2 additions & 1 deletion

@@ -10,7 +10,7 @@
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
-#include <linux/errno.h>
+#include <asm-generic/errno.h>
 #include "user_exit_info.h"
 
 #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
@@ -68,6 +68,7 @@ const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
 const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
 void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
 void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
+s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
 bool scx_bpf_task_running(const struct task_struct *p) __ksym;
 s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
 struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
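
The new kfunc declaration is what lets a scheduler's .select_cpu callback reuse the kernel's default CPU-selection logic while learning whether an idle CPU was found. A minimal sketch of that usage, assuming a hypothetical scheduler named "example" and the usual scx includes; scx_flatcg below adopts the same shape:

/* Sketch only: hypothetical scheduler named "example", assumes common.bpf.h. */
s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	/* Let the default policy pick a CPU and report whether it was idle. */
	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

	/* An idle CPU was found: queue the task directly on its local DSQ. */
	if (is_idle)
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

	return cpu;
}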

tools/sched_ext/scx_central.bpf.c

Lines changed: 8 additions & 0 deletions

@@ -161,6 +161,14 @@ static bool dispatch_to_cpu(s32 cpu)
 			__sync_fetch_and_add(&nr_mismatches, 1);
 			scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0);
 			bpf_task_release(p);
+			/*
+			 * We might run out of dispatch buffer slots if we continue dispatching
+			 * to the fallback DSQ, without dispatching to the local DSQ of the
+			 * target CPU. In such a case, break the loop now as will fail the
+			 * next dispatch operation.
+			 */
+			if (!scx_bpf_dispatch_nr_slots())
+				break;
 			continue;
 		}

tools/sched_ext/scx_central.c

Lines changed: 4 additions & 3 deletions

@@ -8,6 +8,7 @@
 #include <sched.h>
 #include <stdio.h>
 #include <unistd.h>
+#include <inttypes.h>
 #include <signal.h>
 #include <libgen.h>
 #include <bpf/bpf.h>
@@ -103,17 +104,17 @@ int main(int argc, char **argv)
 
 	while (!exit_req && !uei_exited(&skel->bss->uei)) {
 		printf("[SEQ %llu]\n", seq++);
-		printf("total :%10lu local:%10lu queued:%10lu lost:%10lu\n",
+		printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n",
 		       skel->bss->nr_total,
 		       skel->bss->nr_locals,
 		       skel->bss->nr_queued,
 		       skel->bss->nr_lost_pids);
-		printf("timer :%10lu dispatch:%10lu mismatch:%10lu retry:%10lu\n",
+		printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n",
 		       skel->bss->nr_timers,
 		       skel->bss->nr_dispatches,
 		       skel->bss->nr_mismatches,
 		       skel->bss->nr_retries);
-		printf("overflow:%10lu\n",
+		printf("overflow:%10" PRIu64 "\n",
 		       skel->bss->nr_overflows);
 		fflush(stdout);
 		sleep(1);
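
The %lu → PRIu64 conversions here (and in scx_flatcg.c and scx_pair.c below) keep the format strings correct for the u64 counters shared with the BPF side regardless of whether the platform's uint64_t is unsigned long or unsigned long long. A stand-alone illustration of the macro, separate from the scheduler code:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t nr_total = 42;

	/* PRIu64 expands to the correct length modifier ("lu" or "llu")
	 * for this platform's uint64_t, unlike a hard-coded %lu. */
	printf("total :%10" PRIu64 "\n", nr_total);
	return 0;
}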

tools/sched_ext/scx_flatcg.bpf.c

Lines changed: 29 additions & 12 deletions

@@ -123,7 +123,7 @@ struct {
 } task_ctx SEC(".maps");
 
 /* gets inc'd on weight tree changes to expire the cached hweights */
-unsigned long hweight_gen = 1;
+u64 hweight_gen = 1;
 
 static u64 div_round_up(u64 dividend, u64 divisor)
 {
@@ -302,16 +302,18 @@ static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
 	bpf_spin_unlock(&cgv_tree_lock);
 }
 
-void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
+s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
 {
 	struct fcg_task_ctx *taskc;
-	struct cgroup *cgrp;
-	struct fcg_cgrp_ctx *cgc;
+	bool is_idle = false;
+	s32 cpu;
+
+	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
 
 	taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
 	if (!taskc) {
 		scx_bpf_error("task_ctx lookup failed");
-		return;
+		return cpu;
 	}
 
 	/*
@@ -321,7 +323,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
 	 * affinities so that we don't have to worry about per-cgroup dq's
 	 * containing tasks that can't be executed from some CPUs.
 	 */
-	if ((enq_flags & SCX_ENQ_LOCAL) || p->nr_cpus_allowed != nr_cpus) {
+	if (is_idle || p->nr_cpus_allowed != nr_cpus) {
 		/*
 		 * Tell fcg_stopping() that this bypassed the regular scheduling
 		 * path and should be force charged to the cgroup. 0 is used to
@@ -338,14 +340,28 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
 		 * implement per-cgroup fallback dq's instead so that we have
 		 * more control over when tasks with custom cpumask get issued.
 		 */
-		if ((enq_flags & SCX_ENQ_LOCAL) ||
+		if (is_idle ||
 		    (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) {
 			stat_inc(FCG_STAT_LOCAL);
-			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
 		} else {
 			stat_inc(FCG_STAT_GLOBAL);
-			scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+			scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
 		}
+	}
+
+	return cpu;
+}
+
+void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct fcg_task_ctx *taskc;
+	struct cgroup *cgrp;
+	struct fcg_cgrp_ctx *cgc;
+
+	taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
+	if (!taskc) {
+		scx_bpf_error("task_ctx lookup failed");
 		return;
 	}
 
@@ -756,8 +772,8 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
 	}
 }
 
-s32 BPF_STRUCT_OPS(fcg_prep_enable, struct task_struct *p,
-		   struct scx_enable_args *args)
+s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,
+		   struct scx_init_task_args *args)
 {
 	struct fcg_task_ctx *taskc;
 	struct fcg_cgrp_ctx *cgc;
@@ -893,13 +909,14 @@ void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei)
 
 SEC(".struct_ops.link")
 struct sched_ext_ops flatcg_ops = {
+	.select_cpu = (void *)fcg_select_cpu,
	.enqueue = (void *)fcg_enqueue,
	.dispatch = (void *)fcg_dispatch,
	.runnable = (void *)fcg_runnable,
	.running = (void *)fcg_running,
	.stopping = (void *)fcg_stopping,
	.quiescent = (void *)fcg_quiescent,
-	.prep_enable = (void *)fcg_prep_enable,
+	.init_task = (void *)fcg_init_task,
	.cgroup_set_weight = (void *)fcg_cgroup_set_weight,
	.cgroup_init = (void *)fcg_cgroup_init,
	.cgroup_exit = (void *)fcg_cgroup_exit,

tools/sched_ext/scx_flatcg.c

Lines changed: 3 additions & 1 deletion

@@ -9,6 +9,7 @@
 #include <unistd.h>
 #include <libgen.h>
 #include <limits.h>
+#include <inttypes.h>
 #include <fcntl.h>
 #include <time.h>
 #include <bpf/bpf.h>
@@ -183,7 +184,7 @@ int main(int argc, char **argv)
 
 		memcpy(last_stats, acc_stats, sizeof(acc_stats));
 
-		printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%lu]\n",
+		printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n",
 		       seq++, cpu_util * 100.0, skel->data->hweight_gen);
 		printf(" act:%6llu deact:%6llu local:%6llu global:%6llu\n",
 		       stats[FCG_STAT_ACT],
@@ -210,6 +211,7 @@
 		       stats[FCG_STAT_PNC_GONE]);
 		printf("BAD remove:%6llu\n",
 		       acc_stats[FCG_STAT_BAD_REMOVAL]);
+		fflush(stdout);
 
 		nanosleep(&intv_ts, NULL);
 	}
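
The added fflush(stdout) matters because stdout is only line-buffered when attached to a terminal; when the stats are piped into a file or another tool, each interval's output would otherwise sit in the stdio buffer instead of appearing once per sleep. A minimal stand-alone illustration, not taken from the scheduler code:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	for (int i = 0; i < 3; i++) {
		printf("tick %d\n", i);
		/* Push the line out even when stdout is redirected and
		 * therefore fully buffered rather than line-buffered. */
		fflush(stdout);
		sleep(1);
	}
	return 0;
}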

tools/sched_ext/scx_layered/Cargo.toml

Lines changed: 4 additions & 4 deletions

@@ -1,6 +1,6 @@
 [package]
 name = "scx_layered"
-version = "0.0.1"
+version = "0.0.4"
 authors = ["Tejun Heo <[email protected]>", "Meta"]
 edition = "2021"
 description = "Userspace scheduling with BPF for Ads"
@@ -13,16 +13,16 @@ clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] }
 ctrlc = { version = "3.1", features = ["termination"] }
 fb_procfs = "0.7"
 lazy_static = "1.4"
-libbpf-rs = "0.21"
+libbpf-rs = "0.22"
 libc = "0.2"
 log = "0.4"
-scx_utils = "0.3"
+scx_utils = "0.5"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
 simplelog = "0.12"
 
 [build-dependencies]
-scx_utils = "0.3"
+scx_utils = "0.5"
 
 [features]
 enable_backtrace = []

tools/sched_ext/scx_layered/README.md

Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
+# scx_layered
+
+This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main).
+
+## Overview
+
+A highly configurable multi-layer BPF / user space hybrid scheduler.
+
+scx_layered allows the user to classify tasks into multiple layers, and apply
+different scheduling policies to those layers. For example, a layer could be
+created of all tasks that are part of the `user.slice` cgroup slice, and a
+policy could be specified that ensures that the layer is given at least 80% CPU
+utilization for some subset of CPUs on the system.
+
+## How To Install
+
+Available as a [Rust crate](https://crates.io/crates/scx_layered): `cargo add scx_layered`
+
+## Typical Use Case
+
+scx_layered is designed to be highly customizable, and can be targeted for
+specific applications. For example, if you had a high-priority service that
+required priority access to all but 1 physical core to ensure acceptable p99
+latencies, you could specify that the service would get priority access to all
+but 1 core on the system. If that service ends up not utilizing all of those
+cores, they could be used by other layers until they're needed.
+
+## Production Ready?
+
+Yes. If tuned correctly, scx_layered should be performant across various CPU
+architectures and workloads.
+
+That said, you may run into an issue with infeasible weights, where a task with
+a very high weight may cause the scheduler to incorrectly leave cores idle
+because it thinks they're necessary to accommodate the compute for a single
+task. This can also happen in CFS, and should soon be addressed for
+scx_layered.

tools/sched_ext/scx_layered/src/bpf/main.bpf.c

Lines changed: 6 additions & 13 deletions

@@ -745,8 +745,8 @@ void BPF_STRUCT_OPS(layered_set_cpumask, struct task_struct *p,
 		bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask);
 }
 
-s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p,
-		   struct scx_enable_args *args)
+s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p,
+		   struct scx_init_task_args *args)
 {
 	struct task_ctx tctx_init = {
 		.pid = p->pid,
@@ -805,14 +805,8 @@ s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p,
 	return 0;
 }
 
-void BPF_STRUCT_OPS(layered_cancel_enable, struct task_struct *p)
-{
-	s32 pid = p->pid;
-
-	bpf_map_delete_elem(&task_ctxs, &pid);
-}
-
-void BPF_STRUCT_OPS(layered_disable, struct task_struct *p)
+void BPF_STRUCT_OPS(layered_exit_task, struct task_struct *p,
+		    struct scx_exit_task_args *args)
 {
 	struct cpu_ctx *cctx;
 	struct task_ctx *tctx;
@@ -977,9 +971,8 @@ struct sched_ext_ops layered = {
 	.quiescent = (void *)layered_quiescent,
 	.set_weight = (void *)layered_set_weight,
 	.set_cpumask = (void *)layered_set_cpumask,
-	.prep_enable = (void *)layered_prep_enable,
-	.cancel_enable = (void *)layered_cancel_enable,
-	.disable = (void *)layered_disable,
+	.init_task = (void *)layered_init_task,
+	.exit_task = (void *)layered_exit_task,
 	.init = (void *)layered_init,
 	.exit = (void *)layered_exit,
 	.name = "layered",

tools/sched_ext/scx_layered/src/main.rs

Lines changed: 10 additions & 10 deletions

@@ -1122,10 +1122,10 @@ struct Scheduler<'a> {
 
 impl<'a> Scheduler<'a> {
     fn init_layers(skel: &mut OpenBpfSkel, specs: &Vec<LayerSpec>) -> Result<()> {
-        skel.rodata().nr_layers = specs.len() as u32;
+        skel.rodata_mut().nr_layers = specs.len() as u32;
 
         for (spec_i, spec) in specs.iter().enumerate() {
-            let layer = &mut skel.bss().layers[spec_i];
+            let layer = &mut skel.bss_mut().layers[spec_i];
 
             for (or_i, or) in spec.matches.iter().enumerate() {
                 for (and_i, and) in or.iter().enumerate() {
@@ -1176,12 +1176,12 @@ impl<'a> Scheduler<'a> {
         let mut skel = skel_builder.open().context("Failed to open BPF program")?;
 
         // Initialize skel according to @opts.
-        skel.rodata().debug = opts.verbose as u32;
-        skel.rodata().slice_ns = opts.slice_us * 1000;
-        skel.rodata().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32;
-        skel.rodata().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores;
+        skel.rodata_mut().debug = opts.verbose as u32;
+        skel.rodata_mut().slice_ns = opts.slice_us * 1000;
+        skel.rodata_mut().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32;
+        skel.rodata_mut().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores;
         for cpu in cpu_pool.all_cpus.iter_ones() {
-            skel.rodata().all_cpus[cpu / 8] |= 1 << (cpu % 8);
+            skel.rodata_mut().all_cpus[cpu / 8] |= 1 << (cpu % 8);
         }
         Self::init_layers(&mut skel, &layer_specs)?;
 
@@ -1274,7 +1274,7 @@ impl<'a> Scheduler<'a> {
             {
                 Self::update_bpf_layer_cpumask(
                     &self.layers[idx],
-                    &mut self.skel.bss().layers[idx],
+                    &mut self.skel.bss_mut().layers[idx],
                 );
                 updated = true;
             }
@@ -1288,7 +1288,7 @@ impl<'a> Scheduler<'a> {
         let nr_available_cpus = available_cpus.count_ones();
         for idx in 0..self.layers.len() {
             let layer = &mut self.layers[idx];
-            let bpf_layer = &mut self.skel.bss().layers[idx];
+            let bpf_layer = &mut self.skel.bss_mut().layers[idx];
             match &layer.kind {
                 LayerKind::Open { .. } => {
                     layer.cpus.copy_from_bitslice(&available_cpus);
@@ -1299,7 +1299,7 @@ impl<'a> Scheduler<'a> {
             }
         }
 
-        self.skel.bss().fallback_cpu = self.cpu_pool.fallback_cpu as u32;
+        self.skel.bss_mut().fallback_cpu = self.cpu_pool.fallback_cpu as u32;
 
         for (lidx, layer) in self.layers.iter().enumerate() {
            self.nr_layer_cpus_min_max[lidx] = (

tools/sched_ext/scx_pair.c

Lines changed: 5 additions & 4 deletions

@@ -6,6 +6,7 @@
  */
 #include <stdio.h>
 #include <unistd.h>
+#include <inttypes.h>
 #include <signal.h>
 #include <libgen.h>
 #include <bpf/bpf.h>
@@ -142,18 +143,18 @@ int main(int argc, char **argv)
 
	while (!exit_req && !uei_exited(&skel->bss->uei)) {
		printf("[SEQ %llu]\n", seq++);
-		printf(" total:%10lu dispatch:%10lu missing:%10lu\n",
+		printf(" total:%10" PRIu64 " dispatch:%10" PRIu64 " missing:%10" PRIu64 "\n",
		       skel->bss->nr_total,
		       skel->bss->nr_dispatched,
		       skel->bss->nr_missing);
-		printf(" kicks:%10lu preemptions:%7lu\n",
+		printf(" kicks:%10" PRIu64 " preemptions:%7" PRIu64 "\n",
		       skel->bss->nr_kicks,
		       skel->bss->nr_preemptions);
-		printf(" exp:%10lu exp_wait:%10lu exp_empty:%10lu\n",
+		printf(" exp:%10" PRIu64 " exp_wait:%10" PRIu64 " exp_empty:%10" PRIu64 "\n",
		       skel->bss->nr_exps,
		       skel->bss->nr_exp_waits,
		       skel->bss->nr_exp_empty);
-		printf("cgnext:%10lu cgcoll:%10lu cgempty:%10lu\n",
+		printf("cgnext:%10" PRIu64 " cgcoll:%10" PRIu64 " cgempty:%10" PRIu64 "\n",
		       skel->bss->nr_cgrp_next,
		       skel->bss->nr_cgrp_coll,
		       skel->bss->nr_cgrp_empty);
