Commit cce0b9f

Merge pull request #2891 from kkdwivedi/layered/switch
Dynamically gate layer admission for HintEquals
2 parents 8314159 + 15a9b1c commit cce0b9f
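
The merged change makes HintEquals-based layer admission conditional on two load signals: a system-wide CPU utilization EWMA and a per-layer DSQ insertion EWMA, both compared in a ratio * 10000 fixed-point form where u64::MAX disables a gate. A condensed Rust sketch of the admission test the BPF diff below implements (the function name and signature are illustrative, not from the commit):

fn admit(hint_matches_layer: bool,
         util_ewma: u64, util_below: u64,
         dsq_ewma: u64, dsq_below: u64) -> bool {
    // Every enabled gate must hold; u64::MAX marks a disabled gate.
    hint_matches_layer
        && (util_below == u64::MAX || util_ewma < util_below)
        && (dsq_below == u64::MAX || dsq_ewma < dsq_below)
}

fn main() {
    // CPU gate at 50% (5000): a 62% system EWMA blocks admission,
    // a 40% EWMA lets the task switch into the hinted layer.
    assert!(!admit(true, 6200, 5000, 0, u64::MAX));
    assert!(admit(true, 4000, 5000, 0, u64::MAX));
}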

File tree: 5 files changed, +322 −70 lines changed
scheds/rust/scx_layered/src/bpf/intf.h

Lines changed: 10 additions & 0 deletions
@@ -272,6 +272,8 @@ enum layer_match_kind {
 	MATCH_CGROUP_CONTAINS,
 	MATCH_CGROUP_REGEX,
 	MATCH_HINT_EQUALS,
+	MATCH_SYSTEM_CPU_UTIL_BELOW,
+	MATCH_DSQ_INSERT_BELOW,
 
 	NR_LAYER_MATCH_KINDS,
 };
@@ -299,6 +301,8 @@ struct layer_match {
 	u64 min_avg_runtime_us;
 	u64 max_avg_runtime_us;
 	u64 hint;
+	u64 system_cpu_util_below;	/* ratio * 10000 */
+	u64 dsq_insert_below;		/* ratio * 10000 */
 };
 
 struct layer_match_ands {
@@ -384,4 +388,10 @@ struct scx_cmd {
 	u8 cmd[SCXCMD_COMLEN];
 } __attribute__((packed));
 
+struct hint_layer_info {
+	u32 layer_id;
+	u64 system_cpu_util_below;	/* ratio * 10000, u64::MAX = disabled */
+	u64 dsq_insert_below;		/* ratio * 10000, u64::MAX = disabled */
+};
+
 #endif /* __INTF_H */
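
Both thresholds follow the fixed-point convention in the comments above: a ratio in [0.0, 1.0] scaled by 10000, with u64::MAX encoding a disabled predicate. A minimal Rust sketch of the userspace-side conversion (the helper name ratio_to_fixed is an assumption for illustration):

fn ratio_to_fixed(ratio: Option<f64>) -> u64 {
    match ratio {
        // Clamp to [0.0, 1.0], then scale into the shared fixed-point range.
        Some(r) => (r.clamp(0.0, 1.0) * 10000.0) as u64,
        // A disabled predicate is encoded as u64::MAX.
        None => u64::MAX,
    }
}

fn main() {
    assert_eq!(ratio_to_fixed(Some(0.5)), 5000); // gate at 50% utilization
    assert_eq!(ratio_to_fixed(None), u64::MAX);  // gate disabled
}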

scheds/rust/scx_layered/src/bpf/main.bpf.c

Lines changed: 125 additions & 48 deletions
@@ -110,13 +110,17 @@ struct {
 struct {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
 	__type(key, u32);
-	__type(value, u32);
+	__type(value, struct hint_layer_info);
 	__uint(map_flags, 0);
 	__uint(max_entries, 1025);
 } hint_to_layer_id_map SEC(".maps");
 
 const volatile bool task_hint_map_enabled;
 
+/* EWMA value updated from userspace */
+u64 system_cpu_util_ewma = 0;
+u64 layer_dsq_insert_ewma[MAX_LAYERS];
+
 static inline s32 prio_to_nice(s32 static_prio)
 {
 	/* See DEFAULT_PRIO and PRIO_TO_NICE in include/linux/sched/prio.h */
@@ -650,57 +654,79 @@ static struct task_hint *lookup_task_hint(struct task_struct *p)
 	return hint;
 }
 
-static int lookup_task_hint_layer_id(struct task_struct *p) {
+static struct hint_layer_info *lookup_task_hint_layer_id(struct task_struct *p) {
 	struct task_hint *hint;
-	u32 *layer_idp;
 	u32 hint_val;
 
 	hint = lookup_task_hint(p);
 	if (!hint)
-		return -ENOENT;
+		return NULL;
 
 	hint_val = hint->hint;
-	layer_idp = bpf_map_lookup_elem(&hint_to_layer_id_map, &hint_val);
-	if (!layer_idp)
-		return -EFAULT;
-	return *layer_idp;
+	return bpf_map_lookup_elem(&hint_to_layer_id_map, &hint_val);
 }
 
 static void switch_to_layer(struct task_struct *, struct task_ctx *, u64 layer_id, u64 now);
 
-static int lookup_task_layer_from_hint(struct task_struct *p, struct task_ctx *taskc)
+static bool is_task_layer_hint_stale(struct task_struct *p, struct task_ctx *taskc)
 {
-	u64 layer_id;
-	int ret;
+	struct hint_layer_info *info;
 
-	ret = lookup_task_hint_layer_id(p);
-	if (ret < 0)
-		return ret;
-	layer_id = ret;
+	if (!task_hint_map_enabled)
+		return false;
 
-	if (taskc->layer_id == layer_id)
-		return -EAGAIN;
-	/*
-	 * If the existing layer does not match the one corresponding to the
-	 * hint, we must switch the layers, return the new layer_id.
-	 */
-	return layer_id;
+	info = lookup_task_hint_layer_id(p);
+	if (!info)
+		return false;
+	return taskc->layer_id != info->layer_id;
 }
 
-static void maybe_refresh_task_layer_from_hint(struct task_struct *p)
+static void maybe_refresh_task_layer_from_hint(struct task_struct *p, struct task_ctx *taskc)
 {
-	struct task_ctx *taskc;
-	u64 layer_id;
-	int ret;
+	struct hint_layer_info *info;
+	bool switch_layer = false;
 
-	if (!(taskc = lookup_task_ctx_may_fail(p)))
+	if (!task_hint_map_enabled)
 		return;
 
-	ret = lookup_task_layer_from_hint(p, taskc);
-	if (ret < 0)
+	/* We are already going to refresh this task, skip. */
+	if (taskc->refresh_layer)
+		return;
+
+	info = lookup_task_hint_layer_id(p);
+	if (!info)
 		return;
-	layer_id = ret;
-	switch_to_layer(p, taskc, layer_id, scx_bpf_now());
+
+	/*
+	 * We need to check whether the task layer matches the HintEquals specification,
+	 * additionally qualified by predicates gating layer admission.
	 */
+	if (taskc->layer_id != info->layer_id)
+		switch_layer = true;
+
+	if (info->system_cpu_util_below != (u64)-1) {
+		if (system_cpu_util_ewma >= info->system_cpu_util_below) {
+			/*
+			 * Refresh task so that it gets evicted out. It only needs
+			 * to be done for tasks in the current layer, for incoming
+			 * tasks we just reject admission.
+			 */
+			taskc->refresh_layer = switch_layer == false;
+			switch_layer = false;
+		}
+	}
+
+	if (info->dsq_insert_below != (u64)-1 && taskc->layer_id < MAX_LAYERS) {
+		if (layer_dsq_insert_ewma[taskc->layer_id] >= info->dsq_insert_below) {
+			/* Same idea as above. */
+			taskc->refresh_layer = switch_layer == false;
+			switch_layer = false;
+		}
+	}
+
+	/* All conditions satisfied for layer matching. */
+	if (switch_layer)
+		switch_to_layer(p, taskc, info->layer_id, scx_bpf_now());
 }
 
 int save_gpu_tgid_pid(void) {
@@ -724,8 +750,8 @@ int save_gpu_tgid_pid(void) {
 			taskc->refresh_layer = true;
 		}
 
-		/*
-		 * GPU kprobe fire has expired for the member.
+		/*
+		 * GPU kprobe fire has expired for the member.
 		 * Force a recheck to see if we should put it
 		 * back into the GPU layer.
 		 */
@@ -734,7 +760,7 @@ int save_gpu_tgid_pid(void) {
 
 		timestamp = taskc->running_at;
 	}
-
+
 	/* Same logic for the parent. */
 	if ((parent = lookup_task_ctx_may_fail(p->parent))) {
 		if(!bpf_map_lookup_elem(&gpu_tgid, &pid)) {
@@ -1343,11 +1369,11 @@ s32 BPF_STRUCT_OPS(layered_select_cpu, struct task_struct *p, s32 prev_cpu, u64
 	s32 cpu;
 
 	maybe_refresh_layer_cpumasks();
-	maybe_refresh_task_layer_from_hint(p);
 
 	if (!(cpuc = lookup_cpu_ctx(-1)) || !(taskc = lookup_task_ctx(p)))
 		return prev_cpu;
 
+	maybe_refresh_task_layer_from_hint(p, taskc);
 	/*
 	 * We usually update the layer in layered_runnable() to avoid confusion.
 	 * As layered_select_cpu() takes place before runnable, new tasks would
@@ -1545,13 +1571,14 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 	u64 *lstats;
 
 	maybe_refresh_layer_cpumasks();
-	/* Only invoke if we never went through select_cpu path. */
-	if (!__COMPAT_is_enq_cpu_selected(enq_flags))
-		maybe_refresh_task_layer_from_hint(p);
 
 	if (!(cpuc = lookup_cpu_ctx(-1)) || !(taskc = lookup_task_ctx(p)))
 		return;
 
+	/* Only invoke if we never went through select_cpu path. */
+	if (!__COMPAT_is_enq_cpu_selected(enq_flags))
+		maybe_refresh_task_layer_from_hint(p, taskc);
+
 	layer_id = taskc->layer_id;
 	if (!(layer = lookup_layer(layer_id)))
 		return;
@@ -1803,14 +1830,14 @@ static inline void check_member_expired(struct task_ctx *taskc, u64 now)
 {
 	u64 recheck = taskc->recheck_layer_membership;
 
-	/*
+	/*
 	 * Don't trigger a recheck if:
 	 * - Membership never expires
 	 * - Membership already expired
 	 * - Member has been tested and hasn't joined any GPU layer
 	 *   even though it matches the GPU rule.
 	 */
-	if (recheck == MEMBER_NOEXPIRE ||
+	if (recheck == MEMBER_NOEXPIRE ||
 	    recheck == MEMBER_EXPIRED ||
 	    recheck == MEMBER_CANTMATCH)
 		return;
@@ -2247,7 +2274,7 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 		 * Do not refresh the slice in case we need the task to be reenqueued
 		 * for layer membership change and subsequent CPU selection.
 		 */
-		if (lookup_task_layer_from_hint(prev, prev_taskc) >= 0)
+		if (is_task_layer_hint_stale(prev, prev_taskc))
 			return;
 		prev->scx.slice = prev_layer->slice_ns;
 		return;
@@ -2391,7 +2418,7 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 	 * Do not refresh the slice in case we need the task to be reenqueued
 	 * for layer membership change and subsequent CPU selection.
 	 */
-	if (prev_taskc && prev_layer && lookup_task_layer_from_hint(prev, prev_taskc) < 0)
+	if (prev_taskc && prev_layer && !is_task_layer_hint_stale(prev, prev_taskc))
 		prev->scx.slice = prev_layer->slice_ns;
 }
 
@@ -2538,13 +2565,13 @@ static __noinline bool match_one(struct layer *layer, struct layer_match *match,
 		if (!last_used)
 			return !must_be_used;
 
-		/*
-		 * If the last kprobe fire was more than member_expire_ms ago, timestamp is stale.
+		/*
+		 * If the last kprobe fire was more than member_expire_ms ago, timestamp is stale.
 		 * Mark us as expired to trigger a rematch if we fire off any GPU kprobes.
 		 */
 		recently_used = true;
-		if (taskc && taskc->recheck_layer_membership != MEMBER_NOEXPIRE &&
-		    taskc->recheck_layer_membership != MEMBER_CANTMATCH &&
+		if (taskc && taskc->recheck_layer_membership != MEMBER_NOEXPIRE &&
+		    taskc->recheck_layer_membership != MEMBER_CANTMATCH &&
 		    (*last_used) + layer->member_expire_ms * 1000 * 1000 < now) {
 
 			recently_used = false;
@@ -2578,6 +2605,50 @@ static __noinline bool match_one(struct layer *layer, struct layer_match *match,
 			return false;
 		return match->hint == hint->hint;
 	}
+	case MATCH_SYSTEM_CPU_UTIL_BELOW: {
+		struct task_hint *hint;
+		struct hint_layer_info *info;
+		u32 hint_val;
+
+		hint = lookup_task_hint(p);
+		if (!hint)
+			return false;
+
+		hint_val = hint->hint;
+		info = bpf_map_lookup_elem(&hint_to_layer_id_map, &hint_val);
+		if (!info)
+			return false;
+
+		/* Check if this matcher is enabled for this hint */
+		if (info->system_cpu_util_below == (u64)-1)
+			return false;
+
+		return system_cpu_util_ewma < info->system_cpu_util_below;
+	}
+	case MATCH_DSQ_INSERT_BELOW: {
+		struct task_hint *hint;
+		struct hint_layer_info *info;
+		u32 hint_val;
+
+		hint = lookup_task_hint(p);
+		if (!hint)
+			return false;
+
+		hint_val = hint->hint;
+		info = bpf_map_lookup_elem(&hint_to_layer_id_map, &hint_val);
+		if (!info)
+			return false;
+
+		/* Check if this matcher is enabled for this hint */
+		if (info->dsq_insert_below == (u64)-1)
+			return false;
+
+		/* Check per-layer DSQ insertion ratio */
+		if (layer->id >= MAX_LAYERS)
+			return false;
+
+		return layer_dsq_insert_ewma[layer->id] < info->dsq_insert_below;
+	}
 
 	default:
 		scx_bpf_error("invalid match kind %d", match->kind);
@@ -2634,7 +2705,7 @@ int match_layer(u32 layer_id, struct task_ctx *taskc,
 		matched_gpu = true;
 	}
 
-	/*
+	/*
 	 * Matched GPU rule but not the rest. That means we'll never match them,
 	 * and should mark ourselves as such to avoid being forced to rematch
 	 * every time we touch the GPU.
@@ -2681,7 +2752,7 @@ static void switch_to_layer(struct task_struct *p, struct task_ctx *taskc, u64 l
 	taskc->layered_cpus_llc.seq = -1;
 	taskc->layered_cpus_node.seq = -1;
 
-	if (taskc->recheck_layer_membership != MEMBER_EXPIRED &&
+	if (taskc->recheck_layer_membership != MEMBER_EXPIRED &&
 	    taskc->recheck_layer_membership != MEMBER_CANTMATCH) {
 
 		/* Default to MEMBER_NOEXPIRE unless member_expire_ms is set. */
@@ -3795,6 +3866,12 @@ static s32 init_layer(int layer_id)
 		case MATCH_HINT_EQUALS:
 			dbg("%s HINT_EQUALS %d", header, match->hint);
 			break;
+		case MATCH_SYSTEM_CPU_UTIL_BELOW:
+			dbg("%s SYSTEM_CPU_UTIL_BELOW %llu", header, match->system_cpu_util_below);
+			break;
+		case MATCH_DSQ_INSERT_BELOW:
+			dbg("%s DSQ_INSERT_BELOW %llu", header, match->dsq_insert_below);
+			break;
 		default:
 			scx_bpf_error("%s Invalid kind", header);
 			return -EINVAL;
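
The BPF side only reads system_cpu_util_ewma and layer_dsq_insert_ewma; per the comment in the first hunk, the smoothing itself runs in userspace. A plausible fixed-point EWMA update in the same ratio * 10000 convention (the helper name and smoothing weight are assumptions, not code from this commit):

/// Blend a new sample into the running average; `alpha_x10000` is the
/// fixed-point weight given to the sample, e.g. 2500 = 0.25.
fn ewma_update(prev: u64, sample: u64, alpha_x10000: u64) -> u64 {
    (alpha_x10000 * sample + (10000 - alpha_x10000) * prev) / 10000
}

fn main() {
    // A 90% utilization sample pulls a 30% average up to 45%.
    let util = ewma_update(3000, 9000, 2500);
    assert_eq!(util, 4500); // 0.25 * 9000 + 0.75 * 3000
}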

scheds/rust/scx_layered/src/config.rs

Lines changed: 2 additions & 0 deletions
@@ -98,6 +98,8 @@ pub enum LayerMatch {
     UsedGpuPid(bool),
     AvgRuntime(u64, u64),
     HintEquals(u64),
+    SystemCpuUtilBelow(f64),
+    DsqInsertBelow(f64),
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize)]
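
With the two new variants, a layer spec can AND a hint match together with load gates, so admission is rejected whenever either EWMA sits at or above its threshold. A self-contained sketch (the enum is mirrored and trimmed here so the example compiles on its own; the real type lives in config.rs):

// Mirrors the LayerMatch variants used in this example.
#[derive(Clone, Debug)]
enum LayerMatch {
    HintEquals(u64),
    SystemCpuUtilBelow(f64),
    DsqInsertBelow(f64),
}

fn main() {
    // One AND group: tasks hinting 7 join the layer only while system
    // CPU utilization is below 50% and the layer's DSQ insertion
    // ratio is below 30%.
    let ands = vec![
        LayerMatch::HintEquals(7),
        LayerMatch::SystemCpuUtilBelow(0.5),
        LayerMatch::DsqInsertBelow(0.3),
    ];
    println!("{ands:?}");
}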
