@@ -110,13 +110,17 @@ struct {
 struct {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
 	__type(key, u32);
-	__type(value, u32);
+	__type(value, struct hint_layer_info);
 	__uint(map_flags, 0);
 	__uint(max_entries, 1025);
 } hint_to_layer_id_map SEC(".maps");
 
 const volatile bool task_hint_map_enabled;
 
+/* EWMA values updated from userspace */
+u64 system_cpu_util_ewma = 0;
+u64 layer_dsq_insert_ewma[MAX_LAYERS];
+
 static inline s32 prio_to_nice(s32 static_prio)
 {
 	/* See DEFAULT_PRIO and PRIO_TO_NICE in include/linux/sched/prio.h */
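Note: the new value type struct hint_layer_info is defined elsewhere in the tree and does not appear in this hunk. Based on the fields the rest of the patch dereferences (layer_id, system_cpu_util_below, dsq_insert_below), a rough sketch of its shape would be the following; the exact layout and any additional members are assumptions, with (u64)-1 acting as the "predicate disabled" sentinel.

/* Hypothetical sketch of the map value type; layout and field order are assumptions. */
struct hint_layer_info {
	u32 layer_id;			/* layer a hinted task should be placed in */
	u64 system_cpu_util_below;	/* admit only while the system CPU util EWMA is below this; (u64)-1 disables */
	u64 dsq_insert_below;		/* admit only while the layer's DSQ-insert EWMA is below this; (u64)-1 disables */
};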
@@ -650,57 +654,79 @@ static struct task_hint *lookup_task_hint(struct task_struct *p)
 	return hint;
 }
 
-static int lookup_task_hint_layer_id(struct task_struct *p) {
+static struct hint_layer_info *lookup_task_hint_layer_id(struct task_struct *p) {
 	struct task_hint *hint;
-	u32 *layer_idp;
 	u32 hint_val;
 
 	hint = lookup_task_hint(p);
 	if (!hint)
-		return -ENOENT;
+		return NULL;
 
 	hint_val = hint->hint;
-	layer_idp = bpf_map_lookup_elem(&hint_to_layer_id_map, &hint_val);
-	if (!layer_idp)
-		return -EFAULT;
-	return *layer_idp;
+	return bpf_map_lookup_elem(&hint_to_layer_id_map, &hint_val);
 }
 
 static void switch_to_layer(struct task_struct *, struct task_ctx *, u64 layer_id, u64 now);
 
-static int lookup_task_layer_from_hint(struct task_struct *p, struct task_ctx *taskc)
+static bool is_task_layer_hint_stale(struct task_struct *p, struct task_ctx *taskc)
 {
-	u64 layer_id;
-	int ret;
+	struct hint_layer_info *info;
 
-	ret = lookup_task_hint_layer_id(p);
-	if (ret < 0)
-		return ret;
-	layer_id = ret;
+	if (!task_hint_map_enabled)
+		return false;
 
-	if (taskc->layer_id == layer_id)
-		return -EAGAIN;
-	/*
-	 * If the existing layer does not match the one corresponding to the
-	 * hint, we must switch the layers, return the new layer_id.
-	 */
-	return layer_id;
+	info = lookup_task_hint_layer_id(p);
+	if (!info)
+		return false;
+	return taskc->layer_id != info->layer_id;
 }
 
-static void maybe_refresh_task_layer_from_hint(struct task_struct *p)
+static void maybe_refresh_task_layer_from_hint(struct task_struct *p, struct task_ctx *taskc)
 {
-	struct task_ctx *taskc;
-	u64 layer_id;
-	int ret;
+	struct hint_layer_info *info;
+	bool switch_layer = false;
 
-	if (!(taskc = lookup_task_ctx_may_fail(p)))
+	if (!task_hint_map_enabled)
 		return;
 
-	ret = lookup_task_layer_from_hint(p, taskc);
-	if (ret < 0)
+	/* We are already going to refresh this task, skip. */
+	if (taskc->refresh_layer)
+		return;
+
+	info = lookup_task_hint_layer_id(p);
+	if (!info)
 		return;
-	layer_id = ret;
-	switch_to_layer(p, taskc, layer_id, scx_bpf_now());
+
+	/*
+	 * Check whether the task's layer matches the HintEquals specification,
+	 * additionally qualified by the predicates gating layer admission.
+	 */
+	if (taskc->layer_id != info->layer_id)
+		switch_layer = true;
+
+	if (info->system_cpu_util_below != (u64)-1) {
+		if (system_cpu_util_ewma >= info->system_cpu_util_below) {
+			/*
+			 * Refresh the task so that it gets evicted. This only needs
+			 * to be done for tasks already in the layer; incoming tasks
+			 * are simply denied admission.
+			 */
+			taskc->refresh_layer = switch_layer == false;
+			switch_layer = false;
+		}
+	}
+
+	if (info->dsq_insert_below != (u64)-1 && taskc->layer_id < MAX_LAYERS) {
+		if (layer_dsq_insert_ewma[taskc->layer_id] >= info->dsq_insert_below) {
+			/* Same idea as above. */
+			taskc->refresh_layer = switch_layer == false;
+			switch_layer = false;
+		}
+	}
+
+	/* All conditions satisfied for layer matching. */
+	if (switch_layer)
+		switch_to_layer(p, taskc, info->layer_id, scx_bpf_now());
 }
 
 int save_gpu_tgid_pid(void) {
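The BPF side only reads system_cpu_util_ewma and layer_dsq_insert_ewma[]; how userspace computes them is not part of this diff. A minimal fixed-point EWMA step that a userspace stats loop might apply before writing the result into these globals could look like the sketch below. The smoothing factor, scale, and update cadence are assumptions, not taken from the patch, and __u64 stands in for the usual 64-bit unsigned typedef.

/* Illustrative fixed-point EWMA step for the userspace stats loop (not part of this patch). */
#include <linux/types.h>

#define EWMA_ALPHA_PCT 25	/* assumed weight of the newest sample, in percent */

static __u64 ewma_step(__u64 prev, __u64 sample)
{
	/* next = alpha * sample + (1 - alpha) * prev, computed in integer math */
	return (EWMA_ALPHA_PCT * sample + (100 - EWMA_ALPHA_PCT) * prev) / 100;
}

/* e.g. system_cpu_util_ewma = ewma_step(system_cpu_util_ewma, latest_cpu_util); */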
@@ -724,8 +750,8 @@ int save_gpu_tgid_pid(void) {
 			taskc->refresh_layer = true;
 		}
 
-		/*
-		 * GPU kprobe fire has expired for the member.
+		/*
+		 * GPU kprobe fire has expired for the member.
 		 * Force a recheck to see if we should put it
 		 * back into the GPU layer.
 		 */
@@ -734,7 +760,7 @@ int save_gpu_tgid_pid(void) {
 
 		timestamp = taskc->running_at;
 	}
-
+
 	/* Same logic for the parent. */
 	if ((parent = lookup_task_ctx_may_fail(p->parent))) {
 		if (!bpf_map_lookup_elem(&gpu_tgid, &pid)) {
@@ -1343,11 +1369,11 @@ s32 BPF_STRUCT_OPS(layered_select_cpu, struct task_struct *p, s32 prev_cpu, u64
 	s32 cpu;
 
 	maybe_refresh_layer_cpumasks();
-	maybe_refresh_task_layer_from_hint(p);
 
 	if (!(cpuc = lookup_cpu_ctx(-1)) || !(taskc = lookup_task_ctx(p)))
 		return prev_cpu;
 
+	maybe_refresh_task_layer_from_hint(p, taskc);
 	/*
 	 * We usually update the layer in layered_runnable() to avoid confusion.
 	 * As layered_select_cpu() takes place before runnable, new tasks would
@@ -1545,13 +1571,14 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 	u64 *lstats;
 
 	maybe_refresh_layer_cpumasks();
-	/* Only invoke if we never went through select_cpu path. */
-	if (!__COMPAT_is_enq_cpu_selected(enq_flags))
-		maybe_refresh_task_layer_from_hint(p);
 
 	if (!(cpuc = lookup_cpu_ctx(-1)) || !(taskc = lookup_task_ctx(p)))
 		return;
 
+	/* Only invoke if we never went through select_cpu path. */
+	if (!__COMPAT_is_enq_cpu_selected(enq_flags))
+		maybe_refresh_task_layer_from_hint(p, taskc);
+
 	layer_id = taskc->layer_id;
 	if (!(layer = lookup_layer(layer_id)))
 		return;
@@ -1803,14 +1830,14 @@ static inline void check_member_expired(struct task_ctx *taskc, u64 now)
 {
 	u64 recheck = taskc->recheck_layer_membership;
 
-	/*
+	/*
 	 * Don't trigger a recheck if:
 	 * - Membership never expires
 	 * - Membership already expired
 	 * - Member has been tested and hasn't joined any GPU layer
 	 *   even though it matches the GPU rule.
 	 */
-	if (recheck == MEMBER_NOEXPIRE ||
+	if (recheck == MEMBER_NOEXPIRE ||
 	    recheck == MEMBER_EXPIRED ||
 	    recheck == MEMBER_CANTMATCH)
 		return;
@@ -2247,7 +2274,7 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 		 * Do not refresh the slice in case we need the task to be reenqueued
 		 * for layer membership change and subsequent CPU selection.
 		 */
-		if (lookup_task_layer_from_hint(prev, prev_taskc) >= 0)
+		if (is_task_layer_hint_stale(prev, prev_taskc))
 			return;
 		prev->scx.slice = prev_layer->slice_ns;
 		return;
@@ -2391,7 +2418,7 @@ void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
 	 * Do not refresh the slice in case we need the task to be reenqueued
 	 * for layer membership change and subsequent CPU selection.
 	 */
-	if (prev_taskc && prev_layer && lookup_task_layer_from_hint(prev, prev_taskc) < 0)
+	if (prev_taskc && prev_layer && !is_task_layer_hint_stale(prev, prev_taskc))
 		prev->scx.slice = prev_layer->slice_ns;
 }
 
@@ -2538,13 +2565,13 @@ static __noinline bool match_one(struct layer *layer, struct layer_match *match,
 	if (!last_used)
 		return !must_be_used;
 
-	/*
-	 * If the last kprobe fire was more than member_expire_ms ago, timestamp is stale.
+	/*
+	 * If the last kprobe fire was more than member_expire_ms ago, timestamp is stale.
 	 * Mark us as expired to trigger a rematch if we fire off any GPU kprobes.
 	 */
 	recently_used = true;
-	if (taskc && taskc->recheck_layer_membership != MEMBER_NOEXPIRE &&
-	    taskc->recheck_layer_membership != MEMBER_CANTMATCH &&
+	if (taskc && taskc->recheck_layer_membership != MEMBER_NOEXPIRE &&
+	    taskc->recheck_layer_membership != MEMBER_CANTMATCH &&
 	    (*last_used) + layer->member_expire_ms * 1000 * 1000 < now) {
 
 		recently_used = false;
@@ -2578,6 +2605,50 @@ static __noinline bool match_one(struct layer *layer, struct layer_match *match,
 			return false;
 		return match->hint == hint->hint;
 	}
+	case MATCH_SYSTEM_CPU_UTIL_BELOW: {
+		struct task_hint *hint;
+		struct hint_layer_info *info;
+		u32 hint_val;
+
+		hint = lookup_task_hint(p);
+		if (!hint)
+			return false;
+
+		hint_val = hint->hint;
+		info = bpf_map_lookup_elem(&hint_to_layer_id_map, &hint_val);
+		if (!info)
+			return false;
+
+		/* Check if this matcher is enabled for this hint */
+		if (info->system_cpu_util_below == (u64)-1)
+			return false;
+
+		return system_cpu_util_ewma < info->system_cpu_util_below;
+	}
+	case MATCH_DSQ_INSERT_BELOW: {
+		struct task_hint *hint;
+		struct hint_layer_info *info;
+		u32 hint_val;
+
+		hint = lookup_task_hint(p);
+		if (!hint)
+			return false;
+
+		hint_val = hint->hint;
+		info = bpf_map_lookup_elem(&hint_to_layer_id_map, &hint_val);
+		if (!info)
+			return false;
+
+		/* Check if this matcher is enabled for this hint */
+		if (info->dsq_insert_below == (u64)-1)
+			return false;
+
+		/* Check per-layer DSQ insertion ratio */
+		if (layer->id >= MAX_LAYERS)
+			return false;
+
+		return layer_dsq_insert_ewma[layer->id] < info->dsq_insert_below;
+	}
 
 	default:
 		scx_bpf_error("invalid match kind %d", match->kind);
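Both new match kinds bail out unless the task's hint resolves to an entry in hint_to_layer_id_map, so userspace has to seed that array with hint_layer_info records. A hedged sketch of one such update using libbpf's bpf_map_update_elem() follows; the map fd acquisition, threshold values, and hint number are purely illustrative, and the shared struct hint_layer_info definition is assumed to be visible.

/* Illustrative userspace seeding of hint_to_layer_id_map (not part of this patch). */
#include <bpf/bpf.h>		/* bpf_map_update_elem() */

static void seed_hint_entry(int map_fd)	/* fd of hint_to_layer_id_map, obtained after load */
{
	struct hint_layer_info info = {
		.layer_id = 3,				/* example layer index */
		.system_cpu_util_below = 700,		/* example threshold in the EWMA's fixed-point scale */
		.dsq_insert_below = (__u64)-1,		/* (u64)-1 leaves this predicate disabled */
	};
	__u32 hint_val = 42;				/* example hint; must be < max_entries (1025) */

	/* error handling omitted for brevity */
	bpf_map_update_elem(map_fd, &hint_val, &info, BPF_ANY);
}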
@@ -2634,7 +2705,7 @@ int match_layer(u32 layer_id, struct task_ctx *taskc,
 		matched_gpu = true;
 	}
 
-	/*
+	/*
 	 * Matched GPU rule but not the rest. That means we'll never match them,
 	 * and should mark ourselves as such to avoid being forced to rematch
 	 * every time we touch the GPU.
@@ -2681,7 +2752,7 @@ static void switch_to_layer(struct task_struct *p, struct task_ctx *taskc, u64 l
 	taskc->layered_cpus_llc.seq = -1;
 	taskc->layered_cpus_node.seq = -1;
 
-	if (taskc->recheck_layer_membership != MEMBER_EXPIRED &&
+	if (taskc->recheck_layer_membership != MEMBER_EXPIRED &&
 	    taskc->recheck_layer_membership != MEMBER_CANTMATCH) {
 
 		/* Default to MEMBER_NOEXPIRE unless member_expire_ms is set. */
@@ -3795,6 +3866,12 @@ static s32 init_layer(int layer_id)
 	case MATCH_HINT_EQUALS:
 		dbg("%s HINT_EQUALS %d", header, match->hint);
 		break;
+	case MATCH_SYSTEM_CPU_UTIL_BELOW:
+		dbg("%s SYSTEM_CPU_UTIL_BELOW %llu", header, match->system_cpu_util_below);
+		break;
+	case MATCH_DSQ_INSERT_BELOW:
+		dbg("%s DSQ_INSERT_BELOW %llu", header, match->dsq_insert_below);
+		break;
 	default:
 		scx_bpf_error("%s Invalid kind", header);
 		return -EINVAL;