3232 scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \
3333 scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)))
3434
35+ /*
36+ * The following defines are from 'linux/include/uapi/linux/futex.h'
37+ */
38+ #define FUTEX_WAIT 0
39+ #define FUTEX_WAKE 1
40+ #define FUTEX_FD 2
41+ #define FUTEX_REQUEUE 3
42+ #define FUTEX_CMP_REQUEUE 4
43+ #define FUTEX_WAKE_OP 5
44+ #define FUTEX_LOCK_PI 6
45+ #define FUTEX_UNLOCK_PI 7
46+ #define FUTEX_TRYLOCK_PI 8
47+ #define FUTEX_WAIT_BITSET 9
48+ #define FUTEX_WAKE_BITSET 10
49+ #define FUTEX_WAIT_REQUEUE_PI 11
50+ #define FUTEX_CMP_REQUEUE_PI 12
51+ #define FUTEX_LOCK_PI2 13
52+
/* Flag bits OR'd into the futex op argument. FUTEX_CMD_MASK strips them so
 * the raw command can be compared against the FUTEX_* values above (see
 * rtp_sys_enter_futex). */
53+ #define FUTEX_PRIVATE_FLAG 128
54+ #define FUTEX_CLOCK_REALTIME 256
55+ #define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
56+
/*
 * Argument layout for the syscalls:sys_enter_futex tracepoint.
 * NOTE(review): the field order/types must match the tracepoint's format
 * file (/sys/kernel/tracing/events/syscalls/sys_enter_futex/format) --
 * confirm against the target kernel.
 */
57+ struct tp_syscall_enter_futex {
58+ struct trace_entry ent ;
59+ int __syscall_nr ;
60+ u32 __attribute__((btf_type_tag ("user" ))) * uaddr ; // userspace futex word
61+ int op ; // FUTEX_* command, possibly OR'd with flag bits
62+ u32 val ;
63+ struct __kernel_timespec __attribute__((btf_type_tag ("user" ))) * utime ;
64+ u32 __attribute__((btf_type_tag ("user" ))) * uaddr2 ;
65+ u32 val3 ;
66+ };
67+
3568const volatile int ppid_targeting_ppid = 1 ;
3669const volatile bool ppid_targeting_inclusive = false; /* include ppid_targeting_ppid in chaos */
3770
@@ -52,6 +85,11 @@ const volatile u64 degradation_frac7 = 0;
5285
5386const volatile u32 kprobe_delays_freq_frac32 = 1 ;
5487
/*
 * Futex-delay tunables; const volatile so userspace can patch them before
 * the BPF object is loaded. If all three are zero the sys_enter_futex
 * tracepoint no-ops (see rtp_sys_enter_futex's early return).
 */
88+ const volatile u64 futex_uncontended_delay_ns = 1 ;
89+ const volatile u64 futex_contended_delay_min_ns = 1 ;
90+ const volatile u64 futex_contended_delay_max_ns = 1 ;
91+
92+
5593#define MIN (x , y ) ((x) < (y) ? (x) : (y))
5694#define MAX (x , y ) ((x) > (y) ? (x) : (y))
5795
@@ -81,6 +119,30 @@ struct {
81119 __type (value , u64 );
82120} chaos_stats SEC (".maps" );
83121
/* Hash key identifying one futex word within one process. */
122+ struct chaos_futex_key {
123+ u32 tgid ; // thread group (process) id
124+ u64 uaddr ; // userspace address of the futex word
125+ };
126+
/*
 * Most recent delayed waiter seen on a futex. pid == -1 marks an empty
 * slot whose other fields are meaningless. Fields are snapshotted and
 * replaced under 'lock' in enqueue_futex_delay().
 */
127+ struct chaos_futex_waiter {
128+ struct bpf_spin_lock lock ;
129+ u64 timeout_key ; // vtime the waiter was enqueued with; used as the DSQ search key
130+ u32 pid ;
131+ s32 delay_dsq_cpu_idx ; // CPU index of the delay DSQ holding the waiter
132+ };
133+
/* (tgid, uaddr) -> latest delayed waiter on that futex word. */
134+ struct {
135+ __uint (type , BPF_MAP_TYPE_HASH );
136+ __uint (max_entries , 1024 * 1024 );
137+ __type (key , struct chaos_futex_key );
138+ __type (value , struct chaos_futex_waiter );
139+ } chaos_futex_waiters SEC (".maps" );
140+
140+
/* Build a 64-bit pseudo-random value from two 32-bit helper draws. */
141+ static __always_inline u64 chaos_get_prandom_u64 ()
142+ {
143+ return ((u64 )bpf_get_prandom_u32 () << 32 ) | bpf_get_prandom_u32 ();
144+ }
145+
84146struct chaos_task_ctx * lookup_create_chaos_task_ctx (struct task_struct * p )
85147{
86148 return bpf_task_storage_get (& chaos_task_ctxs , p , NULL , BPF_LOCAL_STORAGE_GET_F_CREATE );
@@ -114,9 +176,13 @@ static __always_inline enum chaos_trait_kind choose_chaos(struct chaos_task_ctx
114176
/*
 * Delay-style traits are applied at enqueue time, so select_cpu handling
 * is skipped for them; FUTEX_DELAYS now joins RANDOM_DELAYS in that set.
 */
115177static __always_inline bool chaos_trait_skips_select_cpu (struct chaos_task_ctx * taskc )
116178{
117- if (taskc -> next_trait == CHAOS_TRAIT_RANDOM_DELAYS )
179+ switch (taskc -> next_trait ) {
180+ case CHAOS_TRAIT_RANDOM_DELAYS :
181+ case CHAOS_TRAIT_FUTEX_DELAYS :
118182 return true;
119- return false;
183+ default :
184+ return false;
185+ }
120186}
121187
122188static __always_inline u64 get_cpu_delay_dsq (int cpu_idx )
@@ -252,14 +318,141 @@ static __always_inline s32 calculate_chaos_match(struct task_struct *p)
252318 return ret ;
253319}
254320
321+ // Traverse a DSQ to find the first element with a key with hideous complexity.
322+ // This is O(n) in DSQ members.
323+ //
324+ // To improve:
325+ // - Add this as a kfunc to the kernel where it can be O(log n)
326+ // - Use arena DSQs where we can get this behaviour in O(log n)
//
// On return *p is the first task whose dsq_vtime equals 'key', or NULL if
// none exists (the scan stops early once dsq_vtime > key, relying on the
// DSQ being vtime-ordered). The iterator is left initialized in either
// case; the caller must call bpf_iter_scx_dsq_destroy() on 'it'.
327+ static __always_inline
328+ void bpf_iter_scx_dsq_search (struct bpf_iter_scx_dsq * it ,
329+ struct task_struct * * p ,
330+ u64 dsq_id ,
331+ u64 flags ,
332+ u64 key )
333+ {
334+ bpf_iter_scx_dsq_new (it , dsq_id , flags );
335+
336+ while ((* p = bpf_iter_scx_dsq_next (it ))) {
337+ if ((* p )-> scx .dsq_vtime == key )
338+ return ;
339+
340+ if ((* p )-> scx .dsq_vtime > key )
341+ break ;
342+ }
343+
344+ * p = NULL ;
345+ }
346+
/*
 * Find the task with 'pid' queued at vtime 'key' on cpu_idx's delay DSQ
 * and re-queue it to run at 'new_vtime'. Multiple tasks can share the same
 * vtime, so after the search we keep scanning forward while the vtime
 * still matches. Returns true iff the task was found and moved.
 */
347+ static __always_inline bool update_delayed_task_vtime (s32 cpu_idx , u64 key ,
348+ u64 pid , u64 new_vtime )
349+ {
350+ u64 dsq_id = get_cpu_delay_dsq (cpu_idx );
351+ struct bpf_iter_scx_dsq it ;
352+ struct task_struct * p ;
353+ bool ret = false;
354+
355+ bpf_iter_scx_dsq_search (& it , & p , dsq_id , 0 , key );
356+ if (!p )
357+ goto out ;
358+
359+ while (p -> pid != pid && (p = bpf_iter_scx_dsq_next (& it )) && p -> scx .dsq_vtime == key ) {}
360+ if (!p || p -> pid != pid )
361+ goto out ;
362+
// NOTE(review): 'ret = true' is a dead store -- it is immediately
// overwritten by the scx_bpf_dsq_move_vtime() result on the next line.
363+ ret = true;
364+ scx_bpf_dsq_move_set_vtime (& it , new_vtime );
365+ ret = scx_bpf_dsq_move_vtime (& it , p , dsq_id , 0 );
366+
367+ out :
// Always destroy the iterator opened by bpf_iter_scx_dsq_search().
368+ bpf_iter_scx_dsq_destroy (& it );
369+ return ret ;
370+ }
371+
/*
 * CHAOS_TRAIT_FUTEX_DELAYS enqueue path. Inserts @p on the current CPU's
 * delay DSQ with a short "uncontended" delay and records it as the latest
 * waiter for (tgid, futex_uaddr) in chaos_futex_waiters. If that displaces
 * a previous waiter, the previous waiter is re-queued with a longer,
 * randomized "contended" delay. Returns true when the task was enqueued
 * chaotically (including error paths after scx_bpf_error, which return
 * false so the caller can fall through).
 */
372+ __weak s32 enqueue_futex_delay (struct task_struct * p __arg_trusted ,
373+ u64 enq_flags ,
374+ struct chaos_task_ctx * taskc __arg_nonnull )
375+ {
376+ s64 ret ;
377+ struct chaos_futex_key key ;
378+ struct chaos_futex_waiter * entry ;
379+ struct chaos_futex_waiter val ;
380+ u64 vtime , now ;
381+ s32 cpu ;
382+
383+ key .tgid = p -> tgid ;
384+ key .uaddr = taskc -> futex_uaddr ;
385+
386+ // First ensure an entry exists but in a largely empty state. We need the
387+ // spinlock to correctly interlock with the delay DSQ.
// NOTE(review): only val.pid is initialized before the BPF_NOEXIST
// insert; timeout_key/delay_dsq_cpu_idx are left as stack garbage.
// Readers never consume them while pid == -1, but confirm the verifier
// accepts the partially-initialized value.
388+ val .pid = -1 ;
389+
390+ ret = bpf_map_update_elem (& chaos_futex_waiters , & key , & val , BPF_NOEXIST );
// -EEXIST just means another waiter inserted first; that race is fine.
391+ if (ret && ret != - EEXIST ) {
392+ scx_bpf_error ("failed to create chaos_futex_waiter in runnable_futex_delays" );
393+ return false;
394+ }
395+
396+ // Get the real element. This might be an empty element that we inserted
397+ // or it might be an element filled with another PID. It doesn't matter
398+ // whether we inserted the element or somebody else did, this races.
399+ entry = (struct chaos_futex_waiter * )bpf_map_lookup_elem (& chaos_futex_waiters , & key );
400+ if (!entry ) {
401+ scx_bpf_error ("failed to lookup chaos_futex_waiter in runnable_futex_delays" );
402+ return false;
403+ }
404+
405+ // enqueue ourselves before entering the spinlock. critical sections
406+ // can't call kfuncs.
407+ now = bpf_ktime_get_ns ();
408+ cpu = bpf_get_smp_processor_id ();
409+
410+ chaos_stat_inc (CHAOS_STAT_TRAIT_FUTEX_DELAYS );
411+ scx_bpf_dsq_insert_vtime (p , get_cpu_delay_dsq (cpu ), 0 , now + futex_uncontended_delay_ns , enq_flags );
412+
413+ // critical sections can't call kfuncs which makes this very complicated.
414+ // we must have already enqueued ourselves, and we must then insert
415+ // ourselves in the hashmap. when we take a task out of the lock we
416+ // should attempt to re-queue it after. the task will not hit this path
417+ // again until it has been re-queued, thus this isn't racy - either we
418+ // will re-queue it, or it will run naturally when its delay expires.
419+ // This might mean it doesn't get quite enough delay, but no invariants
420+ // are broken.
421+ bpf_spin_lock (& entry -> lock );
422+
// Snapshot the previous waiter (if any) ...
423+ val .pid = entry -> pid ;
424+ val .timeout_key = entry -> timeout_key ;
425+ val .delay_dsq_cpu_idx = entry -> delay_dsq_cpu_idx ;
426+
427+ // enqueue ourselves and prepare the metadata for the next one to come along
428+ entry -> pid = p -> pid ;
429+ entry -> timeout_key = now + futex_uncontended_delay_ns ;
430+ entry -> delay_dsq_cpu_idx = cpu ;
431+
432+ bpf_spin_unlock (& entry -> lock );
433+
434+ // re-queue task that has a contender behind it
435+ if (val .pid != -1 ) {
// Contended delay drawn uniformly from [min_ns, max_ns).
436+ vtime = now + futex_contended_delay_min_ns ;
437+ if (futex_contended_delay_min_ns != futex_contended_delay_max_ns ) {
438+ vtime += chaos_get_prandom_u64 ()
439+ % (futex_contended_delay_max_ns - futex_contended_delay_min_ns );
440+ }
441+
442+ if (update_delayed_task_vtime (val .delay_dsq_cpu_idx , val .timeout_key , val .pid , vtime ))
443+ chaos_stat_inc (CHAOS_STAT_TRAIT_FUTEX_DELAYS_CONTENDED );
444+ }
445+
446+ return true;
447+ }
448+
/*
 * CHAOS_TRAIT_RANDOM_DELAYS enqueue path: delay @p on the shared (-1)
 * delay DSQ by a value drawn uniformly from [min_ns, max_ns). The inline
 * 64-bit random draw is replaced by the shared chaos_get_prandom_u64()
 * helper, now evaluated lazily only when min != max.
 */
255449__weak s32 enqueue_random_delay (struct task_struct * p __arg_trusted , u64 enq_flags ,
256450 struct chaos_task_ctx * taskc __arg_nonnull )
257451{
258- u64 rand64 = ((u64 )bpf_get_prandom_u32 () << 32 ) | bpf_get_prandom_u32 ();
259-
260452 u64 vtime = bpf_ktime_get_ns () + random_delays_min_ns ;
261453 if (random_delays_min_ns != random_delays_max_ns ) {
262- vtime += rand64 % (random_delays_max_ns - random_delays_min_ns );
454+ vtime += chaos_get_prandom_u64 ()
455+ % (random_delays_max_ns - random_delays_min_ns );
263456 }
264457
265458 scx_bpf_dsq_insert_vtime (p , get_cpu_delay_dsq (-1 ), 0 , vtime , enq_flags );
@@ -278,6 +471,10 @@ __weak s32 enqueue_chaotic(struct task_struct *p __arg_trusted, u64 enq_flags,
278471 out = enqueue_random_delay (p , enq_flags , taskc );
279472 break ;
280473
474+ case CHAOS_TRAIT_FUTEX_DELAYS :
475+ out = enqueue_futex_delay (p , enq_flags , taskc );
476+ break ;
477+
281478 case CHAOS_TRAIT_NONE :
282479 chaos_stat_inc (CHAOS_STAT_CHAOS_SKIPPED );
283480 out = false;
@@ -479,8 +676,7 @@ void BPF_STRUCT_OPS(chaos_enqueue, struct task_struct *p __arg_trusted, u64 enq_
479676 if (promise .kind == P2DQ_ENQUEUE_PROMISE_COMPLETE )
480677 return ;
481678
482- if (taskc -> next_trait == CHAOS_TRAIT_RANDOM_DELAYS &&
483- enqueue_chaotic (p , enq_flags , taskc ))
679+ if (enqueue_chaotic (p , enq_flags , taskc ))
484680 return ;
485681
486682 // NOTE: this may not work for affinitized tasks because p2dq does
@@ -582,6 +778,51 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(chaos_init_task, struct task_struct *p,
582778 return 0 ;
583779}
584780
/*
 * sys_enter_futex tracepoint: tag tasks that are about to block in a
 * FUTEX_WAIT-style op so the scheduler applies CHAOS_TRAIT_FUTEX_DELAYS
 * the next time the task is enqueued. The "?" section prefix presumably
 * keeps the program optional/not auto-attached -- confirm against the
 * libbpf version in use.
 */
781+ SEC ("?tracepoint/syscalls/sys_enter_futex" )
782+ int rtp_sys_enter_futex (struct tp_syscall_enter_futex * ctx )
783+ {
784+ struct task_struct * p ;
785+ struct chaos_task_ctx * taskc ;
786+ int futex_op ;
787+ s32 ret ;
788+
789+ // should be detached from userspace but if it is attached then no-op
790+ if (!futex_uncontended_delay_ns && !futex_contended_delay_min_ns &&
791+ !futex_contended_delay_max_ns )
792+ return 0 ;
793+
794+ p = (struct task_struct * )bpf_get_current_task_btf ();
795+ taskc = lookup_create_chaos_task_ctx (p );
796+ if (!taskc )
797+ return 0 ;
798+
// Lazily compute the chaos match the first time this task is seen here.
799+ if (!(taskc -> match & CHAOS_MATCH_COMPLETE )) {
800+ ret = calculate_chaos_match (p );
801+ if (ret ) {
802+ scx_bpf_error ("failed to match task" );
803+ return 0 ;
804+ }
805+ }
806+
807+ if (taskc -> match & CHAOS_MATCH_EXCLUDED )
808+ return 0 ;
809+
// Strip FUTEX_PRIVATE_FLAG/FUTEX_CLOCK_REALTIME to get the raw command.
810+ futex_op = ctx -> op & FUTEX_CMD_MASK ;
811+
// Only wait-style ops can block; everything else is ignored.
812+ if (futex_op != FUTEX_WAIT && futex_op != FUTEX_WAIT_BITSET &&
813+ futex_op != FUTEX_WAIT_REQUEUE_PI )
814+ return 0 ;
815+
816+ // The task is either about to wait because it hit FUTEX_WAIT on the slow
817+ // path or hit the fast path. The fast path is irrelevant for our purposes
818+ // as we have no scheduler input there, so it's safe to delay our work
819+ // until a struct_ops .runnable callback comes along.
820+ taskc -> pending_trait = CHAOS_TRAIT_FUTEX_DELAYS ;
821+ taskc -> futex_uaddr = (u64 )ctx -> uaddr ;
822+
823+ return 0 ;
824+ }
825+
585826SCX_OPS_DEFINE (chaos ,
586827 .dispatch = (void * )chaos_dispatch ,
587828 .enqueue = (void * )chaos_enqueue ,
0 commit comments