3737 scx_bpf_dispatch_vtime_from_dsq___compat( \
3838 (it__iter), (p), (dsq_id), (enq_flags)))
3939
40+ /*
41+ * The following defines are from 'linux/include/uapi/linux/futex.h'
42+ */
43+ #define FUTEX_WAIT 0
44+ #define FUTEX_WAKE 1
45+ #define FUTEX_FD 2
46+ #define FUTEX_REQUEUE 3
47+ #define FUTEX_CMP_REQUEUE 4
48+ #define FUTEX_WAKE_OP 5
49+ #define FUTEX_LOCK_PI 6
50+ #define FUTEX_UNLOCK_PI 7
51+ #define FUTEX_TRYLOCK_PI 8
52+ #define FUTEX_WAIT_BITSET 9
53+ #define FUTEX_WAKE_BITSET 10
54+ #define FUTEX_WAIT_REQUEUE_PI 11
55+ #define FUTEX_CMP_REQUEUE_PI 12
56+ #define FUTEX_LOCK_PI2 13
57+
58+ #define FUTEX_PRIVATE_FLAG 128
59+ #define FUTEX_CLOCK_REALTIME 256
60+ #define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
61+
/*
 * Context layout of the syscalls/sys_enter_futex tracepoint, mirroring the
 * fields reported by that event's format file. The btf_type_tag("user")
 * annotations mark pointers into user memory (must not be dereferenced
 * directly from BPF).
 */
struct tp_syscall_enter_futex {
	struct trace_entry ent;		/* common tracepoint header */
	int __syscall_nr;		/* syscall number */
	u32 __attribute__((btf_type_tag("user"))) *uaddr;	/* futex word in user memory */
	int op;				/* futex command, may include flag bits */
	u32 val;			/* op-specific value (e.g. expected futex word) */
	struct __kernel_timespec __attribute__((btf_type_tag("user"))) *utime;	/* optional timeout */
	u32 __attribute__((btf_type_tag("user"))) *uaddr2;	/* second futex word (requeue ops) */
	u32 val3;			/* op-specific extra value (e.g. wait bitset) */
};
72+
4073const volatile int ppid_targeting_ppid = 1 ;
4174const volatile bool ppid_targeting_inclusive =
4275 false; /* include ppid_targeting_ppid in chaos */
@@ -60,6 +93,11 @@ const volatile u32 kprobe_delays_freq_frac32 = 1;
6093const volatile u64 kprobe_delays_min_ns = 1 ;
6194const volatile u64 kprobe_delays_max_ns = 2 ;
6295
/* Delay applied to a waiter when no other waiter is queued on its futex word. */
const volatile u64 futex_uncontended_delay_ns = 1;
/* Bounds of the randomized delay applied to a waiter that has a contender
 * behind it; equal min/max means a fixed delay. Set by userspace at load. */
const volatile u64 futex_contended_delay_min_ns = 1;
const volatile u64 futex_contended_delay_max_ns = 1;
99+
100+
/* Classic min/max macros; NOTE both evaluate their arguments twice. */
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define MAX(x, y) ((x) > (y) ? (x) : (y))
65103
@@ -89,6 +127,30 @@ struct {
89127 __type (value , u64 );
90128} chaos_stats SEC (".maps" );
91129
/*
 * Identifies one futex word. A user virtual address is only meaningful
 * within a single address space, so the key is (tgid, uaddr).
 */
struct chaos_futex_key {
	u32 tgid;	/* thread group id owning the address space */
	u64 uaddr;	/* user virtual address of the futex word */
};
134+
/* Most recent waiter observed on a futex word; fields guarded by lock. */
struct chaos_futex_waiter {
	struct bpf_spin_lock lock;	/* serializes updates to the fields below */
	u64 timeout_key;		/* vtime key the waiter was enqueued with */
	u32 pid;			/* waiter's pid; (u32)-1 marks an empty slot */
	s32 delay_dsq_cpu_idx;		/* cpu whose delay DSQ holds the waiter */
};
141+
/*
 * Maps (tgid, uaddr) -> the most recent waiter on that futex word so a
 * later waiter can detect contention and extend the earlier task's delay.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1024 * 1024);
	__type(key, struct chaos_futex_key);
	__type(value, struct chaos_futex_waiter);
} chaos_futex_waiters SEC(".maps");
148+
149+ static __always_inline u64 chaos_get_prandom_u64 ()
150+ {
151+ return ((u64 )bpf_get_prandom_u32 () << 32 ) | bpf_get_prandom_u32 ();
152+ }
153+
92154struct chaos_task_ctx * lookup_create_chaos_task_ctx (struct task_struct * p )
93155{
94156 return bpf_task_storage_get (& chaos_task_ctxs , p , NULL ,
@@ -166,8 +228,14 @@ choose_chaos(struct chaos_task_ctx *taskc)
166228static __always_inline bool
167229chaos_trait_skips_select_cpu (struct chaos_task_ctx * taskc )
168230{
169- return taskc -> next_trait == CHAOS_TRAIT_RANDOM_DELAYS ||
170- taskc -> next_trait == CHAOS_TRAIT_KPROBE_RANDOM_DELAYS ;
231+ switch (taskc -> next_trait ) {
232+ case CHAOS_TRAIT_RANDOM_DELAYS :
233+ case CHAOS_TRAIT_KPROBE_RANDOM_DELAYS :
234+ case CHAOS_TRAIT_FUTEX_DELAYS :
235+ return true;
236+ default :
237+ return false;
238+ }
171239}
172240
173241static __always_inline u64 get_cpu_delay_dsq (int cpu_idx )
@@ -306,6 +374,134 @@ static __always_inline s32 calculate_chaos_match(struct task_struct *p)
306374 return ret ;
307375}
308376
// Traverse a DSQ to find the first element with a key with hideous complexity.
// This is O(n) in DSQ members.
//
// To improve:
// - Add this as a kfunc to the kernel where it can be O(log n)
// - Use arena DSQs where we can get this behaviour in O(log n)
//
// On return, *p is the first task whose scx.dsq_vtime equals key and the
// iterator is positioned on it; *p is NULL when no such task exists. The
// caller owns the iterator and must call bpf_iter_scx_dsq_destroy() either
// way.
static __always_inline
void bpf_iter_scx_dsq_search(struct bpf_iter_scx_dsq *it,
			     struct task_struct **p,
			     u64 dsq_id,
			     u64 flags,
			     u64 key)
{
	bpf_iter_scx_dsq_new(it, dsq_id, flags);

	while ((*p = bpf_iter_scx_dsq_next(it))) {
		if ((*p)->scx.dsq_vtime == key)
			return;

		// the DSQ is vtime-ordered, so once we pass key no later
		// element can match
		if ((*p)->scx.dsq_vtime > key)
			break;
	}

	*p = NULL;
}
402+
403+ static __always_inline bool update_delayed_task_vtime (s32 cpu_idx , u64 key ,
404+ u64 pid , u64 new_vtime )
405+ {
406+ u64 dsq_id = get_cpu_delay_dsq (cpu_idx );
407+ struct bpf_iter_scx_dsq it ;
408+ struct task_struct * p ;
409+ bool ret = false;
410+
411+ bpf_iter_scx_dsq_search (& it , & p , dsq_id , 0 , key );
412+ if (!p )
413+ goto out ;
414+
415+ while (p -> pid != pid && (p = bpf_iter_scx_dsq_next (& it )) && p -> scx .dsq_vtime == key ) {}
416+ if (!p || p -> pid != pid )
417+ goto out ;
418+
419+ ret = true;
420+ __COMPAT_chaos_scx_bpf_dsq_move_set_vtime (& it , new_vtime );
421+ ret = __COMPAT_chaos_scx_bpf_dsq_move_vtime (& it , p , dsq_id , 0 );
422+
423+ out :
424+ bpf_iter_scx_dsq_destroy (& it );
425+ return ret ;
426+ }
427+
428+ __weak s32 enqueue_futex_delay (struct task_struct * p __arg_trusted ,
429+ u64 enq_flags ,
430+ struct chaos_task_ctx * taskc __arg_nonnull )
431+ {
432+ s64 ret ;
433+ struct chaos_futex_key key ;
434+ struct chaos_futex_waiter * entry ;
435+ struct chaos_futex_waiter val ;
436+ u64 vtime , now ;
437+ s32 cpu ;
438+
439+ key .tgid = p -> tgid ;
440+ key .uaddr = taskc -> futex_uaddr ;
441+
442+ // First ensure an entry exists but in a largely empty state. We need the
443+ // spinlock to correctly interlock with the delay DSQ.
444+ val .pid = -1 ;
445+
446+ ret = bpf_map_update_elem (& chaos_futex_waiters , & key , & val , BPF_NOEXIST );
447+ if (ret && ret != - EEXIST ) {
448+ scx_bpf_error ("failed to create chaos_futex_waiter in runnable_futex_delays" );
449+ return false;
450+ }
451+
452+ // Get the real element. This might be an empty element that we inserted
453+ // or it might be an element filled with another PID. It doesn't matter
454+ // whether we inserted the element or somebody else did, this races.
455+ entry = (struct chaos_futex_waiter * )bpf_map_lookup_elem (& chaos_futex_waiters , & key );
456+ if (!entry ) {
457+ scx_bpf_error ("failed to lookup chaos_futex_waiter in runnable_futex_delays" );
458+ return false;
459+ }
460+
461+ // enqueue ourselves before entering the spinlock. critical sections
462+ // can't call kfuncs.
463+ now = bpf_ktime_get_ns ();
464+ cpu = bpf_get_smp_processor_id ();
465+
466+ chaos_stat_inc (CHAOS_STAT_TRAIT_FUTEX_DELAYS );
467+ scx_bpf_dsq_insert_vtime (p , get_cpu_delay_dsq (cpu ), 0 , now + futex_uncontended_delay_ns , enq_flags );
468+
469+ // critical sections can't call kfuncs which makes this very complicated.
470+ // we must have already enqueued ourselves, and we must then insert
471+ // ourselves in the hashmap. when we take a task out of the lock we
472+ // should attempt to re-queue it after. the task will not hit this path
473+ // again until it has been re-queued, thus this isn't racy - either we
474+ // will re-queue it, or it will run naturally when its delay expires.
475+ // This might mean it doesn't get quite enough delay, but no invariants
476+ // are broken.
477+ bpf_spin_lock (& entry -> lock );
478+
479+ val .pid = entry -> pid ;
480+ val .timeout_key = entry -> timeout_key ;
481+ val .delay_dsq_cpu_idx = entry -> delay_dsq_cpu_idx ;
482+
483+ // enqueue ourselves and prepare the metadata for the next one to come along
484+ entry -> pid = p -> pid ;
485+ entry -> timeout_key = now + futex_uncontended_delay_ns ;
486+ entry -> delay_dsq_cpu_idx = cpu ;
487+
488+ bpf_spin_unlock (& entry -> lock );
489+
490+ // re-queue task that has a contender behind it
491+ if (val .pid != -1 ) {
492+ vtime = now + futex_contended_delay_min_ns ;
493+ if (futex_contended_delay_min_ns != futex_contended_delay_max_ns ) {
494+ vtime += chaos_get_prandom_u64 ()
495+ % (futex_contended_delay_max_ns - futex_contended_delay_min_ns );
496+ }
497+
498+ if (update_delayed_task_vtime (val .delay_dsq_cpu_idx , val .timeout_key , val .pid , vtime ))
499+ chaos_stat_inc (CHAOS_STAT_TRAIT_FUTEX_DELAYS_CONTENDED );
500+ }
501+
502+ return true;
503+ }
504+
309505__weak s32 enqueue_random_delay (struct task_struct * p __arg_trusted ,
310506 u64 enq_flags ,
311507 struct chaos_task_ctx * taskc __arg_nonnull ,
@@ -334,6 +530,10 @@ __weak s32 enqueue_chaotic(struct task_struct *p __arg_trusted, u64 enq_flags,
334530 random_delays_max_ns );
335531 chaos_stat_inc (CHAOS_STAT_TRAIT_RANDOM_DELAYS );
336532 break ;
533+
534+ case CHAOS_TRAIT_FUTEX_DELAYS :
535+ out = enqueue_futex_delay (p , enq_flags , taskc );
536+ break ;
337537 case CHAOS_TRAIT_NONE :
338538 chaos_stat_inc (CHAOS_STAT_CHAOS_SKIPPED );
339539 out = false;
@@ -345,7 +545,6 @@ __weak s32 enqueue_chaotic(struct task_struct *p __arg_trusted, u64 enq_flags,
345545 break ;
346546 }
347547
348- taskc -> next_trait = CHAOS_TRAIT_NONE ;
349548 return out ;
350549}
351550
@@ -580,10 +779,10 @@ void BPF_STRUCT_OPS(chaos_enqueue, struct task_struct *p __arg_trusted,
580779 if (promise .kind == P2DQ_ENQUEUE_PROMISE_FAILED )
581780 goto cleanup ;
582781
583- if ((taskc -> next_trait == CHAOS_TRAIT_RANDOM_DELAYS ||
584- taskc -> next_trait == CHAOS_TRAIT_KPROBE_RANDOM_DELAYS ) &&
585- enqueue_chaotic (p , enq_flags , taskc ))
782+ if (enqueue_chaotic (p , enq_flags , taskc )) {
783+ taskc -> next_trait = CHAOS_TRAIT_NONE ;
586784 goto cleanup ;
785+ }
587786
588787 // NOTE: this may not work for affinitized tasks because p2dq does
589788 // direct dispatch in some situations.
@@ -696,6 +895,52 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(chaos_init_task, struct task_struct *p,
696895 return 0 ;
697896}
698897
/*
 * Raw entry hook for the futex syscall. Marks matching tasks that are about
 * to block (FUTEX_WAIT-style commands) with a pending futex-delay trait and
 * records the futex word address; the actual delay is applied later from
 * the scheduler callbacks.
 */
SEC("?tracepoint/syscalls/sys_enter_futex")
int rtp_sys_enter_futex(struct tp_syscall_enter_futex *ctx)
{
	struct task_struct *p;
	struct chaos_task_ctx *taskc;
	int futex_op;
	s32 ret;

	// should be detached from userspace but if it is attached then no-op
	if (!futex_uncontended_delay_ns && !futex_contended_delay_min_ns &&
	    !futex_contended_delay_max_ns)
		return 0;

	p = (struct task_struct *)bpf_get_current_task_btf();
	taskc = lookup_create_chaos_task_ctx(p);
	if (!taskc)
		return 0;

	// lazily classify tasks we haven't matched yet
	if (!(taskc->match & CHAOS_MATCH_COMPLETE)) {
		ret = calculate_chaos_match(p);
		if (ret) {
			scx_bpf_error("failed to match task");
			return 0;
		}
	}

	if (taskc->match & CHAOS_MATCH_EXCLUDED)
		return 0;

	// strip FUTEX_PRIVATE_FLAG / FUTEX_CLOCK_REALTIME to get the command
	futex_op = ctx->op & FUTEX_CMD_MASK;

	// only wait-style commands can block; everything else is ignored
	if (futex_op != FUTEX_WAIT && futex_op != FUTEX_WAIT_BITSET &&
	    futex_op != FUTEX_WAIT_REQUEUE_PI)
		return 0;

	// The task is either about to wait because it hit FUTEX_WAIT on the slow
	// path or hit the fast path. The fast path is irrelevant for our purposes
	// as we have no scheduler input there, so it's safe to delay our work
	// until a struct_ops .runnable callback comes along.
	taskc->pending_trait = CHAOS_TRAIT_FUTEX_DELAYS;
	taskc->futex_uaddr = (u64)ctx->uaddr;

	return 0;
}
942+
943+
699944SEC ("kprobe/generic" )
700945int generic (struct pt_regs * ctx )
701946{
0 commit comments