Commit 69434ac

chaos: add futex delays trait
Add futex delays to chaos. To best reproduce deadlocks and other futex issues we need to affect locking. The approach here:

- Delay a waiter when a lock has contention, for up to futex_uncontended_delay_ns.
- Swap out the existing delayed waiter when another waiter comes along.
- Delay the previous waiter by a random delay between futex_contended_delay_min_ns and futex_contended_delay_max_ns.

This approach is chosen over random delays so that futex race conditions can be flipped with minimal performance impact on the machine/process. If a futex and a pair of threads spend many idle seconds after a short period of contention, we would need huge random delays on every task that touches the futex to affect their ordering at all. Instead we limit the delay to a single waiter at any point, and apply a much smaller delay once we know the mutex is already under contention. We'll see how this works in practice.

This is by far the most complicated chaos trait in terms of data structures. Currently we use a BPF hash map and a built-in DSQ to maintain the state. The hash map maps a specific futex (well, close: a tgid/uaddr pair) to an entry in a CPU's delay DSQ. The delay DSQ holds the task until its timeout, and the map stores how to find that entry in the DSQ so it can be re-queued with the contended timeout. As commented in the code, the complexity of a search in a native DSQ is hideous - it's O(n). We can change the implementation in the future while keeping the logic the same.

Test plan:
- Lightly tested. The futex tracepoint is attached and sees many entries. Slow futex waiters are delayed. The hand-off between an old delayed waiter and a new delayed waiter is not reliable and likely has a bug.
- This change is a no-op unless you provide the new command line flags.
1 parent b6a8eed commit 69434ac

File tree

4 files changed: +336 −7 lines changed
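
Before the file diffs, here is a condensed, standalone sketch (editorial illustration only, not part of this commit) of the deadline math the commit message describes: a lone waiter is parked until now + futex_uncontended_delay_ns, and a waiter displaced by a newer one is re-queued with a randomised deadline between the contended min and max. The tunable values below are placeholders.

// Illustration only -- mirrors the deadline arithmetic in enqueue_futex_delay()
// below, outside of BPF. The tunable values are made-up examples.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t futex_uncontended_delay_ns   = 500000;  /* placeholder */
static uint64_t futex_contended_delay_min_ns =  10000;  /* placeholder */
static uint64_t futex_contended_delay_max_ns =  50000;  /* placeholder */

/* A lone waiter is parked on the delay DSQ until this deadline. */
static uint64_t uncontended_deadline(uint64_t now)
{
	return now + futex_uncontended_delay_ns;
}

/* A displaced waiter is re-queued with a shorter, randomised deadline,
 * since the futex is now known to be contended. */
static uint64_t contended_deadline(uint64_t now)
{
	uint64_t vtime = now + futex_contended_delay_min_ns;

	if (futex_contended_delay_min_ns != futex_contended_delay_max_ns)
		vtime += (uint64_t)rand() %
			 (futex_contended_delay_max_ns - futex_contended_delay_min_ns);
	return vtime;
}

int main(void)
{
	uint64_t now = 1000000;

	printf("lone waiter parked until:      %llu ns\n",
	       (unsigned long long)uncontended_deadline(now));
	printf("displaced waiter re-queued to: %llu ns\n",
	       (unsigned long long)contended_deadline(now));
	return 0;
}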

scheds/rust/scx_chaos/src/bpf/intf.h

Lines changed: 6 additions & 0 deletions

@@ -33,6 +33,7 @@ enum chaos_trait_kind {
 	CHAOS_TRAIT_CPU_FREQ,
 	CHAOS_TRAIT_DEGRADATION,
 	CHAOS_TRAIT_KPROBE_RANDOM_DELAYS,
+	CHAOS_TRAIT_FUTEX_DELAYS,
 	CHAOS_TRAIT_MAX,
 };

@@ -44,12 +45,17 @@ struct chaos_task_ctx {
 	enum chaos_trait_kind pending_trait;
 	u64 enq_flags;
 	u64 p2dq_vtime;
+
+	// Futex delay state
+	u64 futex_uaddr;
 };

 enum chaos_stat_idx {
 	CHAOS_STAT_TRAIT_RANDOM_DELAYS,
 	CHAOS_STAT_TRAIT_CPU_FREQ,
 	CHAOS_STAT_TRAIT_DEGRADATION,
+	CHAOS_STAT_TRAIT_FUTEX_DELAYS,
+	CHAOS_STAT_TRAIT_FUTEX_DELAYS_CONTENDED,
 	CHAOS_STAT_CHAOS_EXCLUDED,
 	CHAOS_STAT_CHAOS_SKIPPED,
 	CHAOS_STAT_KPROBE_RANDOM_DELAYS,

scheds/rust/scx_chaos/src/bpf/main.bpf.c

Lines changed: 251 additions & 6 deletions
@@ -37,6 +37,39 @@
 		scx_bpf_dispatch_vtime_from_dsq___compat( \
 			(it__iter), (p), (dsq_id), (enq_flags)))

+/*
+ * The following defines are from 'linux/include/uapi/linux/futex.h'
+ */
+#define FUTEX_WAIT		0
+#define FUTEX_WAKE		1
+#define FUTEX_FD		2
+#define FUTEX_REQUEUE		3
+#define FUTEX_CMP_REQUEUE	4
+#define FUTEX_WAKE_OP		5
+#define FUTEX_LOCK_PI		6
+#define FUTEX_UNLOCK_PI		7
+#define FUTEX_TRYLOCK_PI	8
+#define FUTEX_WAIT_BITSET	9
+#define FUTEX_WAKE_BITSET	10
+#define FUTEX_WAIT_REQUEUE_PI	11
+#define FUTEX_CMP_REQUEUE_PI	12
+#define FUTEX_LOCK_PI2		13
+
+#define FUTEX_PRIVATE_FLAG	128
+#define FUTEX_CLOCK_REALTIME	256
+#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
+
+struct tp_syscall_enter_futex {
+	struct trace_entry ent;
+	int __syscall_nr;
+	u32 __attribute__((btf_type_tag("user"))) * uaddr;
+	int op;
+	u32 val;
+	struct __kernel_timespec __attribute__((btf_type_tag("user"))) * utime;
+	u32 __attribute__((btf_type_tag("user"))) * uaddr2;
+	u32 val3;
+};
+
 const volatile int ppid_targeting_ppid = 1;
 const volatile bool ppid_targeting_inclusive =
 	false; /* include ppid_targeting_ppid in chaos */
@@ -60,6 +93,11 @@ const volatile u32 kprobe_delays_freq_frac32 = 1;
 const volatile u64 kprobe_delays_min_ns = 1;
 const volatile u64 kprobe_delays_max_ns = 2;

+const volatile u64 futex_uncontended_delay_ns = 1;
+const volatile u64 futex_contended_delay_min_ns = 1;
+const volatile u64 futex_contended_delay_max_ns = 1;
+
+
 #define MIN(x, y) ((x) < (y) ? (x) : (y))
 #define MAX(x, y) ((x) > (y) ? (x) : (y))

@@ -89,6 +127,30 @@ struct {
 	__type(value, u64);
 } chaos_stats SEC(".maps");

+struct chaos_futex_key {
+	u32 tgid;
+	u64 uaddr;
+};
+
+struct chaos_futex_waiter {
+	struct bpf_spin_lock lock;
+	u64 timeout_key;
+	u32 pid;
+	s32 delay_dsq_cpu_idx;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 1024*1024);
+	__type(key, struct chaos_futex_key);
+	__type(value, struct chaos_futex_waiter);
+} chaos_futex_waiters SEC(".maps");
+
+static __always_inline u64 chaos_get_prandom_u64()
+{
+	return ((u64)bpf_get_prandom_u32() << 32) | bpf_get_prandom_u32();
+}
+
 struct chaos_task_ctx *lookup_create_chaos_task_ctx(struct task_struct *p)
 {
 	return bpf_task_storage_get(&chaos_task_ctxs, p, NULL,
@@ -166,8 +228,14 @@ choose_chaos(struct chaos_task_ctx *taskc)
 static __always_inline bool
 chaos_trait_skips_select_cpu(struct chaos_task_ctx *taskc)
 {
-	return taskc->next_trait == CHAOS_TRAIT_RANDOM_DELAYS ||
-		taskc->next_trait == CHAOS_TRAIT_KPROBE_RANDOM_DELAYS;
+	switch (taskc->next_trait) {
+	case CHAOS_TRAIT_RANDOM_DELAYS:
+	case CHAOS_TRAIT_KPROBE_RANDOM_DELAYS:
+	case CHAOS_TRAIT_FUTEX_DELAYS:
+		return true;
+	default:
+		return false;
+	}
 }

 static __always_inline u64 get_cpu_delay_dsq(int cpu_idx)
@@ -306,6 +374,134 @@ static __always_inline s32 calculate_chaos_match(struct task_struct *p)
 	return ret;
 }

+// Traverse a DSQ to find the first element with a key with hideous complexity.
+// This is O(n) in DSQ members.
+//
+// To improve:
+// - Add this as a kfunc to the kernel where it can be O(log n)
+// - Use arena DSQs where we can get this behaviour in O(log n)
+static __always_inline
+void bpf_iter_scx_dsq_search(struct bpf_iter_scx_dsq *it,
+			     struct task_struct **p,
+			     u64 dsq_id,
+			     u64 flags,
+			     u64 key)
+{
+	bpf_iter_scx_dsq_new(it, dsq_id, flags);
+
+	while ((*p = bpf_iter_scx_dsq_next(it))) {
+		if ((*p)->scx.dsq_vtime == key)
+			return;
+
+		if ((*p)->scx.dsq_vtime > key)
+			break;
+	}
+
+	*p = NULL;
+}
+
+static __always_inline bool update_delayed_task_vtime(s32 cpu_idx, u64 key,
+						       u64 pid, u64 new_vtime)
+{
+	u64 dsq_id = get_cpu_delay_dsq(cpu_idx);
+	struct bpf_iter_scx_dsq it;
+	struct task_struct *p;
+	bool ret = false;
+
+	bpf_iter_scx_dsq_search(&it, &p, dsq_id, 0, key);
+	if (!p)
+		goto out;
+
+	while (p->pid != pid && (p = bpf_iter_scx_dsq_next(&it)) && p->scx.dsq_vtime == key) {}
+	if (!p || p->pid != pid)
+		goto out;
+
+	ret = true;
+	__COMPAT_chaos_scx_bpf_dsq_move_set_vtime(&it, new_vtime);
+	ret = __COMPAT_chaos_scx_bpf_dsq_move_vtime(&it, p, dsq_id, 0);
+
+out:
+	bpf_iter_scx_dsq_destroy(&it);
+	return ret;
+}
+
+__weak s32 enqueue_futex_delay(struct task_struct *p __arg_trusted,
+			       u64 enq_flags,
+			       struct chaos_task_ctx *taskc __arg_nonnull)
+{
+	s64 ret;
+	struct chaos_futex_key key;
+	struct chaos_futex_waiter *entry;
+	struct chaos_futex_waiter val;
+	u64 vtime, now;
+	s32 cpu;
+
+	key.tgid = p->tgid;
+	key.uaddr = taskc->futex_uaddr;
+
+	// First ensure an entry exists but in a largely empty state. We need the
+	// spinlock to correctly interlock with the delay DSQ.
+	val.pid = -1;
+
+	ret = bpf_map_update_elem(&chaos_futex_waiters, &key, &val, BPF_NOEXIST);
+	if (ret && ret != -EEXIST) {
+		scx_bpf_error("failed to create chaos_futex_waiter in runnable_futex_delays");
+		return false;
+	}
+
+	// Get the real element. This might be an empty element that we inserted
+	// or it might be an element filled with another PID. It doesn't matter
+	// whether we inserted the element or somebody else did, this races.
+	entry = (struct chaos_futex_waiter*)bpf_map_lookup_elem(&chaos_futex_waiters, &key);
+	if (!entry) {
+		scx_bpf_error("failed to lookup chaos_futex_waiter in runnable_futex_delays");
+		return false;
+	}
+
+	// enqueue ourselves before entering the spinlock. critical sections
+	// can't call kfuncs.
+	now = bpf_ktime_get_ns();
+	cpu = bpf_get_smp_processor_id();
+
+	chaos_stat_inc(CHAOS_STAT_TRAIT_FUTEX_DELAYS);
+	scx_bpf_dsq_insert_vtime(p, get_cpu_delay_dsq(cpu), 0, now + futex_uncontended_delay_ns, enq_flags);
+
+	// critical sections can't call kfuncs which makes this very complicated.
+	// we must have already enqueued ourselves, and we must then insert
+	// ourselves in the hashmap. when we take a task out of the lock we
+	// should attempt to re-queue it after. the task will not hit this path
+	// again until it has been re-queued, thus this isn't racy - either we
+	// will re-queue it, or it will run naturally when its delay expires.
+	// This might mean it doesn't get quite enough delay, but no invariants
+	// are broken.
+	bpf_spin_lock(&entry->lock);
+
+	val.pid = entry->pid;
+	val.timeout_key = entry->timeout_key;
+	val.delay_dsq_cpu_idx = entry->delay_dsq_cpu_idx;
+
+	// enqueue ourselves and prepare the metadata for the next one to come along
+	entry->pid = p->pid;
+	entry->timeout_key = now + futex_uncontended_delay_ns;
+	entry->delay_dsq_cpu_idx = cpu;
+
+	bpf_spin_unlock(&entry->lock);
+
+	// re-queue task that has a contender behind it
+	if (val.pid != -1) {
+		vtime = now + futex_contended_delay_min_ns;
+		if (futex_contended_delay_min_ns != futex_contended_delay_max_ns) {
+			vtime += chaos_get_prandom_u64()
+				% (futex_contended_delay_max_ns - futex_contended_delay_min_ns);
+		}
+
+		if (update_delayed_task_vtime(val.delay_dsq_cpu_idx, val.timeout_key, val.pid, vtime))
+			chaos_stat_inc(CHAOS_STAT_TRAIT_FUTEX_DELAYS_CONTENDED);
+	}
+
+	return true;
+}
+
 __weak s32 enqueue_random_delay(struct task_struct *p __arg_trusted,
 				u64 enq_flags,
 				struct chaos_task_ctx *taskc __arg_nonnull,
@@ -334,6 +530,10 @@ __weak s32 enqueue_chaotic(struct task_struct *p __arg_trusted, u64 enq_flags,
 					   random_delays_max_ns);
 		chaos_stat_inc(CHAOS_STAT_TRAIT_RANDOM_DELAYS);
 		break;
+
+	case CHAOS_TRAIT_FUTEX_DELAYS:
+		out = enqueue_futex_delay(p, enq_flags, taskc);
+		break;
 	case CHAOS_TRAIT_NONE:
 		chaos_stat_inc(CHAOS_STAT_CHAOS_SKIPPED);
 		out = false;
@@ -345,7 +545,6 @@
 		break;
 	}

-	taskc->next_trait = CHAOS_TRAIT_NONE;
 	return out;
 }

@@ -580,10 +779,10 @@ void BPF_STRUCT_OPS(chaos_enqueue, struct task_struct *p __arg_trusted,
 	if (promise.kind == P2DQ_ENQUEUE_PROMISE_FAILED)
 		goto cleanup;

-	if ((taskc->next_trait == CHAOS_TRAIT_RANDOM_DELAYS ||
-	     taskc->next_trait == CHAOS_TRAIT_KPROBE_RANDOM_DELAYS) &&
-	    enqueue_chaotic(p, enq_flags, taskc))
+	if (enqueue_chaotic(p, enq_flags, taskc)) {
+		taskc->next_trait = CHAOS_TRAIT_NONE;
 		goto cleanup;
+	}

 	// NOTE: this may not work for affinitized tasks because p2dq does
 	// direct dispatch in some situations.
@@ -696,6 +895,52 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(chaos_init_task, struct task_struct *p,
 	return 0;
 }

+SEC("?tracepoint/syscalls/sys_enter_futex")
+int rtp_sys_enter_futex(struct tp_syscall_enter_futex *ctx)
+{
+	struct task_struct *p;
+	struct chaos_task_ctx *taskc;
+	int futex_op;
+	s32 ret;
+
+	// should be detached from userspace but if it is attached then no-op
+	if (!futex_uncontended_delay_ns && !futex_contended_delay_min_ns &&
+	    !futex_contended_delay_max_ns)
+		return 0;
+
+	p = (struct task_struct *)bpf_get_current_task_btf();
+	taskc = lookup_create_chaos_task_ctx(p);
+	if (!taskc)
+		return 0;
+
+	if (!(taskc->match & CHAOS_MATCH_COMPLETE)) {
+		ret = calculate_chaos_match(p);
+		if (ret) {
+			scx_bpf_error("failed to match task");
+			return 0;
+		}
+	}
+
+	if (taskc->match & CHAOS_MATCH_EXCLUDED)
+		return 0;
+
+	futex_op = ctx->op & FUTEX_CMD_MASK;
+
+	if (futex_op != FUTEX_WAIT && futex_op != FUTEX_WAIT_BITSET &&
+	    futex_op != FUTEX_WAIT_REQUEUE_PI)
+		return 0;
+
+	// The task is either about to wait because it hit FUTEX_WAIT on the slow
+	// path or hit the fast path. The fast path is irrelevant for our purposes
+	// as we have no scheduler input there, so it's safe to delay our work
+	// until a struct_ops .runnable callback comes along.
+	taskc->pending_trait = CHAOS_TRAIT_FUTEX_DELAYS;
+	taskc->futex_uaddr = (u64)ctx->uaddr;
+
+	return 0;
+}
+
+
 SEC("kprobe/generic")
 int generic(struct pt_regs *ctx)
 {
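
As a closing illustration (not part of the commit), here is a minimal user-space model of the one-delayed-waiter-per-futex handoff that the chaos_futex_waiters map implements: each (tgid, uaddr) slot remembers at most one parked waiter, and a newer waiter swaps itself in and returns the previous occupant so it can be re-queued with the contended deadline.

// Illustration only -- a user-space model of the waiter swap that
// enqueue_futex_delay() performs under bpf_spin_lock().
#include <stdint.h>
#include <stdio.h>

struct waiter_slot {
	int32_t  pid;          /* -1 means "no waiter currently parked" */
	uint64_t timeout_key;  /* vtime the parked waiter was queued with */
	int32_t  cpu_idx;      /* which CPU's delay DSQ holds it */
};

/* Record the new waiter and hand back the one it displaced (pid == -1 if the
 * slot was empty), mirroring the locked section of enqueue_futex_delay(). */
static struct waiter_slot swap_waiter(struct waiter_slot *slot,
				      int32_t pid, uint64_t timeout_key, int32_t cpu)
{
	struct waiter_slot prev = *slot;

	slot->pid = pid;
	slot->timeout_key = timeout_key;
	slot->cpu_idx = cpu;
	return prev;
}

int main(void)
{
	struct waiter_slot slot = { .pid = -1 };

	struct waiter_slot prev = swap_waiter(&slot, 101, 5000, 0);
	printf("first waiter parks; displaced pid = %d (none)\n", (int)prev.pid);

	prev = swap_waiter(&slot, 202, 6000, 1);
	printf("second waiter parks; displaced pid = %d gets the contended deadline\n",
	       (int)prev.pid);
	return 0;
}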
