From dfc097b7b68bd0dcc835730bc600ba59114fc17d Mon Sep 17 00:00:00 2001 From: MalikHou Date: Thu, 26 Feb 2026 21:57:19 +0800 Subject: [PATCH 1/8] add active task --- .gitignore | 1 + README.md | 1 + README_cn.md | 1 + docs/cn/bthread_active_task.md | 309 ++++++ src/bthread/bthread.cpp | 141 +++ src/bthread/butex.cpp | 66 +- src/bthread/butex.h | 8 + src/bthread/parking_lot.h | 7 +- src/bthread/task_control.cpp | 16 +- src/bthread/task_control.h | 7 + src/bthread/task_group.cpp | 428 ++++++- src/bthread/task_group.h | 47 +- src/bthread/task_group_inl.h | 9 + src/bthread/task_meta.h | 12 + src/bthread/unstable.h | 74 ++ test/bthread_active_task_unittest.cpp | 1470 +++++++++++++++++++++++++ 16 files changed, 2560 insertions(+), 37 deletions(-) create mode 100644 docs/cn/bthread_active_task.md create mode 100644 test/bthread_active_task_unittest.cpp diff --git a/.gitignore b/.gitignore index 44a371abb8..1759703ea3 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ /output /test/output build/ +.cache # Ignore hidden files .* diff --git a/README.md b/README.md index fc778acf4b..4d642c3342 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ You can use it to: * [bthread or not](docs/cn/bthread_or_not.md) * [thread-local](docs/cn/thread_local.md) * [Execution Queue](docs/cn/execution_queue.md) + * [Active Task (experimental)](docs/cn/bthread_active_task.md) * Client * [Basics](docs/en/client.md) * [Error code](docs/en/error_code.md) diff --git a/README_cn.md b/README_cn.md index bed6e8437f..f0546ecedf 100644 --- a/README_cn.md +++ b/README_cn.md @@ -38,6 +38,7 @@ * [bthread or not](docs/cn/bthread_or_not.md) * [thread-local](docs/cn/thread_local.md) * [Execution Queue](docs/cn/execution_queue.md) + * [Active Task(实验性)](docs/cn/bthread_active_task.md) * [bthread tracer](docs/cn/bthread_tracer.md) * Client * [基础功能](docs/cn/client.md) diff --git a/docs/cn/bthread_active_task.md b/docs/cn/bthread_active_task.md new file mode 100644 index 0000000000..4291add01c --- 
/dev/null +++ b/docs/cn/bthread_active_task.md @@ -0,0 +1,309 @@ +# bthread Active Task(实验性/UNSTABLE) + +本文介绍当前 brpc 中新增的 **Active Task** 基础设施,以及在服务端请求处理中配合 `butex` 实现: + +- 请求处理 bthread 挂起等待(例如等待 io_uring completion) +- 在 bthread worker 的 active-task hook 中收割 completion +- 在 hook 内把 waiter 恢复到当前 worker 的 **local runqueue**(不走 `_remote_rq`) + +本文描述的是 **当前实现** 的使用方式与边界,接口位于 `bthread/unstable.h`,属于 UNSTABLE API。 + +## 适用场景 + +典型场景是“每个 bthread worker 一个本地 reactor”(例如每 worker 一个 io_uring ring): + +1. worker 初始化时创建本地 reactor/ring。 +2. 提交异步 IO 后,在私有 `butex` 上通过 `bthread_butex_wait_local` 挂起。 +4. worker 的 active-task hook 收割 completion。 +5. hook 内调用 `bthread_butex_wake_within(ctx, req->butex)` 唤醒 waiter。 +6. waiter bthread 在同一个 worker 上恢复执行(不会被 steal)。 + +## 当前提供的接口(UNSTABLE) + +头文件:`src/bthread/unstable.h` + +- `bthread_register_active_task_type(...)` +- `bthread_butex_wake_within(...)` +- `bthread_butex_wait_local(...)` + +相关类型: + +- `bthread_active_task_ctx_t` +- `bthread_active_task_type_t` + +### 关键限制(当前实现) + +- Active-task callback 只允许做**短小、非阻塞**的维护逻辑(如收割 completion + 唤醒 waiter)。 +- **不支持**在 active-task hook 中创建新的 bthread。 +- `harvest` 返回值语义:`0` 表示正常;`1` 表示本轮 worker loop 跳过一次 `ParkingLot::wait`(立即重试)。 +- `bthread_butex_wake_within(ctx, butex)` 只允许在 active-task `harvest` 回调中调用。 +- `bthread_butex_wait_local(...)` 内部会对本次 wait 启用 **隐式 wait-scope 本地化 pin**: + - 从进入 wait 到返回(成功/超时/中断/被普通 `butex_wake*` 唤醒)这一段,恢复会被路由回 home worker + - 恢复前不会被 steal + - 返回后 task 恢复默认调度行为(后续 `yield` 仍可能迁移) +- `bthread_butex_wake_within` 只适用于“每请求私有 butex(单 waiter)”模型: + - 0 waiter -> 返回 `0` + - 1 waiter(同 `TaskControl`、同 tag 的 bthread waiter)-> 返回 `1` + - 否则返回 `-1` 且 `errno=EINVAL`(多 waiter / pthread waiter / 跨 tag / 跨 `TaskControl`) + +## 快速上手(推荐接入顺序) + +如果你只是要把“本地 IO completion -> 唤醒等待中的 bthread”跑通,按下面顺序做: + +1. 进程启动早期注册 active-task(在任何 bthread/brpc 初始化前)。 +2. 在 `worker_init` 里初始化每 worker 本地 reactor/ring。 +3. 请求对象里放一个私有 `butex`(每请求一个)。 +4. 
请求 bthread 提交异步 IO 后调用 `bthread_butex_wait_local(...)`。 +5. 在 active-task `harvest` 里收割 completion,找到 `ReqCtx*`,然后调用 `bthread_butex_wake_within(ctx, req->done_butex)`。 +6. waiter 从 `bthread_butex_wait_local(...)` 返回后继续处理结果。 + +建议先在单 worker 环境验证,再切到多 worker。 + +## 一个最小使用流程(服务端请求处理) + +下面用“异步 IO + butex 挂起/唤醒”的模式说明。 + +### 1. 定义每 worker 的本地状态(例如 ring) + +`worker_init/worker_destroy` 用于每个 worker 上的初始化/销毁。 + +```cpp +struct WorkerIoState { + // 示例:io_uring ring / 本地 reactor / 统计等 + // io_uring ring; +}; + +static thread_local WorkerIoState* tls_worker_io = NULL; + +static int IoWorkerInit(void** worker_local, + const bthread_active_task_ctx_t* ctx, + void* user_data) { + (void)ctx; + (void)user_data; + WorkerIoState* s = new WorkerIoState; + // 初始化 ring/reactor ... + tls_worker_io = s; + *worker_local = s; + return 0; +} + +static void IoWorkerDestroy(void* worker_local, + const bthread_active_task_ctx_t* ctx, + void* user_data) { + (void)ctx; + (void)user_data; + WorkerIoState* s = static_cast(worker_local); + if (tls_worker_io == s) { + tls_worker_io = NULL; + } + // 销毁 ring/reactor ... + delete s; +} +``` + +### 2. 启动前注册 active-task 类型(必须在 bthread/brpc 初始化前) + +必须在任何 bthread/Server 启动前完成注册(例如 `main()` 早期)。 + +```cpp +static int IoHarvest( + void* worker_local, const bthread_active_task_ctx_t* ctx); + +void RegisterIoActiveTask() { + bthread_active_task_type_t t; + memset(&t, 0, sizeof(t)); + t.struct_size = sizeof(t); + t.name = "io_active_task"; + t.worker_init = IoWorkerInit; + t.worker_destroy = IoWorkerDestroy; + t.harvest = IoHarvest; + const int rc = bthread_register_active_task_type(&t); + CHECK_EQ(0, rc); +} +``` + +## 3. 
请求对象里携带私有 butex(关键) + +当前框架**不会自动传递 butex** 到 active-task hook。 + +正确做法是:把 `butex` 放进请求对象里,并通过异步 completion 的 `user_data` 找回请求对象。 + +```cpp +struct ReqCtx { + void* done_butex; // 每请求私有 butex + std::atomic done; // 0 -> 未完成, 1 -> 完成 + int result; + // 其他字段:fd/buffer/offset/cancel 等 +}; +``` + +请求等待本地 IO completion 时(推荐用法): + +```cpp +ReqCtx req; +req.done_butex = bthread::butex_create(); +static_cast*>(req.done_butex)->store(0, butil::memory_order_relaxed); +req.done.store(0, std::memory_order_relaxed); + +// 提交异步 IO,把 &req 作为 completion 的 user_data(例如 io_uring cqe->user_data) +SubmitAsyncIo(&req); + +// 挂起当前 bthread,等待 active-task hook 唤醒。 +// bthread_butex_wait_local 内部会对本次 wait 启用 wait-scope 本地化 pin, +// 保证恢复在同一个 worker 上,且恢复前不会被 steal。 +int rc = bthread_butex_wait_local(req.done_butex, 0, NULL); +if (rc != 0 && errno != EWOULDBLOCK) { + // 处理错误/中断/超时(如果设置了超时) +} + +// 被唤醒后继续执行(返回点在同一个 worker 上) +UseResult(req.result); +bthread::butex_destroy(req.done_butex); +``` + +## 4. 在 active-task `harvest` 回调中收割 completion 并唤醒 waiter + +核心点: + +- 在 hook 中通过 completion 找回 `ReqCtx*` +- 写结果 +- 调 `bthread_butex_wake_within(ctx, req->done_butex)`,显式本地唤醒 + +```cpp +static bool HarvestCompletions( + void* worker_local, const bthread_active_task_ctx_t* ctx) { + WorkerIoState* s = static_cast(worker_local); + (void)s; + + bool made_progress = false; + + // 伪代码:循环收割 completion + for (;;) { + ReqCtx* req = TryPopOneCompletion(); // 例如从 io_uring CQ 取 cqe->user_data + if (req == NULL) { + break; + } + + // “做点什么事”:写 completion 结果 + req->result = 123; // 示例 + req->done.store(1, std::memory_order_release); + + errno = 0; + const int wake_rc = bthread_butex_wake_within(ctx, req->done_butex); + if (wake_rc == 1) { + made_progress = true; + } else if (wake_rc == 0) { + // 没有 waiter(可能已超时/取消),按需处理 + } else { + // EINVAL/EPERM 代表用法或上下文不满足要求,应该报警 + } + } + return made_progress; +} + +static int IoHarvest( + void* worker_local, const bthread_active_task_ctx_t* ctx) { + const bool made_progress 
= HarvestCompletions(worker_local, ctx); + return made_progress ? 1 : 0; // 1 == skip current park once +} +``` + +### `bthread_butex_wake_within(...)` 返回值与错误码(实用) + +- 返回 `1`:成功唤醒了 1 个 waiter(这是常见主路径) +- 返回 `0`:当前 butex 上没有 waiter(例如 timeout/取消竞争后已无人等待) +- 返回 `-1`: + - `EPERM`:不在 active-task `harvest` 回调里调用 + - `EINVAL`:butex 不满足 within 语义(多 waiter / pthread waiter / 跨 tag / 跨 `TaskControl`),或 wrong-worker invariant 被触发 + +建议: + +- `wake_rc == 0` 当作**合法分支**处理(不是异常) +- `wake_rc < 0` 视为**用法/所有权错误**并记录错误日志或计数 + +## 调用时机与可调间隔(busy/idle) + +Active-task `harvest` 回调会在 worker 调度循环的多个内部时机被调用(实现细节),你可以把业务逻辑统一写在一个 `HarvestCompletions()` 中。 + +当前实现有两个关键 gflag(都可调): + +- gflag: `bthread_active_task_poll_every_nswitch` +- 默认值:`1` +- 含义:每 N 次 bthread 切换,在 worker 主循环中额外执行一次 `harvest`(busy worker 场景) + +- gflag: `bthread_active_task_idle_wait_ns` +- 默认值:`1000000`(1ms) +- 含义:当 worker 没有 runnable task 时,进入 `ParkingLot::wait()` 的空闲等待间隔 + - `<0`:无限等待(仅靠 signal/stop 唤醒) + - `=0`:不进入 park(空转) + - `>0`:按该间隔超时醒来,再继续循环并执行 `harvest` + +推荐写法(与你的场景一致): + +- 把业务逻辑统一写在 `HarvestCompletions()` +- 用 `bthread_active_task_poll_every_nswitch=1` 降低 busy worker 的 completion 延迟 +- 用 `bthread_active_task_idle_wait_ns` 控制 idle 场景的轮询间隔(CPU/延迟折中) + +说明: + +- 长时间不 `yield`/阻塞的 bthread 会阻塞 worker 主循环,这种情况下无论 `poll_every_nswitch=1` 还是更大值都无解(协作式调度边界) +- `poll_every_nswitch=1` 优化的是“busy 且有切换”的 worker 场景 + +## `bthread_butex_wait_local(...)` 的本地化保证范围(wait-scope) + +`bthread_butex_wait_local(...)` 会在 runtime 内部对“这一次 wait”启用隐式本地化 pin。 + +在该 wait 生命周期内,当前 task 的恢复路径会被 runtime 路由回 `home worker`,并进入非 steal 队列: + +- active-task `bthread_butex_wake_within(...)` +- timeout(TimerThread) +- interruption(`bthread_interrupt` / `bthread_stop`) +- 普通 `butex_wake*`(即使误用普通 wake,也会按 pin 语义回 home worker) + +因此在这次 `bthread_butex_wait_local(...)` 调用的生命周期内: + +- 不会迁移到其他 worker +- 不会通过普通 `_rq/_remote_rq` 暴露给 steal + +说明: + +- `bthread_butex_wake_within(ctx, butex)` 仍会做 `home TaskGroup == 当前 hook TaskGroup` 的 invariant 校验; + 
正常生产路径不应触发错误。 +- `bthread_butex_wait_local(...)` 返回后,task 会恢复默认调度行为(后续 `yield`/调度仍可能被 steal)。 + +## 非保证范围(避免误解) + +下面这些**不在** `bthread_butex_wait_local(...)` 的 wait-scope 本地化保证范围内: + +- `bthread_butex_wait_local(...)` 返回之后的后续调度(例如后面的 `yield`) +- 整个请求处理阶段都固定在同一 worker(本文档当前方案不保证) +- completion “一定会被正确 worker 的 `harvest` 收到”这件事本身 + +最后一条尤其重要:runtime 会在 `bthread_butex_wake_within(...)` 做 wrong-worker invariant 校验,但这只是第二道防线。业务侧(例如 per-worker io_uring)仍要保证 completion ownership 正确。 + +## 调试与测试建议 + +- 先用单 worker 环境验证链路(最容易确认“wait -> harvest -> wake -> resume”) +- 再开启多 worker 验证吞吐和尾延迟 +- 在测试里显式记录: + - waiter 挂起前的 worker 线程 + - hook 执行线程 + - waiter 恢复后的线程 +- 如果出现 `EINVAL`: + - 检查是否误用了多 waiter butex + - 检查是否在 pthread waiter 上使用了 within wake + - 检查 tag / `TaskControl` 是否匹配 + - 检查 completion 是否被错误 worker 的 `harvest` 收割(ownership/routing 问题) + +## 注意事项(务必遵守) + +- `bthread_register_active_task_type()` 必须在 bthread/brpc 初始化前调用 +- active-task hook 内不要阻塞,不要调用会导致调度切换的 bthread API +- 每请求使用私有 butex(不要让多个 waiter 复用同一个 butex) +- `bthread_butex_wait_local(...)` 的本地化保证范围是“本次 wait 生命周期”,不是整个请求处理阶段 +- 请求对象生命周期必须覆盖: + - 提交异步 IO + - completion 收割 + - waiter 恢复读取结果 +- 在 completion 可能与超时/取消竞争的场景,按业务需要处理 `wake_rc == 0`(表示此时 butex 上没有 waiter) +- 建议把 butex 值本身作为完成状态位(先写结果/状态,再 wake),避免丢唤醒 diff --git a/src/bthread/bthread.cpp b/src/bthread/bthread.cpp index ac49f269d9..44f206e0e3 100644 --- a/src/bthread/bthread.cpp +++ b/src/bthread/bthread.cpp @@ -20,6 +20,8 @@ // Date: Tue Jul 10 17:40:58 CST 2012 #include +#include +#include #include #include "butil/macros.h" // BAIDU_CASSERT #include "butil/logging.h" @@ -85,6 +87,8 @@ pthread_mutex_t g_task_control_mutex = PTHREAD_MUTEX_INITIALIZER; // Notice that we can't declare the variable as atomic which // are not constructed before main(). 
TaskControl* g_task_control = NULL; +static pthread_mutex_t g_active_task_registry_mutex = PTHREAD_MUTEX_INITIALIZER; +static std::vector g_active_task_types; extern BAIDU_THREAD_LOCAL TaskGroup* tls_task_group; EXTERN_BAIDU_VOLATILE_THREAD_LOCAL(TaskGroup*, tls_task_group); @@ -96,6 +100,89 @@ inline TaskControl* get_task_control() { return g_task_control; } +static bool normalize_active_task_type(const bthread_active_task_type_t* in, + bthread_active_task_type_t* out) { + if (in == NULL || out == NULL) { + return false; + } + if (in->struct_size < sizeof(bthread_active_task_type_t)) { + return false; + } + if (in->name == NULL || in->name[0] == '\0') { + return false; + } + if (in->worker_init == NULL && in->worker_destroy == NULL && + in->harvest == NULL) { + return false; + } + memset(out, 0, sizeof(*out)); + memcpy(out, in, sizeof(*out)); + out->struct_size = sizeof(*out); + return true; +} + +void get_active_task_types_snapshot(std::vector* out) { + if (out == NULL) { + return; + } + BAIDU_SCOPED_LOCK(g_active_task_registry_mutex); + *out = g_active_task_types; +} + +static inline TaskMeta* current_normal_bthread_for_local_pin(int* err) { + TaskGroup* g = tls_task_group; + if (g == NULL || g->is_current_main_task() || g->is_current_pthread_task()) { + if (err) { + *err = EPERM; + } + return NULL; + } + if (err) { + *err = 0; + } + return g->current_task(); +} + +static inline int enter_local_pin_scope(TaskMeta* m, TaskGroup* g) { + if (m == NULL || g == NULL) { + return EINVAL; + } + if (!m->local_pin_enabled) { + m->local_pin_home_group = g; + m->local_pin_home_control = g->control(); + m->local_pin_home_tag = g->tag(); + m->local_pin_depth = 1; + m->local_pin_enabled = true; + return 0; + } + if (m->local_pin_home_group != g || + m->local_pin_home_control != g->control() || + m->local_pin_home_tag != g->tag()) { + return EPERM; + } + if (m->local_pin_depth == std::numeric_limits::max()) { + return EINVAL; + } + ++m->local_pin_depth; + return 0; +} + +static 
inline int leave_local_pin_scope(TaskMeta* m) { + if (m == NULL) { + return EINVAL; + } + if (!m->local_pin_enabled || m->local_pin_depth == 0) { + return EINVAL; + } + if (--m->local_pin_depth == 0) { + m->local_pin_enabled = false; + m->local_pin_home_group = NULL; + m->local_pin_home_control = NULL; + m->local_pin_home_tag = BTHREAD_TAG_INVALID; + } + return 0; +} + inline TaskControl* get_or_new_task_control() { butil::atomic* p = (butil::atomic*)&g_task_control; TaskControl* c = p->load(butil::memory_order_consume); @@ -146,6 +233,11 @@ bthread_t init_for_pthread_stack_trace() { } pthread_fake_meta->attr = BTHREAD_ATTR_PTHREAD; + pthread_fake_meta->local_pin_home_group = NULL; + pthread_fake_meta->local_pin_home_control = NULL; + pthread_fake_meta->local_pin_home_tag = BTHREAD_TAG_INVALID; + pthread_fake_meta->local_pin_depth = 0; + pthread_fake_meta->local_pin_enabled = false; pthread_fake_meta->tid = make_tid(*pthread_fake_meta->version_butex, slot); // Make TaskTracer use signal trace mode for pthread. 
c->_task_tracer.set_running_status(syscall(SYS_gettid), pthread_fake_meta); @@ -328,6 +420,55 @@ struct TidJoiner { extern "C" { +int bthread_register_active_task_type(const bthread_active_task_type_t* type) { + bthread_active_task_type_t normalized; + if (!bthread::normalize_active_task_type(type, &normalized)) { + return EINVAL; + } + BAIDU_SCOPED_LOCK(bthread::g_task_control_mutex); + if (bthread::get_task_control() != NULL) { + return EPERM; + } + BAIDU_SCOPED_LOCK(bthread::g_active_task_registry_mutex); + bthread::g_active_task_types.push_back(normalized); + return 0; +} + +int bthread_butex_wake_within(const bthread_active_task_ctx_t* ctx, + void* butex) { + return bthread::TaskGroup::butex_wake_within_active_task(ctx, butex); +} + +int bthread_butex_wait_local(void* butex, int expected_value, + const struct timespec* abstime) { + if (butex == NULL) { + errno = EINVAL; + return -1; + } + int err = 0; + bthread::TaskMeta* m = bthread::current_normal_bthread_for_local_pin(&err); + if (m == NULL) { + errno = err; + return -1; + } + bthread::TaskGroup* g = bthread::tls_task_group; + err = bthread::enter_local_pin_scope(m, g); + if (err != 0) { + errno = err; + return -1; + } + const int rc = bthread::butex_wait(butex, expected_value, abstime); + const int saved_errno = errno; + const int leave_err = bthread::leave_local_pin_scope(m); + if (leave_err != 0) { + LOG(ERROR) << "Fail to leave local pin scope after bthread_butex_wait_local"; + errno = leave_err; + return -1; + } + errno = saved_errno; + return rc; +} + int bthread_start_urgent(bthread_t* __restrict tid, const bthread_attr_t* __restrict attr, void * (*fn)(void*), diff --git a/src/bthread/butex.cpp b/src/bthread/butex.cpp index aca1281670..ffa43f7e5f 100644 --- a/src/bthread/butex.cpp +++ b/src/bthread/butex.cpp @@ -95,6 +95,7 @@ struct ButexBthreadWaiter : public ButexWaiter { WaiterState waiter_state; int expected_value; Butex* initial_butex; + TaskGroup* home_group; TaskControl* control; const 
timespec* abstime; bthread_tag_t tag; @@ -295,6 +296,12 @@ inline TaskGroup* get_task_group(TaskControl* c, bthread_tag_t tag) { } inline void run_in_local_task_group(TaskGroup* g, TaskMeta* next_meta, bool nosignal) { + // Pinned tasks must go through pin-aware routing even on same-tag local fast + // paths, otherwise TaskGroup::exchange() may resume them on the wrong worker. + if (next_meta->local_pin_enabled && next_meta->local_pin_depth > 0) { + g->ready_to_run(next_meta, nosignal); + return; + } if (!nosignal) { TaskGroup::exchange(&g, next_meta); } else { @@ -302,6 +309,44 @@ inline void run_in_local_task_group(TaskGroup* g, TaskMeta* next_meta, bool nosi } } +int butex_wake_to_task_group(void* arg, TaskGroup* target_group) { + if (arg == NULL || target_group == NULL) { + errno = EINVAL; + return -1; + } + Butex* b = container_of(static_cast*>(arg), Butex, value); + ButexBthreadWaiter* bbw = NULL; + { + BAIDU_SCOPED_LOCK(b->waiter_lock); + if (b->waiters.empty()) { + return 0; + } + butil::LinkNode* head = b->waiters.head(); + if (head->next() != b->waiters.end()) { + errno = EINVAL; + return -1; + } + ButexWaiter* bw = head->value(); + if (bw->tid == 0) { + errno = EINVAL; + return -1; + } + bbw = static_cast(bw); + if (bbw->home_group != target_group || + bbw->control != target_group->control() || + bbw->tag != target_group->tag()) { + errno = EINVAL; + return -1; + } + bw->RemoveFromList(); + bw->container.store(NULL, butil::memory_order_relaxed); + } + + unsleep_if_necessary(bbw, get_global_timer_thread()); + target_group->ready_to_run(bbw->task_meta, true); + return 1; +} + int butex_wake(void* arg, bool nosignal) { Butex* b = container_of(static_cast*>(arg), Butex, value); ButexWaiter* front = NULL; @@ -490,10 +535,10 @@ int butex_requeue(void* arg, void* arg2) { ButexBthreadWaiter* bbw = static_cast(front); unsleep_if_necessary(bbw, get_global_timer_thread()); auto g = is_same_tag(bbw->tag) ? 
tls_task_group : NULL; - if (g) { + if (g && !(bbw->task_meta->local_pin_enabled && bbw->task_meta->local_pin_depth > 0)) { TaskGroup::exchange(&g, bbw->task_meta); } else { - bbw->control->choose_one_group(bbw->tag)->ready_to_run_remote(bbw->task_meta); + get_task_group(bbw->control, bbw->tag)->ready_to_run_general(bbw->task_meta); } return 1; } @@ -566,6 +611,9 @@ void wait_for_butex(void* arg) { // value. { BAIDU_SCOPED_LOCK(b->waiter_lock); + if (bw->task_meta->local_pin_enabled && bw->task_meta->local_pin_depth > 0) { + DCHECK_EQ(bw->task_meta->local_pin_home_group, bw->home_group); + } if (b->value.load(butil::memory_order_relaxed) != bw->expected_value) { bw->waiter_state = WAITER_STATE_UNMATCHEDVALUE; } else if (bw->waiter_state == WAITER_STATE_READY/*1*/ && @@ -680,15 +728,27 @@ int butex_wait(void* arg, int expected_value, const timespec* abstime, bool prep if (NULL == g || g->is_current_pthread_task()) { return butex_wait_from_pthread(g, b, expected_value, abstime, prepend); } + TaskMeta* current = g->current_task(); + if (current->local_pin_enabled && current->local_pin_depth > 0) { + if (current->local_pin_home_group != g || + current->local_pin_home_control != g->control() || + current->local_pin_home_tag != g->tag()) { + errno = EPERM; + return -1; + } + } ButexBthreadWaiter bbw; // tid is 0 iff the thread is non-bthread bbw.tid = g->current_tid(); bbw.container.store(NULL, butil::memory_order_relaxed); - bbw.task_meta = g->current_task(); + bbw.task_meta = current; bbw.sleep_id = 0; bbw.waiter_state = WAITER_STATE_READY; bbw.expected_value = expected_value; bbw.initial_butex = b; + bbw.home_group = (current->local_pin_enabled && current->local_pin_depth > 0) + ? 
current->local_pin_home_group + : g; bbw.control = g->control(); bbw.abstime = abstime; bbw.tag = g->tag(); diff --git a/src/bthread/butex.h b/src/bthread/butex.h index bf86611ea6..2e7a195d7f 100644 --- a/src/bthread/butex.h +++ b/src/bthread/butex.h @@ -29,6 +29,8 @@ namespace bthread { +class TaskGroup; + // If a thread would suspend for less than so many microseconds, return // ETIMEDOUT directly. // Use 1: sleeping for less than 2 microsecond is inefficient and useless. @@ -67,6 +69,12 @@ int butex_wake_all(void* butex, bool nosignal = false); // Returns # of threads woken up. int butex_wake_except(void* butex, bthread_t excluded_bthread); +// Internal helper used by active-task within wake APIs. Explicitly enqueue the +// single resumed bthread into `target_group` local queue with nosignal. +// Returns 0 when there is no waiter, 1 when one waiter is woken, -1 on +// failure and sets errno. +int butex_wake_to_task_group(void* butex, TaskGroup* target_group); + // Wake up at most 1 thread waiting on |butex1|, let all other threads wait // on |butex2| instead. // Returns # of threads woken up. diff --git a/src/bthread/parking_lot.h b/src/bthread/parking_lot.h index bbc9a7c3fd..127dad4b64 100644 --- a/src/bthread/parking_lot.h +++ b/src/bthread/parking_lot.h @@ -65,6 +65,11 @@ class BAIDU_CACHELINE_ALIGNMENT ParkingLot { // Wait for tasks. // If the `expected_state' does not match, wait() may finish directly. void wait(const State& expected_state) { + wait(expected_state, NULL); + } + + // Wait for tasks with an optional relative timeout. + void wait(const State& expected_state, const timespec* timeout) { if (get_state().val != expected_state.val) { // Fast path, no need to futex_wait. 
return; @@ -72,7 +77,7 @@ class BAIDU_CACHELINE_ALIGNMENT ParkingLot { if (_no_signal_when_no_waiter) { _waiter_num.fetch_add(1, butil::memory_order_relaxed); } - futex_wait_private(&_pending_signal, expected_state.val, NULL); + futex_wait_private(&_pending_signal, expected_state.val, timeout); if (_no_signal_when_no_waiter) { _waiter_num.fetch_sub(1, butil::memory_order_relaxed); } diff --git a/src/bthread/task_control.cpp b/src/bthread/task_control.cpp index ba067e3976..fbcdedf3e9 100644 --- a/src/bthread/task_control.cpp +++ b/src/bthread/task_control.cpp @@ -109,7 +109,9 @@ void* TaskControl::worker_thread(void* arg) { int worker_id = c->_next_worker_id.fetch_add( 1, butil::memory_order_relaxed); + int bound_cpu = -1; if (!c->_cpus.empty()) { + bound_cpu = static_cast(c->_cpus[worker_id % c->_cpus.size()]); bind_thread_to_cpu(pthread_self(), c->_cpus[worker_id % c->_cpus.size()]); } if (FLAGS_task_group_set_worker_name) { @@ -119,12 +121,20 @@ void* TaskControl::worker_thread(void* arg) { } BT_VLOG << "Created worker=" << pthread_self() << " tid=" << g->_tid << " bthread=" << g->main_tid() << " tag=" << g->tag(); + g->_worker_index = worker_id; + g->_bound_cpu = bound_cpu; tls_task_group = g; + if (g->init_active_tasks_for_worker() != 0) { + LOG(FATAL) << "Fail to init active tasks in pthread=" << pthread_self(); + return NULL; + } c->_nworkers << 1; c->tag_nworkers(g->tag()) << 1; g->run_main_task(); + g->destroy_active_tasks_for_worker(); + stat = g->main_stat(); BT_VLOG << "Destroying worker=" << pthread_self() << " bthread=" << g->main_tid() << " idle=" << stat.cputime_ns / 1000000.0 @@ -228,6 +238,8 @@ int TaskControl::init(int concurrency) { } } + get_active_task_types_snapshot(&_active_task_types); + // task group group by tags for (int i = 0; i < FLAGS_task_group_ntags; ++i) { _tagged_ngroup[i].store(0, std::memory_order_relaxed); @@ -627,7 +639,9 @@ double TaskControl::get_cumulated_worker_time(bthread_tag_t tag) { const size_t ngroup = 
tag_ngroup(tag).load(butil::memory_order_relaxed); auto& groups = tag_group(tag); for (size_t i = 0; i < ngroup; ++i) { - cputime_ns += groups[i]->cumulated_cputime_ns(); + if (groups[i]) { + cputime_ns += groups[i]->cumulated_cputime_ns(); + } } return cputime_ns / 1000000000.0; } diff --git a/src/bthread/task_control.h b/src/bthread/task_control.h index 4480daa677..2bb045dda8 100644 --- a/src/bthread/task_control.h +++ b/src/bthread/task_control.h @@ -32,6 +32,7 @@ #include #include "butil/atomicops.h" // butil::atomic #include "bvar/bvar.h" // bvar::PassiveStatus +#include "bthread/unstable.h" // active task types #include "bthread/task_tracer.h" #include "bthread/task_meta.h" // TaskMeta #include "bthread/work_stealing_queue.h" // WorkStealingQueue @@ -42,6 +43,10 @@ namespace bthread { class TaskGroup; +// Internal helper implemented in bthread.cpp. TaskControl snapshots active-task +// registrations during init so worker threads can access a stable list. +void get_active_task_types_snapshot(std::vector* out); + // Control all task groups class TaskControl { friend class TaskGroup; @@ -166,6 +171,8 @@ friend bthread_t init_for_pthread_stack_trace(); bool _enable_priority_queue; std::vector> _priority_queues; + std::vector _active_task_types; + size_t _pl_num_of_each_tag; std::vector _tagged_pl; diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index 877a5d406e..785c6c7260 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -63,6 +63,26 @@ DEFINE_bool(bthread_enable_cpu_clock_stat, false, "Enable CPU clock statistics for bthread"); BUTIL_VALIDATE_GFLAG(bthread_enable_cpu_clock_stat, butil::PassValidate); +DEFINE_int32(bthread_active_task_poll_every_nswitch, 1, + "Run active-task maintenance poll in worker main loop every N " + "task switches when active tasks are registered. 
Set to 0 to " + "disable periodic polling on busy workers."); +static bool validate_bthread_active_task_poll_every_nswitch(const char*, int32_t val) { + return val >= 0; +} +BUTIL_VALIDATE_GFLAG(bthread_active_task_poll_every_nswitch, + validate_bthread_active_task_poll_every_nswitch); + +DEFINE_int64(bthread_active_task_idle_wait_ns, 1000 * 1000, + "Active-task worker idle wait interval in nanoseconds when no " + "tasks are runnable. <0 waits indefinitely (only signals), 0 " + "skips parking, >0 uses timed ParkingLot wait."); +static bool validate_bthread_active_task_idle_wait_ns(const char*, int64_t val) { + return val >= -1; +} +BUTIL_VALIDATE_GFLAG(bthread_active_task_idle_wait_ns, + validate_bthread_active_task_idle_wait_ns); + BAIDU_VOLATILE_THREAD_LOCAL(TaskGroup*, tls_task_group, NULL); // Sync with TaskMeta::local_storage when a bthread is created or destroyed. // During running, the two fields may be inconsistent, use tls_bls as the @@ -80,6 +100,9 @@ const TaskStatistics EMPTY_STAT = { 0, 0, 0 }; void* (*g_create_span_func)() = NULL; +static __thread const bthread_active_task_ctx_t* tls_active_task_hook_ctx = NULL; +static __thread TaskGroup* tls_active_task_hook_group = NULL; + void* run_create_span_func() { if (g_create_span_func) { return g_create_span_func(); @@ -161,26 +184,219 @@ bool TaskGroup::is_stopped(bthread_t tid) { return true; } +TaskGroup* TaskGroup::validate_active_task_hook_ctx( + const bthread_active_task_ctx_t* ctx, int* err) { + if (ctx == NULL) { + if (err) { + *err = EINVAL; + } + return NULL; + } + if (ctx != tls_active_task_hook_ctx || tls_active_task_hook_group == NULL) { + if (err) { + *err = EPERM; + } + return NULL; + } + TaskGroup* g = BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group); + if (g == NULL || g != tls_active_task_hook_group || !g->is_current_main_task()) { + if (err) { + *err = EPERM; + } + return NULL; + } + const ActiveTaskCtxImpl* impl = static_cast(ctx->impl); + if (impl == NULL || impl->magic != 
ActiveTaskCtxImpl::MAGIC || impl->group != g) { + if (err) { + *err = EPERM; + } + return NULL; + } + if (err) { + *err = 0; + } + return g; +} + +int TaskGroup::butex_wake_within_active_task(const bthread_active_task_ctx_t* ctx, + void* butex) { + if (butex == NULL) { + errno = EINVAL; + return -1; + } + int err = 0; + TaskGroup* g = validate_active_task_hook_ctx(ctx, &err); + if (g == NULL) { + errno = err; + return -1; + } + return butex_wake_to_task_group(butex, g); +} + +int TaskGroup::init_active_tasks_for_worker() { + _active_task_instances.clear(); + if (_control == NULL || _control->_active_task_types.empty()) { + return 0; + } + try { + _active_task_instances.resize(_control->_active_task_types.size()); + } catch (...) { + return ENOMEM; + } + for (size_t i = 0; i < _active_task_instances.size(); ++i) { + ActiveTaskInstance& inst = _active_task_instances[i]; + inst.type = _control->_active_task_types[i]; + inst.worker_local = NULL; + inst.initialized = false; + inst.impl.magic = ActiveTaskCtxImpl::DEAD_MAGIC; + inst.impl.group = this; + inst.public_ctx.struct_size = sizeof(inst.public_ctx); + inst.public_ctx.tag = _tag; + inst.public_ctx.worker_index = _worker_index; + inst.public_ctx.worker_pthread = _tid; + inst.public_ctx.bound_cpu = _bound_cpu; + inst.public_ctx.reserved0 = 0; + inst.public_ctx.impl = &inst.impl; + inst.impl.magic = ActiveTaskCtxImpl::MAGIC; + if (inst.type.worker_init) { + void* worker_local = NULL; + const int rc = inst.type.worker_init( + &worker_local, &inst.public_ctx, inst.type.user_data); + if (rc != 0) { + inst.impl.magic = ActiveTaskCtxImpl::DEAD_MAGIC; + for (size_t j = i; j > 0; --j) { + ActiveTaskInstance& rollback = _active_task_instances[j - 1]; + if (!rollback.initialized) { + continue; + } + if (rollback.type.worker_destroy) { + rollback.type.worker_destroy(rollback.worker_local, + &rollback.public_ctx, + rollback.type.user_data); + } + rollback.initialized = false; + rollback.public_ctx.impl = NULL; + 
rollback.impl.magic = ActiveTaskCtxImpl::DEAD_MAGIC; + } + _active_task_instances.clear(); + return rc; + } + inst.worker_local = worker_local; + } + inst.initialized = true; + } + return 0; +} + +void TaskGroup::destroy_active_tasks_for_worker() { + for (size_t i = _active_task_instances.size(); i > 0; --i) { + ActiveTaskInstance& inst = _active_task_instances[i - 1]; + if (!inst.initialized) { + continue; + } + if (inst.type.worker_destroy) { + inst.type.worker_destroy(inst.worker_local, + &inst.public_ctx, + inst.type.user_data); + } + inst.initialized = false; + inst.public_ctx.impl = NULL; + inst.impl.magic = ActiveTaskCtxImpl::DEAD_MAGIC; + inst.impl.group = this; + inst.worker_local = NULL; + } + _active_task_instances.clear(); +} + +void TaskGroup::run_active_tasks_harvest(bool* skip_park) { + if (skip_park) { + *skip_park = false; + } + if (_active_task_instances.empty()) { + return; + } + for (size_t i = 0; i < _active_task_instances.size(); ++i) { + ActiveTaskInstance& inst = _active_task_instances[i]; + if (!inst.initialized || inst.type.harvest == NULL) { + continue; + } + const bthread_active_task_ctx_t* saved_ctx = tls_active_task_hook_ctx; + TaskGroup* saved_group = tls_active_task_hook_group; + tls_active_task_hook_ctx = &inst.public_ctx; + tls_active_task_hook_group = this; + const int cb_ret = inst.type.harvest(inst.worker_local, &inst.public_ctx); + tls_active_task_hook_ctx = saved_ctx; + tls_active_task_hook_group = saved_group; + if (skip_park) { + if (cb_ret == 1) { + *skip_park = true; + } else if (cb_ret != 0) { + LOG_EVERY_SECOND(ERROR) + << "active-task harvest returned invalid value=" << cb_ret + << " (expected 0 or 1), treating as 0"; + } + } + } +} + bool TaskGroup::wait_task(bthread_t* tid) { + if (__builtin_expect(_active_task_instances.empty(), 1)) { + do { +#ifndef BTHREAD_DONT_SAVE_PARKING_STATE + if (_last_pl_state.stopped()) { + return false; + } + _pl->wait(_last_pl_state); + if (pop_next_task_local_first(tid)) { + return true; 
+ } +#else + const ParkingLot::State st = _pl->get_state(); + if (st.stopped()) { + return false; + } + if (pop_next_task_local_first(tid)) { + return true; + } + _pl->wait(st); +#endif + } while (true); + } + do { + if (pop_next_task_local_first(tid)) { + return true; + } + bool harvest_skip_park = false; + run_active_tasks_harvest(&harvest_skip_park); + if (pop_next_task_local_first(tid)) { + return true; + } + const int64_t idle_wait_ns = FLAGS_bthread_active_task_idle_wait_ns; + if (harvest_skip_park || idle_wait_ns == 0) { + continue; + } + timespec timeout_ts{}; + const timespec* ptimeout = NULL; + if (idle_wait_ns > 0) { + timeout_ts = butil::nanoseconds_to_timespec(idle_wait_ns); + ptimeout = &timeout_ts; + } #ifndef BTHREAD_DONT_SAVE_PARKING_STATE if (_last_pl_state.stopped()) { return false; } - _pl->wait(_last_pl_state); - if (steal_task(tid)) { - return true; - } + _pl->wait(_last_pl_state, ptimeout); #else const ParkingLot::State st = _pl->get_state(); if (st.stopped()) { return false; } - if (steal_task(tid)) { - return true; - } - _pl->wait(st); + _pl->wait(st, ptimeout); #endif + if (_pl->get_state().stopped()) { + return false; + } } while (true); } @@ -202,6 +418,7 @@ void TaskGroup::run_main_task() { bvar::PassiveStatus cumulated_cputime( get_cumulated_cputime_from_this, this); std::unique_ptr > > usage_bvar; + size_t last_active_task_periodic_poll_nswitch = 0; TaskGroup* dummy = this; bthread_t tid; @@ -212,6 +429,17 @@ void TaskGroup::run_main_task() { if (_cur_meta->tid != _main_tid) { task_runner(1/*skip remained*/); } + if (!_active_task_instances.empty()) { + const int every_nswitch = FLAGS_bthread_active_task_poll_every_nswitch; + if (every_nswitch > 0 && + _nswitch - last_active_task_periodic_poll_nswitch >= + static_cast(every_nswitch)) { + // Busy workers may avoid wait_task() for a long time. Poll + // the unified harvest hook periodically in the worker loop. 
+ last_active_task_periodic_poll_nswitch = _nswitch; + run_active_tasks_harvest(NULL); + } + } if (FLAGS_show_per_worker_usage_in_vars && !usage_bvar) { char name[32]; #if defined(OS_MACOSX) @@ -277,10 +505,18 @@ int TaskGroup::init(size_t runqueue_capacity) { LOG(FATAL) << "Fail to init _rq"; return -1; } + if (_pinned_rq.init(runqueue_capacity) != 0) { + LOG(FATAL) << "Fail to init _pinned_rq"; + return -1; + } if (_remote_rq.init(runqueue_capacity / 2) != 0) { LOG(FATAL) << "Fail to init _remote_rq"; return -1; } + if (_pinned_remote_rq.init(runqueue_capacity / 2) != 0) { + LOG(FATAL) << "Fail to init _pinned_remote_rq"; + return -1; + } #ifdef BUTIL_USE_ASAN void* stack_addr = NULL; @@ -311,6 +547,11 @@ int TaskGroup::init(size_t runqueue_capacity) { m->cpuwide_start_ns = butil::cpuwide_time_ns(); m->stat = EMPTY_STAT; m->attr = BTHREAD_ATTR_TASKGROUP; + m->local_pin_home_group = NULL; + m->local_pin_home_control = NULL; + m->local_pin_home_tag = BTHREAD_TAG_INVALID; + m->local_pin_depth = 0; + m->local_pin_enabled = false; m->tid = make_tid(*m->version_butex, slot); m->set_stack(stk); @@ -421,6 +662,15 @@ void TaskGroup::task_runner(intptr_t skip_remained) { // return_KeyTable, the group is probably changed. g = BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group); + if (m->local_pin_enabled || m->local_pin_depth != 0 || + m->local_pin_home_group != NULL || m->local_pin_home_control != NULL) { + m->local_pin_enabled = false; + m->local_pin_depth = 0; + m->local_pin_home_group = NULL; + m->local_pin_home_control = NULL; + m->local_pin_home_tag = BTHREAD_TAG_INVALID; + } + // Increase the version and wake up all joiners, if resulting version // is 0, change it to 1 to make bthread_t never be 0. Any access // or join to the bthread after changing version will be rejected. 
@@ -499,6 +749,11 @@ int TaskGroup::start_foreground(TaskGroup** pg, } m->cpuwide_start_ns = start_ns; m->stat = EMPTY_STAT; + m->local_pin_home_group = NULL; + m->local_pin_home_control = NULL; + m->local_pin_home_tag = BTHREAD_TAG_INVALID; + m->local_pin_depth = 0; + m->local_pin_enabled = false; m->tid = make_tid(*m->version_butex, slot); *th = m->tid; if (using_attr.flags & BTHREAD_LOG_START_AND_FINISH) { @@ -564,6 +819,11 @@ int TaskGroup::start_background(bthread_t* __restrict th, } m->cpuwide_start_ns = start_ns; m->stat = EMPTY_STAT; + m->local_pin_home_group = NULL; + m->local_pin_home_control = NULL; + m->local_pin_home_tag = BTHREAD_TAG_INVALID; + m->local_pin_depth = 0; + m->local_pin_enabled = false; m->tid = make_tid(*m->version_butex, slot); *th = m->tid; if (using_attr.flags & BTHREAD_LOG_START_AND_FINISH) { @@ -636,19 +896,38 @@ TaskStatistics TaskGroup::main_stat() const { return m ? m->stat : EMPTY_STAT; } +bool TaskGroup::is_locally_pinned_task(const TaskMeta* meta) { + return meta != NULL && meta->local_pin_enabled && meta->local_pin_depth > 0 && + meta->local_pin_home_group != NULL; +} + +bool TaskGroup::pop_next_task_local_first(bthread_t* tid) { + if (_pinned_rq.pop(tid)) { + return true; + } + if (_pinned_remote_rq.pop(tid)) { + return true; + } +#ifndef BTHREAD_FAIR_WSQ + if (_rq.pop(tid)) { + return true; + } +#else + if (_rq.steal(tid)) { + return true; + } +#endif + if (_remote_rq.pop(tid)) { + return true; + } + return steal_task_from_others(tid); +} + void TaskGroup::ending_sched(TaskGroup** pg) { TaskGroup* g = *pg; bthread_t next_tid = 0; // Find next task to run, if none, switch to idle thread of the group. 
-#ifndef BTHREAD_FAIR_WSQ - // When BTHREAD_FAIR_WSQ is defined, profiling shows that cpu cost of - // WSQ::steal() in example/multi_threaded_echo_c++ changes from 1.9% - // to 2.9% - const bool popped = g->_rq.pop(&next_tid); -#else - const bool popped = g->_rq.steal(&next_tid); -#endif - if (!popped && !g->steal_task(&next_tid)) { + if (!g->pop_next_task_local_first(&next_tid)) { // Jump to main task if there's no task to run. next_tid = g->_main_tid; } @@ -688,12 +967,7 @@ void TaskGroup::sched(TaskGroup** pg) { TaskGroup* g = *pg; bthread_t next_tid = 0; // Find next task to run, if none, switch to idle thread of the group. -#ifndef BTHREAD_FAIR_WSQ - const bool popped = g->_rq.pop(&next_tid); -#else - const bool popped = g->_rq.steal(&next_tid); -#endif - if (!popped && !g->steal_task(&next_tid)) { + if (!g->pop_next_task_local_first(&next_tid)) { // Jump to main task if there's no task to run. next_tid = g->_main_tid; } @@ -814,7 +1088,7 @@ void TaskGroup::destroy_self() { } -void TaskGroup::ready_to_run(TaskMeta* meta, bool nosignal) { +void TaskGroup::ready_to_run_local_raw(TaskMeta* meta, bool nosignal) { #ifdef BRPC_BTHREAD_TRACER _control->_task_tracer.set_status(TASK_STATUS_READY, meta); #endif // BRPC_BTHREAD_TRACER @@ -829,6 +1103,38 @@ void TaskGroup::ready_to_run(TaskMeta* meta, bool nosignal) { } } +void TaskGroup::ready_to_run_pinned_local(TaskMeta* meta, bool nosignal) { +#ifdef BRPC_BTHREAD_TRACER + _control->_task_tracer.set_status(TASK_STATUS_READY, meta); +#endif // BRPC_BTHREAD_TRACER + push_pinned_rq(meta->tid); + if (nosignal) { + ++_num_nosignal; + } else { + const int additional_signal = _num_nosignal; + _num_nosignal = 0; + _nsignaled += 1 + additional_signal; + _control->signal_task(1 + additional_signal, _tag); + } +} + +void TaskGroup::ready_to_run(TaskMeta* meta, bool nosignal) { + if (is_locally_pinned_task(meta)) { + TaskGroup* home = meta->local_pin_home_group; + if (home == NULL) { + LOG(FATAL) << "Pinned task " << meta->tid << 
" has NULL home_group";
+            return;
+        }
+        if (BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group) == home) {
+            home->ready_to_run_pinned_local(meta, nosignal);
+        } else {
+            home->ready_to_run_pinned_remote(meta, nosignal);
+        }
+        return;
+    }
+    ready_to_run_local_raw(meta, nosignal);
+}
+
 void TaskGroup::flush_nosignal_tasks() {
     const int val = _num_nosignal;
     if (val) {
@@ -838,7 +1144,7 @@ void TaskGroup::flush_nosignal_tasks() {
     }
 }
 
-void TaskGroup::ready_to_run_remote(TaskMeta* meta, bool nosignal) {
+void TaskGroup::ready_to_run_remote_raw(TaskMeta* meta, bool nosignal) {
 #ifdef BRPC_BTHREAD_TRACER
     _control->_task_tracer.set_status(TASK_STATUS_READY, meta);
 #endif // BRPC_BTHREAD_TRACER
@@ -862,6 +1168,40 @@ void TaskGroup::ready_to_run_remote(TaskMeta* meta, bool nosignal) {
     }
 }
 
+void TaskGroup::ready_to_run_pinned_remote(TaskMeta* meta, bool nosignal) {
+#ifdef BRPC_BTHREAD_TRACER
+    _control->_task_tracer.set_status(TASK_STATUS_READY, meta);
+#endif // BRPC_BTHREAD_TRACER
+    (void)nosignal; // deliberately ignored: pinned cross-worker wakeups always signal for correctness.
+ _pinned_remote_rq._mutex.lock(); + while (!_pinned_remote_rq.push_locked(meta->tid)) { + _pinned_remote_rq._mutex.unlock(); + LOG_EVERY_SECOND(ERROR) << "_pinned_remote_rq is full, capacity=" + << _pinned_remote_rq.capacity(); + ::usleep(1000); + _pinned_remote_rq._mutex.lock(); + } + _pinned_remote_rq._mutex.unlock(); + _control->signal_task(1, _tag); +} + +void TaskGroup::ready_to_run_remote(TaskMeta* meta, bool nosignal) { + if (is_locally_pinned_task(meta)) { + TaskGroup* home = meta->local_pin_home_group; + if (home == NULL) { + LOG(FATAL) << "Pinned task " << meta->tid << " has NULL home_group"; + return; + } + if (BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group) == home) { + home->ready_to_run_pinned_local(meta, nosignal); + } else { + home->ready_to_run_pinned_remote(meta, nosignal); + } + return; + } + ready_to_run_remote_raw(meta, nosignal); +} + void TaskGroup::flush_nosignal_tasks_remote_locked(butil::Mutex& locked_mutex) { const int val = _remote_num_nosignal; if (!val) { @@ -875,6 +1215,19 @@ void TaskGroup::flush_nosignal_tasks_remote_locked(butil::Mutex& locked_mutex) { } void TaskGroup::ready_to_run_general(TaskMeta* meta, bool nosignal) { + if (is_locally_pinned_task(meta)) { + TaskGroup* home = meta->local_pin_home_group; + if (home == NULL) { + LOG(FATAL) << "Pinned task " << meta->tid << " has NULL home_group"; + return; + } + if (BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group) == home) { + home->ready_to_run_pinned_local(meta, nosignal); + } else { + home->ready_to_run_pinned_remote(meta, nosignal); + } + return; + } if (tls_task_group == this) { return ready_to_run(meta, nosignal); } @@ -895,15 +1248,34 @@ void TaskGroup::ready_to_run_in_worker(void* args_in) { void TaskGroup::ready_to_run_in_worker_ignoresignal(void* args_in) { ReadyToRunArgs* args = static_cast(args_in); + return tls_task_group->ready_to_run_ignoresignal_pinaware(args->meta); +} + +void TaskGroup::ready_to_run_ignoresignal_pinaware(TaskMeta* meta) { #ifdef 
BRPC_BTHREAD_TRACER - tls_task_group->_control->_task_tracer.set_status( - TASK_STATUS_READY, args->meta); + _control->_task_tracer.set_status(TASK_STATUS_READY, meta); #endif // BRPC_BTHREAD_TRACER - return tls_task_group->push_rq(args->meta->tid); + if (!is_locally_pinned_task(meta)) { + push_rq(meta->tid); + return; + } + TaskGroup* home = meta->local_pin_home_group; + if (home == NULL) { + LOG(FATAL) << "Pinned task " << meta->tid << " has NULL home_group"; + return; + } + if (this == home) { + push_pinned_rq(meta->tid); + return; + } + home->ready_to_run_pinned_remote(meta, true); } void TaskGroup::priority_to_run(void* args_in) { ReadyToRunArgs* args = static_cast(args_in); + if (is_locally_pinned_task(args->meta)) { + return tls_task_group->ready_to_run_ignoresignal_pinaware(args->meta); + } #ifdef BRPC_BTHREAD_TRACER tls_task_group->_control->_task_tracer.set_status( TASK_STATUS_READY, args->meta); diff --git a/src/bthread/task_group.h b/src/bthread/task_group.h index 54140c0dc2..bd86af58f3 100644 --- a/src/bthread/task_group.h +++ b/src/bthread/task_group.h @@ -154,6 +154,9 @@ class TaskGroup { static void set_stopped(bthread_t tid); static bool is_stopped(bthread_t tid); + static int butex_wake_within_active_task(const bthread_active_task_ctx_t* ctx, + void* butex); + // The bthread running run_main_task(); bthread_t main_tid() const { return _main_tid; } TaskStatistics main_stat() const; @@ -211,6 +214,7 @@ class TaskGroup { // Push a task into _rq, if _rq is full, retry after some time. This // process make go on indefinitely. void push_rq(bthread_t tid); + void push_pinned_rq(bthread_t tid); // Returns size of local run queue. 
size_t rq_size() const { @@ -301,6 +305,11 @@ friend class TaskControl; explicit TaskGroup(TaskControl* c); int init(size_t runqueue_capacity); + static TaskGroup* validate_active_task_hook_ctx( + const bthread_active_task_ctx_t* ctx, int* err); + int init_active_tasks_for_worker(); + void destroy_active_tasks_for_worker(); + void run_active_tasks_harvest(bool* skip_park); // You shall call destroy_selfm() instead of destructor because deletion // of groups are postponed to avoid race. @@ -322,20 +331,30 @@ friend class TaskControl; static void ready_to_run_in_worker(void*); static void ready_to_run_in_worker_ignoresignal(void*); static void priority_to_run(void*); + void ready_to_run_local_raw(TaskMeta* meta, bool nosignal); + void ready_to_run_remote_raw(TaskMeta* meta, bool nosignal); + void ready_to_run_pinned_local(TaskMeta* meta, bool nosignal); + void ready_to_run_pinned_remote(TaskMeta* meta, bool nosignal); + void ready_to_run_ignoresignal_pinaware(TaskMeta* meta); + static bool is_locally_pinned_task(const TaskMeta* meta); // Wait for a task to run. // Returns true on success, false is treated as permanent error and the // loop calling this function should end. 
bool wait_task(bthread_t* tid); + bool pop_next_task_local_first(bthread_t* tid); + bool steal_task_from_others(bthread_t* tid) { +#ifndef BTHREAD_DONT_SAVE_PARKING_STATE + _last_pl_state = _pl->get_state(); +#endif + return _control->steal_task(tid, &_steal_seed, _steal_offset); + } bool steal_task(bthread_t* tid) { if (_remote_rq.pop(tid)) { return true; } -#ifndef BTHREAD_DONT_SAVE_PARKING_STATE - _last_pl_state = _pl->get_state(); -#endif - return _control->steal_task(tid, &_steal_seed, _steal_offset); + return steal_task_from_others(tid); } void set_tag(bthread_tag_t tag) { _tag = tag; } @@ -346,6 +365,21 @@ friend class TaskControl; return g->_main_tid == tid; } + struct ActiveTaskCtxImpl { + static const uint64_t MAGIC = 0x4252504341544b31ULL; // "BRPCATK1" + static const uint64_t DEAD_MAGIC = 0x4252504341544b30ULL; // "BRPCATK0" + uint64_t magic{DEAD_MAGIC}; + TaskGroup* group{NULL}; + }; + + struct ActiveTaskInstance { + bthread_active_task_type_t type{}; + void* worker_local{NULL}; + bool initialized{false}; + ActiveTaskCtxImpl impl; + bthread_active_task_ctx_t public_ctx{}; + }; + TaskMeta* _cur_meta{NULL}; // the control that this group belongs to @@ -369,7 +403,9 @@ friend class TaskControl; ContextualStack* _main_stack{NULL}; bthread_t _main_tid{INVALID_BTHREAD}; WorkStealingQueue _rq; + WorkStealingQueue _pinned_rq; RemoteTaskQueue _remote_rq; + RemoteTaskQueue _pinned_remote_rq; int _remote_num_nosignal{0}; int _remote_nsignaled{0}; @@ -379,6 +415,9 @@ friend class TaskControl; // Worker thread id. 
pthread_t _tid{}; + uint32_t _worker_index{0}; + int32_t _bound_cpu{-1}; + std::vector _active_task_instances; }; } // namespace bthread diff --git a/src/bthread/task_group_inl.h b/src/bthread/task_group_inl.h index faa5683b6c..8f42c4e267 100644 --- a/src/bthread/task_group_inl.h +++ b/src/bthread/task_group_inl.h @@ -101,6 +101,15 @@ inline void TaskGroup::push_rq(bthread_t tid) { } } +inline void TaskGroup::push_pinned_rq(bthread_t tid) { + while (!_pinned_rq.push(tid)) { + flush_nosignal_tasks(); + LOG_EVERY_SECOND(ERROR) << "_pinned_rq is full, capacity=" + << _pinned_rq.capacity(); + ::usleep(1000); + } +} + inline void TaskGroup::flush_nosignal_tasks_remote() { if (_remote_num_nosignal) { _remote_rq._mutex.lock(); diff --git a/src/bthread/task_meta.h b/src/bthread/task_meta.h index 1b77c0b601..38a9e15493 100644 --- a/src/bthread/task_meta.h +++ b/src/bthread/task_meta.h @@ -31,6 +31,9 @@ namespace bthread { +class TaskControl; +class TaskGroup; + struct TaskStatistics { int64_t cputime_ns; int64_t nswitch; @@ -117,6 +120,15 @@ struct TaskMeta { // Worker thread id. pthread_t worker_tid{}; + // Pin current task to a specific worker(TaskGroup). When enabled, runnable + // transitions must route back to local_pin_home_group and must not be + // stealable until local_pin_depth drops to 0. 
+ TaskGroup* local_pin_home_group{NULL}; + TaskControl* local_pin_home_control{NULL}; + bthread_tag_t local_pin_home_tag{BTHREAD_TAG_INVALID}; + uint16_t local_pin_depth{0}; + bool local_pin_enabled{false}; + public: // Only initialize [Not Reset] fields, other fields will be reset in // bthread_start* functions diff --git a/src/bthread/unstable.h b/src/bthread/unstable.h index 4580202f87..2000910b89 100644 --- a/src/bthread/unstable.h +++ b/src/bthread/unstable.h @@ -22,6 +22,7 @@ #ifndef BTHREAD_UNSTABLE_H #define BTHREAD_UNSTABLE_H +#include #include #include #include "bthread/types.h" @@ -100,6 +101,79 @@ extern int bthread_set_create_span_func(void* (*func)()); // suspend indefinitely. extern void bthread_stop_world(); +// Active task callback context. This structure is only valid during callbacks +// registered by bthread_register_active_task_type(). +typedef struct bthread_active_task_ctx_t { + size_t struct_size; + bthread_tag_t tag; + uint32_t worker_index; + pthread_t worker_pthread; + int32_t bound_cpu; // -1 when unknown or not bound. + uint32_t reserved0; + void* impl; // internal opaque pointer, pass only to active-task within APIs. +} bthread_active_task_ctx_t; + +typedef struct bthread_active_task_type_t { + size_t struct_size; + const char* name; + void* user_data; + + // Called once for each worker pthread after TaskGroup is created and + // tls_task_group is set, before entering the worker scheduling loop. + int (*worker_init)(void** worker_local, + const bthread_active_task_ctx_t* ctx, + void* user_data); + + // Called once for each worker pthread before the worker exits. + void (*worker_destroy)(void* worker_local, + const bthread_active_task_ctx_t* ctx, + void* user_data); + + // Called by the worker scheduler loop as a non-blocking maintenance hook to + // harvest completions and wake local waiters. The runtime may call this + // hook in multiple places (for example before parking, after wakeup and + // periodic busy-loop polling). 
Return 1 to ask the worker loop to retry
+    // without entering ParkingLot wait in the current iteration, return 0
+    // otherwise.
+    int (*harvest)(
+            void* worker_local, const bthread_active_task_ctx_t* ctx);
+} bthread_active_task_type_t;
+
+// Register an active-task type. This function must be called before bthread
+// TaskControl is initialized.
+extern int bthread_register_active_task_type(
+    const bthread_active_task_type_t* type);
+
+// Active-task callbacks are constrained to simple non-blocking maintenance
+// logic (for example completion harvesting + local waiter wakeup). Creating
+// new bthreads from active-task callbacks is intentionally unsupported.
+//
+// Wake the single waiter on `butex' from inside the current active-task
+// callback and enqueue the resumed bthread into the current hook worker's
+// local queue explicitly (nosignal=true, no immediate switch inside hook).
+//
+// This API is designed for per-request private butex usage:
+// - 0 waiters on `butex': return 0
+// - 1 waiter and it is a same-TaskControl/same-tag bthread waiter: return 1
+// - otherwise (multiple waiters / pthread waiter / cross-tag / cross-control):
+//   return -1 and set errno=EINVAL
+//
+// Calling this API outside active-task harvest callbacks returns -1 and sets
+// errno=EPERM.
+extern int bthread_butex_wake_within(const bthread_active_task_ctx_t* ctx,
+                                     void* butex);
+
+// Wait on butex with an implicit wait-scope local pin. Semantics are the same
+// as butex_wait (including timeout/interruption), but the runtime temporarily
+// pins the current bthread to the current worker for this wait operation so the
+// resumed bthread is routed back to the home worker and is not stealable before
+// bthread_butex_wait_local() returns.
+//
+// Returns 0 on success, -1 otherwise and errno is set.
+// - EPERM: not running inside a normal bthread worker task +extern int bthread_butex_wait_local(void* butex, int expected_value, + const struct timespec* abstime); + // Create a bthread_key_t with an additional arg to destructor. // Generally the dtor_arg is for passing the creator of data so that we can // return the data back to the creator in destructor. Without this arg, we diff --git a/test/bthread_active_task_unittest.cpp b/test/bthread_active_task_unittest.cpp new file mode 100644 index 0000000000..0bc9834f01 --- /dev/null +++ b/test/bthread_active_task_unittest.cpp @@ -0,0 +1,1470 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "butil/atomicops.h" +#include "butil/time.h" +#include "bthread/bthread.h" +#include "bthread/butex.h" +#include "bthread/task_control.h" +#include "bthread/task_group.h" +#include "bthread/unstable.h" + +namespace bthread { +DECLARE_int32(bthread_active_task_poll_every_nswitch); +DECLARE_int64(bthread_active_task_idle_wait_ns); +} + +namespace { + +enum TestMode { + TEST_MODE_IDLE = 0, + TEST_MODE_IDLE_WAIT_INTERVAL = 1, + TEST_MODE_BUTEX_WAKE_WITHIN = 2, + TEST_MODE_BUTEX_WAKE_WITHIN_NULL = 3, + TEST_MODE_BUTEX_WAKE_WITHIN_NO_WAITER = 4, + TEST_MODE_BUTEX_WAKE_WITHIN_PTHREAD_WAITER = 5, + TEST_MODE_BUSY_PERIODIC_POLL_WAKE = 6, + TEST_MODE_SCENARIO_REQ_WAKE = 7, + TEST_MODE_SCENARIO_REQ_WAKE_BUSY_PERIODIC = 8, + TEST_MODE_BUTEX_WAKE_WITHIN_STRICT_CROSS_WORKER_REJECT = 9, +}; + +struct PerWorkerState { +}; + +struct MockReqCtx { + MockReqCtx() + : butex(NULL) + , result_ready(0) + , wake_rc(0) + , wake_errno(0) + , waiter_ready(0) + , waiter_done(0) + , resume_saw_result_ready(0) + , wait_rc(0) + , wait_errno(0) + , waiter_worker_pthread(0) + , resume_worker_pthread(0) + , hook_worker_pthread(0) + , completion_published(0) {} + + void* butex; + std::atomic result_ready; + std::atomic wake_rc; + std::atomic wake_errno; + std::atomic waiter_ready; + std::atomic waiter_done; + std::atomic resume_saw_result_ready; + std::atomic wait_rc; + std::atomic wait_errno; + std::atomic waiter_worker_pthread; + std::atomic resume_worker_pthread; + std::atomic hook_worker_pthread; + std::atomic completion_published; +}; + +struct ActiveTaskTestState { + ActiveTaskTestState() + : mode(TEST_MODE_IDLE) + , init_calls(0) + , destroy_calls(0) + , harvest_calls(0) + , butex_ptr(0) + , pending_req_ptr(0) + , target_hook_worker_pthread(0) + , butex_expected_waiters(0) + , butex_wake_started(0) + , butex_wake_completed(0) + , butex_wake_rc(0) + 
, butex_wake_errno(0) + , hook_wake_harvest_calls(0) + , hook_action_inflight(0) + , butex_waiter_ready_count(0) + , butex_waiter_done_count(0) + , butex_waiter_resume_count(0) + , butex_waiter_worker_pthread(0) + , butex_waiter_resume_worker_pthread(0) + , pthread_waiter_ready_count(0) + , pthread_waiter_done_count(0) + , busy_task_started(0) + , busy_task_stop(0) + , busy_task_switches(0) {} + std::atomic mode; + std::atomic init_calls; + std::atomic destroy_calls; + std::atomic harvest_calls; + std::atomic butex_ptr; + std::atomic pending_req_ptr; + std::atomic target_hook_worker_pthread; + std::atomic butex_expected_waiters; + std::atomic butex_wake_started; + std::atomic butex_wake_completed; + std::atomic butex_wake_rc; + std::atomic butex_wake_errno; + std::atomic hook_wake_harvest_calls; + std::atomic hook_action_inflight; + std::atomic butex_waiter_ready_count; + std::atomic butex_waiter_done_count; + std::atomic butex_waiter_resume_count; + std::atomic butex_waiter_worker_pthread; + std::atomic butex_waiter_resume_worker_pthread; + std::atomic pthread_waiter_ready_count; + std::atomic pthread_waiter_done_count; + std::atomic busy_task_started; + std::atomic busy_task_stop; + std::atomic busy_task_switches; +}; + +struct PinnedWaitCtx { + PinnedWaitCtx() + : butex(NULL) + , use_timeout(false) + , timeout_ms(0) + , ready(0) + , done(0) + , wait_rc(0) + , wait_errno(0) + , pinned_worker_pthread(0) + , resume_worker_pthread(0) {} + + void* butex; + bool use_timeout; + int timeout_ms; + std::atomic ready; + std::atomic done; + std::atomic wait_rc; + std::atomic wait_errno; + std::atomic pinned_worker_pthread; + std::atomic resume_worker_pthread; +}; + +ActiveTaskTestState g_state; +std::atomic g_register_rc(-1); +std::atomic g_register_once(0); +bthread::TaskControl* g_shared_single_worker_tc = NULL; +std::atomic g_shared_single_worker_tc_once(0); +bthread::TaskControl* g_shared_two_worker_tc = NULL; +std::atomic g_shared_two_worker_tc_once(0); + +void 
ResetState() { + g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + g_state.init_calls.store(0, std::memory_order_relaxed); + g_state.destroy_calls.store(0, std::memory_order_relaxed); + g_state.harvest_calls.store(0, std::memory_order_relaxed); + g_state.butex_ptr.store(0, std::memory_order_relaxed); + g_state.pending_req_ptr.store(0, std::memory_order_relaxed); + g_state.target_hook_worker_pthread.store(0, std::memory_order_relaxed); + g_state.butex_expected_waiters.store(0, std::memory_order_relaxed); + g_state.butex_wake_started.store(0, std::memory_order_relaxed); + g_state.butex_wake_completed.store(0, std::memory_order_relaxed); + g_state.butex_wake_rc.store(0, std::memory_order_relaxed); + g_state.butex_wake_errno.store(0, std::memory_order_relaxed); + g_state.hook_wake_harvest_calls.store(0, std::memory_order_relaxed); + g_state.butex_waiter_ready_count.store(0, std::memory_order_relaxed); + g_state.butex_waiter_done_count.store(0, std::memory_order_relaxed); + g_state.butex_waiter_resume_count.store(0, std::memory_order_relaxed); + g_state.butex_waiter_worker_pthread.store(0, std::memory_order_relaxed); + g_state.butex_waiter_resume_worker_pthread.store(0, std::memory_order_relaxed); + g_state.pthread_waiter_ready_count.store(0, std::memory_order_relaxed); + g_state.pthread_waiter_done_count.store(0, std::memory_order_relaxed); + g_state.busy_task_started.store(0, std::memory_order_relaxed); + g_state.busy_task_stop.store(0, std::memory_order_relaxed); + g_state.busy_task_switches.store(0, std::memory_order_relaxed); +} + +uint64_t PthreadToU64(pthread_t tid) { + uint64_t v = 0; + memcpy(&v, &tid, std::min(sizeof(v), sizeof(tid))); + return v; +} + +bool WaitAtomicAtLeast(const std::atomic& value, int expected, int timeout_ms) { + for (int i = 0; i < timeout_ms; ++i) { + if (value.load(std::memory_order_relaxed) >= expected) { + return true; + } + usleep(1000); + } + return value.load(std::memory_order_relaxed) >= expected; +} + +bool 
WaitAtomicEqual(const std::atomic& value, int expected, int timeout_ms) { + for (int i = 0; i < timeout_ms; ++i) { + if (value.load(std::memory_order_relaxed) == expected) { + return true; + } + usleep(1000); + } + return value.load(std::memory_order_relaxed) == expected; +} + +void DrainHookActions() { + ASSERT_TRUE(WaitAtomicEqual(g_state.hook_action_inflight, 0, 5000)); +} + +void QuiesceHookActionsAfterModeIdle() { + DrainHookActions(); + usleep(1000); + DrainHookActions(); +} + +void PrepareForCase() { + g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + QuiesceHookActionsAfterModeIdle(); + ResetState(); +} + +bthread::TaskControl& GetSharedSingleWorkerTaskControl() { + int expected = 0; + if (g_shared_single_worker_tc_once.compare_exchange_strong( + expected, 1, std::memory_order_relaxed)) { + g_shared_single_worker_tc = new bthread::TaskControl(); + CHECK(g_shared_single_worker_tc != NULL); + CHECK_EQ(0, g_shared_single_worker_tc->init(1)); + CHECK(WaitAtomicAtLeast(g_state.init_calls, 1, 5000)); + } + CHECK(g_shared_single_worker_tc != NULL); + return *g_shared_single_worker_tc; +} + +bthread::TaskControl& GetSharedTwoWorkerTaskControl() { + int expected = 0; + if (g_shared_two_worker_tc_once.compare_exchange_strong( + expected, 1, std::memory_order_relaxed)) { + g_shared_two_worker_tc = new bthread::TaskControl(); + CHECK(g_shared_two_worker_tc != NULL); + CHECK_EQ(0, g_shared_two_worker_tc->init(2)); + CHECK(WaitAtomicAtLeast(g_state.init_calls, 2, 5000)); + } + CHECK(g_shared_two_worker_tc != NULL); + return *g_shared_two_worker_tc; +} + +void* TestButexWaitTask(void*) { + void* butex = reinterpret_cast( + g_state.butex_ptr.load(std::memory_order_relaxed)); + if (butex == NULL) { + g_state.butex_waiter_done_count.fetch_add(1, std::memory_order_relaxed); + return NULL; + } + g_state.butex_waiter_worker_pthread.store(PthreadToU64(pthread_self()), + std::memory_order_relaxed); + g_state.butex_waiter_ready_count.fetch_add(1, 
std::memory_order_relaxed); + const int rc = bthread::butex_wait(butex, 0, NULL); + if (rc == 0) { + g_state.butex_waiter_resume_count.fetch_add(1, std::memory_order_relaxed); + g_state.butex_waiter_resume_worker_pthread.store(PthreadToU64(pthread_self()), + std::memory_order_relaxed); + } else if (errno == EWOULDBLOCK) { + // Value changed before waiter was queued; still resumed from caller's view. + g_state.butex_waiter_resume_count.fetch_add(1, std::memory_order_relaxed); + g_state.butex_waiter_resume_worker_pthread.store(PthreadToU64(pthread_self()), + std::memory_order_relaxed); + } + g_state.butex_waiter_done_count.fetch_add(1, std::memory_order_relaxed); + return NULL; +} + +void* TestBusyYieldTask(void*) { + g_state.busy_task_started.fetch_add(1, std::memory_order_relaxed); + while (!g_state.busy_task_stop.load(std::memory_order_relaxed)) { + g_state.busy_task_switches.fetch_add(1, std::memory_order_relaxed); + bthread_yield(); + } + return NULL; +} + +void* TestPthreadButexWait(void*) { + void* butex = reinterpret_cast( + g_state.butex_ptr.load(std::memory_order_relaxed)); + if (butex == NULL) { + g_state.pthread_waiter_done_count.fetch_add(1, std::memory_order_relaxed); + return NULL; + } + g_state.pthread_waiter_ready_count.fetch_add(1, std::memory_order_relaxed); + const int rc = bthread::butex_wait(butex, 0, NULL); + if (rc != 0 && errno != EWOULDBLOCK) { + // Still count completion; assertions are on within wake behavior. 
+ } + g_state.pthread_waiter_done_count.fetch_add(1, std::memory_order_relaxed); + return NULL; +} + +void* TestRequestWaitTask(void* arg) { + MockReqCtx* req = static_cast(arg); + if (req == NULL || req->butex == NULL) { + return NULL; + } + req->waiter_worker_pthread.store(PthreadToU64(pthread_self()), + std::memory_order_relaxed); + req->waiter_ready.store(1, std::memory_order_release); + errno = 0; + const int rc = bthread::butex_wait(req->butex, 0, NULL); + const int err = errno; + req->wait_rc.store(rc, std::memory_order_relaxed); + req->wait_errno.store(err, std::memory_order_relaxed); + req->resume_worker_pthread.store(PthreadToU64(pthread_self()), + std::memory_order_relaxed); + if (req->result_ready.load(std::memory_order_acquire) == 1) { + req->resume_saw_result_ready.store(1, std::memory_order_relaxed); + } + req->waiter_done.store(1, std::memory_order_release); + return NULL; +} + +void* TestPinnedButexLocalWaitTask(void*) { + void* butex = reinterpret_cast( + g_state.butex_ptr.load(std::memory_order_relaxed)); + if (butex == NULL) { + g_state.butex_waiter_done_count.fetch_add(1, std::memory_order_relaxed); + return NULL; + } + const uint64_t home = PthreadToU64(pthread_self()); + g_state.butex_waiter_worker_pthread.store(home, std::memory_order_relaxed); + g_state.butex_waiter_ready_count.fetch_add(1, std::memory_order_relaxed); + const int rc = bthread_butex_wait_local(butex, 0, NULL); + if (rc == 0 || errno == EWOULDBLOCK) { + g_state.butex_waiter_resume_count.fetch_add(1, std::memory_order_relaxed); + g_state.butex_waiter_resume_worker_pthread.store(PthreadToU64(pthread_self()), + std::memory_order_relaxed); + } + g_state.butex_waiter_done_count.fetch_add(1, std::memory_order_relaxed); + return NULL; +} + +void* TestPinnedWaitTask(void* arg) { + PinnedWaitCtx* ctx = static_cast(arg); + if (ctx == NULL || ctx->butex == NULL) { + if (ctx) { + ctx->done.store(1, std::memory_order_release); + } + return NULL; + } + + const uint64_t home = 
PthreadToU64(pthread_self()); + ctx->pinned_worker_pthread.store(home, std::memory_order_relaxed); + ctx->ready.store(1, std::memory_order_release); + + timespec abstime; + const timespec* pabstime = NULL; + if (ctx->use_timeout) { + abstime = butil::milliseconds_from_now(ctx->timeout_ms); + pabstime = &abstime; + } + errno = 0; + const int wait_rc = bthread_butex_wait_local(ctx->butex, 0, pabstime); + const int wait_errno = errno; + ctx->wait_rc.store(wait_rc, std::memory_order_relaxed); + ctx->wait_errno.store(wait_errno, std::memory_order_relaxed); + ctx->resume_worker_pthread.store(PthreadToU64(pthread_self()), + std::memory_order_relaxed); + ctx->done.store(1, std::memory_order_release); + return NULL; +} + +bool MaybeRunWithinWakeFromHook(const bthread_active_task_ctx_t* ctx, + int mode, + bool* skip_park_out) { + struct ScopedHookInflight { + ScopedHookInflight() { + g_state.hook_action_inflight.fetch_add(1, std::memory_order_relaxed); + } + ~ScopedHookInflight() { + g_state.hook_action_inflight.fetch_sub(1, std::memory_order_relaxed); + } + } scoped_inflight; + + if (mode != TEST_MODE_BUTEX_WAKE_WITHIN && + mode != TEST_MODE_BUTEX_WAKE_WITHIN_NULL && + mode != TEST_MODE_BUTEX_WAKE_WITHIN_NO_WAITER && + mode != TEST_MODE_BUTEX_WAKE_WITHIN_PTHREAD_WAITER && + mode != TEST_MODE_BUSY_PERIODIC_POLL_WAKE && + mode != TEST_MODE_SCENARIO_REQ_WAKE && + mode != TEST_MODE_SCENARIO_REQ_WAKE_BUSY_PERIODIC && + mode != TEST_MODE_BUTEX_WAKE_WITHIN_STRICT_CROSS_WORKER_REJECT) { + return false; + } + + const uint64_t target_worker = g_state.target_hook_worker_pthread.load( + std::memory_order_relaxed); + if (target_worker != 0 && PthreadToU64(ctx->worker_pthread) != target_worker) { + return false; + } + + const bool is_scenario_req_wake = + (mode == TEST_MODE_SCENARIO_REQ_WAKE || + mode == TEST_MODE_SCENARIO_REQ_WAKE_BUSY_PERIODIC); + + if (mode == TEST_MODE_BUTEX_WAKE_WITHIN || + mode == TEST_MODE_BUTEX_WAKE_WITHIN_STRICT_CROSS_WORKER_REJECT || + mode == 
TEST_MODE_BUSY_PERIODIC_POLL_WAKE) { + const int expected_waiters = + g_state.butex_expected_waiters.load(std::memory_order_relaxed); + if (g_state.butex_waiter_ready_count.load(std::memory_order_relaxed) < + expected_waiters) { + return true; + } + } else if (mode == TEST_MODE_BUTEX_WAKE_WITHIN_PTHREAD_WAITER) { + if (g_state.pthread_waiter_ready_count.load(std::memory_order_relaxed) < 1) { + return true; + } + } else if (is_scenario_req_wake) { + MockReqCtx* req = reinterpret_cast( + g_state.pending_req_ptr.load(std::memory_order_acquire)); + if (req == NULL || + req->waiter_ready.load(std::memory_order_acquire) < 1 || + req->completion_published.load(std::memory_order_acquire) == 0) { + return true; + } + } + + int expected = 0; + if (!g_state.butex_wake_started.compare_exchange_strong( + expected, 1, std::memory_order_relaxed)) { + return true; + } + + g_state.hook_wake_harvest_calls.fetch_add(1, std::memory_order_relaxed); + + void* butex = NULL; + MockReqCtx* req = NULL; + if (is_scenario_req_wake) { + req = reinterpret_cast( + g_state.pending_req_ptr.load(std::memory_order_acquire)); + if (req == NULL) { + g_state.butex_wake_started.store(0, std::memory_order_relaxed); + return true; + } + req->hook_worker_pthread.store(PthreadToU64(ctx->worker_pthread), + std::memory_order_relaxed); + req->result_ready.store(1, std::memory_order_release); + butex = req->butex; + } else { + butex = reinterpret_cast( + g_state.butex_ptr.load(std::memory_order_relaxed)); + if (mode == TEST_MODE_BUTEX_WAKE_WITHIN_NULL) { + butex = NULL; + } + } + + errno = 0; + const int rc = bthread_butex_wake_within(ctx, butex); + const int err = errno; + g_state.butex_wake_rc.store(rc, std::memory_order_relaxed); + g_state.butex_wake_errno.store(err, std::memory_order_relaxed); + + bool done = true; + if (is_scenario_req_wake) { + done = (rc == 1); + } else if (mode == TEST_MODE_BUTEX_WAKE_WITHIN || + mode == TEST_MODE_BUTEX_WAKE_WITHIN_STRICT_CROSS_WORKER_REJECT || + mode == 
TEST_MODE_BUSY_PERIODIC_POLL_WAKE) { + const int expected_waiters = + g_state.butex_expected_waiters.load(std::memory_order_relaxed); + if (mode == TEST_MODE_BUTEX_WAKE_WITHIN_STRICT_CROSS_WORKER_REJECT) { + done = (rc == -1 && err == EINVAL); + } else if (expected_waiters == 1) { + done = (rc == 1); + } else if (expected_waiters > 1) { + done = (rc == -1 && err == EINVAL) || (rc == 1); + } + } else if (mode == TEST_MODE_BUTEX_WAKE_WITHIN_PTHREAD_WAITER) { + done = (rc == -1 && err == EINVAL); + } + + if (!done) { + g_state.butex_wake_started.store(0, std::memory_order_relaxed); + return true; + } + + if (req != NULL) { + req->wake_rc.store(rc, std::memory_order_relaxed); + req->wake_errno.store(err, std::memory_order_relaxed); + } + g_state.butex_wake_completed.fetch_add(1, std::memory_order_relaxed); + if (skip_park_out) { + *skip_park_out = true; + } + return true; +} + +int ActiveTaskWorkerInit(void** worker_local, + const bthread_active_task_ctx_t* ctx, + void*) { + (void)ctx; + PerWorkerState* s = new (std::nothrow) PerWorkerState; + if (s == NULL) { + return ENOMEM; + } + *worker_local = s; + g_state.init_calls.fetch_add(1, std::memory_order_relaxed); + return 0; +} + +void ActiveTaskWorkerDestroy(void* worker_local, + const bthread_active_task_ctx_t*, + void*) { + delete static_cast(worker_local); + g_state.destroy_calls.fetch_add(1, std::memory_order_relaxed); +} + +int ActiveTaskHarvest(void* worker_local, + const bthread_active_task_ctx_t* ctx) { + g_state.harvest_calls.fetch_add(1, std::memory_order_relaxed); + (void)worker_local; + const int mode = g_state.mode.load(std::memory_order_acquire); + if (mode == TEST_MODE_IDLE_WAIT_INTERVAL) { + return 0; + } + bool skip_park = false; + if (MaybeRunWithinWakeFromHook(ctx, mode, &skip_park)) { + return skip_park ? 
1 : 0; + } + return 0; +} + +int RegisterTestActiveTaskType() { + bthread_active_task_type_t type; + memset(&type, 0, sizeof(type)); + type.struct_size = sizeof(type); + type.name = "active_task_unittest"; + type.worker_init = ActiveTaskWorkerInit; + type.worker_destroy = ActiveTaskWorkerDestroy; + type.harvest = ActiveTaskHarvest; + return bthread_register_active_task_type(&type); +} + +void* JustExit(void*) { + return NULL; +} + +int ChildCheckRegisterAfterInitRejected() { + bthread_t tid = INVALID_BTHREAD; + if (bthread_start_background(&tid, NULL, JustExit, NULL) != 0) { + return 2; + } + bthread_join(tid, NULL); + + bthread_active_task_type_t type; + memset(&type, 0, sizeof(type)); + type.struct_size = sizeof(type); + type.name = "active_task_after_init"; + type.harvest = ActiveTaskHarvest; + const int rc = bthread_register_active_task_type(&type); + return (rc == EPERM ? 0 : 3); +} + +int ChildCheckLocalWorkerInitDestroyAndIdleWaitInterval() { + if (g_register_rc.load(std::memory_order_relaxed) != 0) { + return 20; + } + PrepareForCase(); + { + bthread::TaskControl tc; + if (tc.init(2) != 0) { + return 21; + } + if (!WaitAtomicAtLeast(g_state.init_calls, 2, 5000)) { + return 22; + } + const int harvest_snapshot = g_state.harvest_calls.load(std::memory_order_relaxed); + const int64_t old_idle_wait_ns = bthread::FLAGS_bthread_active_task_idle_wait_ns; + bthread::FLAGS_bthread_active_task_idle_wait_ns = 1000 * 1000; // 1ms + g_state.mode.store(TEST_MODE_IDLE_WAIT_INTERVAL, std::memory_order_release); + tc.signal_task(2, BTHREAD_TAG_DEFAULT); + if (!WaitAtomicAtLeast(g_state.harvest_calls, harvest_snapshot + 3, 5000)) { + bthread::FLAGS_bthread_active_task_idle_wait_ns = old_idle_wait_ns; + return 23; + } + bthread::FLAGS_bthread_active_task_idle_wait_ns = old_idle_wait_ns; + g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + if (!WaitAtomicEqual(g_state.hook_action_inflight, 0, 5000)) { + return 25; + } + } + return 
(g_state.destroy_calls.load(std::memory_order_relaxed) == 2 ? 0 : 24); +} + +int RunChildMode(const char* mode) { + pid_t pid = fork(); + if (pid < 0) { + return -1; + } + if (pid == 0) { + char self_path[PATH_MAX]; + const ssize_t n = readlink("/proc/self/exe", self_path, sizeof(self_path) - 1); + if (n <= 0) { + _exit(4); + } + self_path[n] = '\0'; + setenv("BRPC_ACTIVE_TASK_UT_CHILD_MODE", mode, 1); + char* const argv[] = { self_path, NULL }; + execv(self_path, argv); + _exit(5); + } + int status = 0; + if (waitpid(pid, &status, 0) != pid) { + return -1; + } + if (!WIFEXITED(status)) { + return -1; + } + return WEXITSTATUS(status); +} + +void RunButexWakeWithinCase(int waiter_count, + int expected_wake_rc, + int expected_wake_errno) { + ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); + bthread::TaskControl& tc = GetSharedSingleWorkerTaskControl(); + ASSERT_GT(waiter_count, 0); + const bool allow_setup_race_retry = (waiter_count > 1 && expected_wake_rc < 0); + const int max_attempts = allow_setup_race_retry ? 
10 : 1; + + for (int attempt = 1; attempt <= max_attempts; ++attempt) { + PrepareForCase(); + + void* butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), butex); + static_cast*>(butex)->store(0, butil::memory_order_relaxed); + g_state.butex_ptr.store(reinterpret_cast(butex), + std::memory_order_relaxed); + g_state.butex_expected_waiters.store(waiter_count, std::memory_order_relaxed); + + std::vector tids(waiter_count, INVALID_BTHREAD); + bool retry_case = false; + g_state.mode.store(TEST_MODE_BUTEX_WAKE_WITHIN, std::memory_order_release); + + for (int i = 0; i < waiter_count; ++i) { + ASSERT_EQ(0, bthread_start_background(&tids[i], NULL, + TestButexWaitTask, NULL)); + } + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_waiter_ready_count, waiter_count, 5000)); + tc.signal_task(1, BTHREAD_TAG_DEFAULT); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_wake_started, 1, 5000)); + if (expected_wake_rc < 0) { + static_cast*>(butex)->store(1, butil::memory_order_release); + for (int kick = 0; + kick < 20 && + g_state.butex_waiter_done_count.load(std::memory_order_relaxed) < waiter_count; + ++kick) { + ASSERT_GE(bthread::butex_wake_all(butex, true), 0); + usleep(1000); + } + } + const int wake_rc = g_state.butex_wake_rc.load(std::memory_order_relaxed); + const int wake_errno = g_state.butex_wake_errno.load(std::memory_order_relaxed); + + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_waiter_done_count, waiter_count, 5000)) + << "wake_rc=" << wake_rc + << " wake_errno=" << wake_errno + << " ready=" << g_state.butex_waiter_ready_count.load(std::memory_order_relaxed) + << " resumed=" << g_state.butex_waiter_resume_count.load(std::memory_order_relaxed) + << " harvest_calls=" << g_state.harvest_calls.load(std::memory_order_relaxed) + << " attempt=" << attempt; + for (int i = 0; i < waiter_count; ++i) { + ASSERT_EQ(0, bthread_join(tids[i], NULL)); + } + ASSERT_EQ(waiter_count, + g_state.butex_waiter_resume_count.load(std::memory_order_relaxed)); + 
g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + QuiesceHookActionsAfterModeIdle(); + const int final_wake_rc = g_state.butex_wake_rc.load(std::memory_order_relaxed); + const int final_wake_errno = g_state.butex_wake_errno.load(std::memory_order_relaxed); + + if (allow_setup_race_retry && final_wake_rc == 1 && attempt < max_attempts) { + retry_case = true; + } else { + ASSERT_EQ(expected_wake_rc, final_wake_rc); + if (expected_wake_rc < 0) { + ASSERT_EQ(expected_wake_errno, final_wake_errno); + } + } + bthread::butex_destroy(butex); + g_state.butex_ptr.store(0, std::memory_order_relaxed); + if (!retry_case) { + return; + } + } +} + +void RunHookOnlyWakeCase(TestMode mode, void* butex, int expected_rc, int expected_errno) { + ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); + bthread::TaskControl& tc = GetSharedSingleWorkerTaskControl(); + PrepareForCase(); + g_state.butex_ptr.store(reinterpret_cast(butex), + std::memory_order_relaxed); + g_state.butex_expected_waiters.store(0, std::memory_order_relaxed); + + g_state.mode.store(mode, std::memory_order_release); + tc.signal_task(1, BTHREAD_TAG_DEFAULT); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_wake_completed, 1, 5000)); + ASSERT_EQ(expected_rc, g_state.butex_wake_rc.load(std::memory_order_relaxed)); + if (expected_rc < 0) { + ASSERT_EQ(expected_errno, g_state.butex_wake_errno.load(std::memory_order_relaxed)); + } + g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + QuiesceHookActionsAfterModeIdle(); + g_state.butex_ptr.store(0, std::memory_order_relaxed); +} + +void RunPthreadWaiterRejectedCase() { + ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); + bthread::TaskControl& tc = GetSharedSingleWorkerTaskControl(); + PrepareForCase(); + + void* butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), butex); + static_cast*>(butex)->store(0, butil::memory_order_relaxed); + g_state.butex_ptr.store(reinterpret_cast(butex), + std::memory_order_relaxed); + + 
pthread_t th; + ASSERT_EQ(0, pthread_create(&th, NULL, TestPthreadButexWait, NULL)); + + ASSERT_TRUE(WaitAtomicAtLeast(g_state.pthread_waiter_ready_count, 1, 5000)); + g_state.mode.store(TEST_MODE_BUTEX_WAKE_WITHIN_PTHREAD_WAITER, + std::memory_order_release); + tc.signal_task(1, BTHREAD_TAG_DEFAULT); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_wake_completed, 1, 5000)); + ASSERT_EQ(-1, g_state.butex_wake_rc.load(std::memory_order_relaxed)); + ASSERT_EQ(EINVAL, g_state.butex_wake_errno.load(std::memory_order_relaxed)); + g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + QuiesceHookActionsAfterModeIdle(); + + static_cast*>(butex)->store(1, butil::memory_order_release); + for (int kick = 0; + kick < 20 && + g_state.pthread_waiter_done_count.load(std::memory_order_relaxed) < 1; + ++kick) { + ASSERT_GE(bthread::butex_wake_all(butex, true), 0); + usleep(1000); + } + ASSERT_EQ(0, pthread_join(th, NULL)); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.pthread_waiter_done_count, 1, 5000)); + + bthread::butex_destroy(butex); + g_state.butex_ptr.store(0, std::memory_order_relaxed); +} + +struct ScopedPollEveryNSwitch { + explicit ScopedPollEveryNSwitch(int32_t value) + : old_(bthread::FLAGS_bthread_active_task_poll_every_nswitch) { + bthread::FLAGS_bthread_active_task_poll_every_nswitch = value; + } + ~ScopedPollEveryNSwitch() { + bthread::FLAGS_bthread_active_task_poll_every_nswitch = old_; + } + int32_t old_; +}; + +void RunBusyPeriodicPollWakeCase() { + ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); + bthread::TaskControl& tc = GetSharedSingleWorkerTaskControl(); + (void)tc; + PrepareForCase(); + + void* butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), butex); + static_cast*>(butex)->store(0, butil::memory_order_relaxed); + g_state.butex_ptr.store(reinterpret_cast(butex), + std::memory_order_relaxed); + g_state.butex_expected_waiters.store(1, std::memory_order_relaxed); + + ScopedPollEveryNSwitch guard(1); + bthread_t busy_tid = 
INVALID_BTHREAD; + bthread_t waiter_tid = INVALID_BTHREAD; + g_state.mode.store(TEST_MODE_BUSY_PERIODIC_POLL_WAKE, std::memory_order_release); + + ASSERT_EQ(0, bthread_start_background(&busy_tid, NULL, TestBusyYieldTask, NULL)); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.busy_task_started, 1, 5000)); + ASSERT_EQ(0, bthread_start_background(&waiter_tid, NULL, TestButexWaitTask, NULL)); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_waiter_ready_count, 1, 5000)); + + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_wake_started, 1, 5000)); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_waiter_done_count, 1, 5000)); + ASSERT_EQ(1, g_state.butex_wake_rc.load(std::memory_order_relaxed)); + ASSERT_GE(g_state.hook_wake_harvest_calls.load(std::memory_order_relaxed), 1); + ASSERT_GE(g_state.harvest_calls.load(std::memory_order_relaxed), 1); + ASSERT_GT(g_state.busy_task_switches.load(std::memory_order_relaxed), 0); + + g_state.busy_task_stop.store(1, std::memory_order_relaxed); + ASSERT_EQ(0, bthread_join(waiter_tid, NULL)); + ASSERT_EQ(0, bthread_join(busy_tid, NULL)); + g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + QuiesceHookActionsAfterModeIdle(); + + bthread::butex_destroy(butex); + g_state.butex_ptr.store(0, std::memory_order_relaxed); +} + +bool ChooseTwoDistinctGroups(bthread::TaskControl& tc, + bthread::TaskGroup** g1, + bthread::TaskGroup** g2) { + if (g1 == NULL || g2 == NULL) { + return false; + } + *g1 = NULL; + *g2 = NULL; + if (tc.concurrency(BTHREAD_TAG_DEFAULT) < 2) { + return false; + } + for (int i = 0; i < 256; ++i) { + bthread::TaskGroup* a = tc.choose_one_group(BTHREAD_TAG_DEFAULT); + bthread::TaskGroup* b = tc.choose_one_group(BTHREAD_TAG_DEFAULT); + if (a != NULL && b != NULL && a != b) { + *g1 = a; + *g2 = b; + return true; + } + } + return false; +} + +void RunStrictCrossWorkerRejectCase() { + ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); + PrepareForCase(); + + bthread::TaskControl& tc = GetSharedTwoWorkerTaskControl(); + + 
bthread::TaskGroup* group_a = NULL; + bthread::TaskGroup* group_b = NULL; + ASSERT_TRUE(ChooseTwoDistinctGroups(tc, &group_a, &group_b)); + ASSERT_NE(static_cast(NULL), group_a); + ASSERT_NE(static_cast(NULL), group_b); + ASSERT_NE(group_a, group_b); + ASSERT_NE(0u, PthreadToU64(group_a->tid())); + ASSERT_NE(0u, PthreadToU64(group_b->tid())); + + void* butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), butex); + static_cast*>(butex)->store(0, butil::memory_order_relaxed); + g_state.butex_ptr.store(reinterpret_cast(butex), + std::memory_order_relaxed); + g_state.butex_expected_waiters.store(1, std::memory_order_relaxed); + + ScopedPollEveryNSwitch poll_guard(1); + bthread_t busy_b = INVALID_BTHREAD; + bthread_t waiter_tid = INVALID_BTHREAD; + + ASSERT_EQ(0, group_a->start_background(&waiter_tid, NULL, + TestButexWaitTask, NULL)); + group_a->flush_nosignal_tasks(); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_waiter_ready_count, 1, 5000)); + const uint64_t waiter_exec_worker = + g_state.butex_waiter_worker_pthread.load(std::memory_order_relaxed); + ASSERT_NE(0u, waiter_exec_worker); + bthread::TaskGroup* waiter_group = NULL; + bthread::TaskGroup* wrong_waker_group = NULL; + if (waiter_exec_worker == PthreadToU64(group_a->tid())) { + waiter_group = group_a; + wrong_waker_group = group_b; + } else if (waiter_exec_worker == PthreadToU64(group_b->tid())) { + waiter_group = group_b; + wrong_waker_group = group_a; + } else { + FAIL() << "waiter ran on unexpected worker pthread=" << waiter_exec_worker + << " group_a=" << PthreadToU64(group_a->tid()) + << " group_b=" << PthreadToU64(group_b->tid()); + } + ASSERT_EQ(0, wrong_waker_group->start_background(&busy_b, NULL, + TestBusyYieldTask, NULL)); + wrong_waker_group->flush_nosignal_tasks(); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.busy_task_started, 1, 5000)); + + g_state.target_hook_worker_pthread.store(PthreadToU64(wrong_waker_group->tid()), + std::memory_order_relaxed); + 
g_state.mode.store(TEST_MODE_BUTEX_WAKE_WITHIN_STRICT_CROSS_WORKER_REJECT, + std::memory_order_release); + tc.signal_task(2, BTHREAD_TAG_DEFAULT); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_wake_completed, 1, 5000)) + << "target_wrong=" << PthreadToU64(wrong_waker_group->tid()) + << " waiter_group=" << PthreadToU64(waiter_group->tid()) + << " waiter_exec=" << waiter_exec_worker + << " busy_started=" << g_state.busy_task_started.load(std::memory_order_relaxed) + << " harvest_calls=" << g_state.harvest_calls.load(std::memory_order_relaxed) + << " hook_wake_harvest_calls=" + << g_state.hook_wake_harvest_calls.load(std::memory_order_relaxed) + << " rc=" << g_state.butex_wake_rc.load(std::memory_order_relaxed) + << " errno=" << g_state.butex_wake_errno.load(std::memory_order_relaxed) + << " waiter_done=" << g_state.butex_waiter_done_count.load(std::memory_order_relaxed) + << " waiter_resumed=" + << g_state.butex_waiter_resume_count.load(std::memory_order_relaxed); + ASSERT_EQ(-1, g_state.butex_wake_rc.load(std::memory_order_relaxed)); + ASSERT_EQ(EINVAL, g_state.butex_wake_errno.load(std::memory_order_relaxed)); + ASSERT_EQ(0, g_state.butex_waiter_done_count.load(std::memory_order_relaxed)); + + g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + g_state.target_hook_worker_pthread.store(0, std::memory_order_relaxed); + QuiesceHookActionsAfterModeIdle(); + g_state.butex_wake_started.store(0, std::memory_order_relaxed); + g_state.butex_wake_completed.store(0, std::memory_order_relaxed); + g_state.butex_wake_rc.store(0, std::memory_order_relaxed); + g_state.butex_wake_errno.store(0, std::memory_order_relaxed); + g_state.target_hook_worker_pthread.store(PthreadToU64(waiter_group->tid()), + std::memory_order_relaxed); + g_state.mode.store(TEST_MODE_BUTEX_WAKE_WITHIN, std::memory_order_release); + tc.signal_task(2, BTHREAD_TAG_DEFAULT); + + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_wake_completed, 1, 5000)); + ASSERT_EQ(1, 
g_state.butex_wake_rc.load(std::memory_order_relaxed)); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_waiter_done_count, 1, 5000)); + ASSERT_EQ(1, g_state.butex_waiter_resume_count.load(std::memory_order_relaxed)); + + g_state.busy_task_stop.store(1, std::memory_order_relaxed); + ASSERT_EQ(0, bthread_join(waiter_tid, NULL)); + ASSERT_EQ(0, bthread_join(busy_b, NULL)); + + g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + g_state.target_hook_worker_pthread.store(0, std::memory_order_relaxed); + QuiesceHookActionsAfterModeIdle(); + g_state.butex_ptr.store(0, std::memory_order_relaxed); + bthread::butex_destroy(butex); +} + +void RunRequestFlowCase(bool busy_periodic) { + ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); + bthread::TaskControl& tc = GetSharedSingleWorkerTaskControl(); + bthread::TaskGroup* tg = tc.choose_one_group(BTHREAD_TAG_DEFAULT); + ASSERT_NE(static_cast(NULL), tg); + PrepareForCase(); + + MockReqCtx* req = new MockReqCtx; + ASSERT_NE(static_cast(NULL), req); + req->butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), req->butex); + static_cast*>(req->butex)->store(0, butil::memory_order_relaxed); + g_state.pending_req_ptr.store(reinterpret_cast(req), + std::memory_order_release); + + bthread_t waiter_tid = INVALID_BTHREAD; + bthread_t busy_tid = INVALID_BTHREAD; + int32_t old_poll_every_nswitch = 0; + bool poll_overridden = false; + if (busy_periodic) { + old_poll_every_nswitch = bthread::FLAGS_bthread_active_task_poll_every_nswitch; + bthread::FLAGS_bthread_active_task_poll_every_nswitch = 1; + poll_overridden = true; + ASSERT_EQ(0, tg->start_background(&busy_tid, NULL, TestBusyYieldTask, NULL)); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.busy_task_started, 1, 5000)); + } + + ASSERT_EQ(0, tg->start_background(&waiter_tid, NULL, TestRequestWaitTask, req)); + ASSERT_TRUE(WaitAtomicAtLeast(req->waiter_ready, 1, 5000)); + + req->completion_published.store(1, std::memory_order_release); + 
g_state.mode.store(busy_periodic ? TEST_MODE_SCENARIO_REQ_WAKE_BUSY_PERIODIC + : TEST_MODE_SCENARIO_REQ_WAKE, + std::memory_order_release); + if (!busy_periodic) { + tc.signal_task(1, BTHREAD_TAG_DEFAULT); + } + + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_wake_started, 1, 5000)); + ASSERT_TRUE(WaitAtomicAtLeast(req->waiter_done, 1, 5000)) + << "wake_rc=" << req->wake_rc.load(std::memory_order_relaxed) + << " wake_errno=" << req->wake_errno.load(std::memory_order_relaxed) + << " hook_worker=" << req->hook_worker_pthread.load(std::memory_order_relaxed) + << " waiter_worker=" << req->waiter_worker_pthread.load(std::memory_order_relaxed) + << " resume_worker=" << req->resume_worker_pthread.load(std::memory_order_relaxed); + + if (busy_periodic) { + ASSERT_GE(g_state.hook_wake_harvest_calls.load(std::memory_order_relaxed), 1); + ASSERT_GE(g_state.harvest_calls.load(std::memory_order_relaxed), 1); + ASSERT_GT(g_state.busy_task_switches.load(std::memory_order_relaxed), 0); + } + + ASSERT_EQ(1, req->wake_rc.load(std::memory_order_relaxed)); + ASSERT_EQ(1, req->result_ready.load(std::memory_order_acquire)); + ASSERT_EQ(1, req->resume_saw_result_ready.load(std::memory_order_relaxed)); + ASSERT_EQ(0, req->wait_rc.load(std::memory_order_relaxed)); + const uint64_t waiter_worker = req->waiter_worker_pthread.load(std::memory_order_relaxed); + const uint64_t hook_worker = req->hook_worker_pthread.load(std::memory_order_relaxed); + const uint64_t resume_worker = req->resume_worker_pthread.load(std::memory_order_relaxed); + ASSERT_NE(0u, waiter_worker); + ASSERT_NE(0u, hook_worker); + ASSERT_NE(0u, resume_worker); + ASSERT_EQ(waiter_worker, hook_worker); + ASSERT_EQ(waiter_worker, resume_worker); + + if (busy_periodic) { + g_state.busy_task_stop.store(1, std::memory_order_relaxed); + } + ASSERT_EQ(0, bthread_join(waiter_tid, NULL)); + if (busy_periodic) { + ASSERT_EQ(0, bthread_join(busy_tid, NULL)); + } + g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + 
QuiesceHookActionsAfterModeIdle(); + g_state.pending_req_ptr.store(0, std::memory_order_release); + + if (poll_overridden) { + bthread::FLAGS_bthread_active_task_poll_every_nswitch = old_poll_every_nswitch; + } + bthread::butex_destroy(req->butex); + delete req; +} + +void StartTaskOnGroupAndFlush(bthread::TaskGroup* g, + bthread_t* tid, + void* (*fn)(void*), + void* arg) { + ASSERT_NE(static_cast(NULL), g); + ASSERT_NE(static_cast(NULL), tid); + *tid = INVALID_BTHREAD; + ASSERT_EQ(0, g->start_background(tid, NULL, fn, arg)); + g->flush_nosignal_tasks(); +} + +void StartPinnedWaitTaskAndWaitReady(bthread::TaskGroup* g, + PinnedWaitCtx* ctx, + bthread_t* tid, + uint64_t* home_worker) { + ASSERT_NE(static_cast(NULL), g); + ASSERT_NE(static_cast(NULL), ctx); + ASSERT_NE(static_cast(NULL), tid); + ASSERT_NE(static_cast(NULL), home_worker); + StartTaskOnGroupAndFlush(g, tid, TestPinnedWaitTask, ctx); + ASSERT_TRUE(WaitAtomicAtLeast(ctx->ready, 1, 5000)); + *home_worker = ctx->pinned_worker_pthread.load(std::memory_order_relaxed); + ASSERT_NE(0u, *home_worker); +} + +void WaitJoinPinnedWaitTaskAndAssert(PinnedWaitCtx* ctx, + bthread_t tid, + int expected_wait_rc, + int expected_wait_errno, + uint64_t expected_home_worker) { + ASSERT_NE(static_cast(NULL), ctx); + ASSERT_TRUE(WaitAtomicAtLeast(ctx->done, 1, 5000)); + ASSERT_EQ(0, bthread_join(tid, NULL)); + ASSERT_EQ(expected_wait_rc, ctx->wait_rc.load(std::memory_order_relaxed)); + if (expected_wait_rc < 0) { + ASSERT_EQ(expected_wait_errno, ctx->wait_errno.load(std::memory_order_relaxed)); + } + ASSERT_EQ(expected_home_worker, + ctx->resume_worker_pthread.load(std::memory_order_relaxed)); +} + +void RunPinnedGenericWakeCase() { + ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); + bthread::TaskControl& tc = GetSharedTwoWorkerTaskControl(); + bthread::TaskGroup* g1 = NULL; + bthread::TaskGroup* g2 = NULL; + ASSERT_TRUE(ChooseTwoDistinctGroups(tc, &g1, &g2)); + PrepareForCase(); + + PinnedWaitCtx ctx; + 
ctx.butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), ctx.butex); + static_cast*>(ctx.butex)->store(0, butil::memory_order_relaxed); + + bthread_t tid = INVALID_BTHREAD; + uint64_t home = 0; + StartPinnedWaitTaskAndWaitReady(g1, &ctx, &tid, &home); + bool woke = false; + for (int i = 0; i < 50; ++i) { + const int rc = bthread::butex_wake(ctx.butex, true); + ASSERT_GE(rc, 0); + if (rc == 1) { + woke = true; + break; + } + if (ctx.done.load(std::memory_order_acquire) == 1) { + break; + } + usleep(1000); + } + ASSERT_TRUE(woke) << "pinned generic butex_wake did not observe waiter in queue"; + WaitJoinPinnedWaitTaskAndAssert(&ctx, tid, 0, 0, home); + + bthread::butex_destroy(ctx.butex); +} + +void RunPinnedTimeoutCase() { + ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); + bthread::TaskControl& tc = GetSharedTwoWorkerTaskControl(); + bthread::TaskGroup* g = tc.choose_one_group(BTHREAD_TAG_DEFAULT); + ASSERT_NE(static_cast(NULL), g); + PrepareForCase(); + + PinnedWaitCtx ctx; + ctx.butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), ctx.butex); + static_cast*>(ctx.butex)->store(0, butil::memory_order_relaxed); + ctx.use_timeout = true; + ctx.timeout_ms = 20; + + bthread_t tid = INVALID_BTHREAD; + uint64_t home = 0; + StartPinnedWaitTaskAndWaitReady(g, &ctx, &tid, &home); + WaitJoinPinnedWaitTaskAndAssert(&ctx, tid, -1, ETIMEDOUT, home); + + bthread::butex_destroy(ctx.butex); +} + +void RunPinnedInterruptCase() { + ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); + bthread::TaskControl& tc = GetSharedTwoWorkerTaskControl(); + bthread::TaskGroup* g = tc.choose_one_group(BTHREAD_TAG_DEFAULT); + ASSERT_NE(static_cast(NULL), g); + PrepareForCase(); + + PinnedWaitCtx ctx; + ctx.butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), ctx.butex); + static_cast*>(ctx.butex)->store(0, butil::memory_order_relaxed); + + bthread_t tid = INVALID_BTHREAD; + uint64_t home = 0; + StartPinnedWaitTaskAndWaitReady(g, &ctx, &tid, 
&home); + bool interrupt_sent = false; + for (int i = 0; i < 20 && + ctx.done.load(std::memory_order_acquire) == 0; ++i) { + ASSERT_EQ(0, bthread_interrupt(tid)); + interrupt_sent = true; + if (WaitAtomicAtLeast(ctx.done, 1, 50)) { + break; + } + } + ASSERT_TRUE(interrupt_sent); + + WaitJoinPinnedWaitTaskAndAssert(&ctx, tid, -1, EINTR, home); + + bthread::butex_destroy(ctx.butex); +} + +void RunButexWaitLocalImmediateEwouldblockCase() { + ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); + bthread::TaskControl& tc = GetSharedTwoWorkerTaskControl(); + bthread::TaskGroup* g = tc.choose_one_group(BTHREAD_TAG_DEFAULT); + ASSERT_NE(static_cast(NULL), g); + PrepareForCase(); + + PinnedWaitCtx ctx; + ctx.butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), ctx.butex); + static_cast*>(ctx.butex)->store(1, butil::memory_order_relaxed); + + bthread_t tid = INVALID_BTHREAD; + uint64_t home = 0; + StartPinnedWaitTaskAndWaitReady(g, &ctx, &tid, &home); + WaitJoinPinnedWaitTaskAndAssert(&ctx, tid, -1, EWOULDBLOCK, home); + + bthread::butex_destroy(ctx.butex); +} + +void RunPinnedTimeoutThenWithinWakeReturnsZeroCase() { + ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); + bthread::TaskControl& tc = GetSharedTwoWorkerTaskControl(); + bthread::TaskGroup* g = tc.choose_one_group(BTHREAD_TAG_DEFAULT); + ASSERT_NE(static_cast(NULL), g); + PrepareForCase(); + + PinnedWaitCtx ctx; + ctx.butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), ctx.butex); + static_cast*>(ctx.butex)->store(0, butil::memory_order_relaxed); + ctx.use_timeout = true; + ctx.timeout_ms = 20; + + bthread_t tid = INVALID_BTHREAD; + uint64_t home = 0; + StartPinnedWaitTaskAndWaitReady(g, &ctx, &tid, &home); + WaitJoinPinnedWaitTaskAndAssert(&ctx, tid, -1, ETIMEDOUT, home); + + RunHookOnlyWakeCase(TEST_MODE_BUTEX_WAKE_WITHIN_NO_WAITER, ctx.butex, 0, 0); + bthread::butex_destroy(ctx.butex); +} + +void RunPinnedGenericWakeThenWithinWakeReturnsZeroCase() { + ASSERT_EQ(0, 
g_register_rc.load(std::memory_order_relaxed)); + bthread::TaskControl& tc = GetSharedTwoWorkerTaskControl(); + bthread::TaskGroup* g1 = NULL; + bthread::TaskGroup* g2 = NULL; + ASSERT_TRUE(ChooseTwoDistinctGroups(tc, &g1, &g2)); + PrepareForCase(); + + PinnedWaitCtx ctx; + ctx.butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), ctx.butex); + static_cast*>(ctx.butex)->store(0, butil::memory_order_relaxed); + + bthread_t tid = INVALID_BTHREAD; + uint64_t home = 0; + StartPinnedWaitTaskAndWaitReady(g1, &ctx, &tid, &home); + bool woke = false; + for (int i = 0; i < 50; ++i) { + const int rc = bthread::butex_wake(ctx.butex, true); + ASSERT_GE(rc, 0); + if (rc == 1) { + woke = true; + break; + } + if (ctx.done.load(std::memory_order_acquire) == 1) { + break; + } + usleep(1000); + } + ASSERT_TRUE(woke); + WaitJoinPinnedWaitTaskAndAssert(&ctx, tid, 0, 0, home); + + RunHookOnlyWakeCase(TEST_MODE_BUTEX_WAKE_WITHIN_NO_WAITER, ctx.butex, 0, 0); + bthread::butex_destroy(ctx.butex); +} + +void RunPinnedInterruptThenWithinWakeReturnsZeroCase() { + ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); + bthread::TaskControl& tc = GetSharedTwoWorkerTaskControl(); + bthread::TaskGroup* g = tc.choose_one_group(BTHREAD_TAG_DEFAULT); + ASSERT_NE(static_cast(NULL), g); + PrepareForCase(); + + PinnedWaitCtx ctx; + ctx.butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), ctx.butex); + static_cast*>(ctx.butex)->store(0, butil::memory_order_relaxed); + + bthread_t tid = INVALID_BTHREAD; + uint64_t home = 0; + StartPinnedWaitTaskAndWaitReady(g, &ctx, &tid, &home); + bool interrupt_sent = false; + for (int i = 0; i < 20 && ctx.done.load(std::memory_order_acquire) == 0; ++i) { + ASSERT_EQ(0, bthread_interrupt(tid)); + interrupt_sent = true; + if (WaitAtomicAtLeast(ctx.done, 1, 50)) { + break; + } + } + ASSERT_TRUE(interrupt_sent); + WaitJoinPinnedWaitTaskAndAssert(&ctx, tid, -1, EINTR, home); + + RunHookOnlyWakeCase(TEST_MODE_BUTEX_WAKE_WITHIN_NO_WAITER, 
ctx.butex, 0, 0); + bthread::butex_destroy(ctx.butex); +} + +void RunPinnedWithinWakeNoStealCase() { + ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); + bthread::TaskControl& tc = GetSharedTwoWorkerTaskControl(); + PrepareForCase(); + + bthread::TaskGroup* group_a = NULL; + bthread::TaskGroup* group_b = NULL; + ASSERT_TRUE(ChooseTwoDistinctGroups(tc, &group_a, &group_b)); + + void* butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), butex); + static_cast*>(butex)->store(0, butil::memory_order_relaxed); + g_state.butex_ptr.store(reinterpret_cast(butex), + std::memory_order_relaxed); + g_state.butex_expected_waiters.store(1, std::memory_order_relaxed); + + bthread_t waiter_tid = INVALID_BTHREAD; + StartTaskOnGroupAndFlush(group_a, &waiter_tid, TestPinnedButexLocalWaitTask, NULL); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_waiter_ready_count, 1, 5000)); + const uint64_t waiter_worker = + g_state.butex_waiter_worker_pthread.load(std::memory_order_relaxed); + ASSERT_NE(0u, waiter_worker); + g_state.target_hook_worker_pthread.store(waiter_worker, std::memory_order_relaxed); + g_state.mode.store(TEST_MODE_BUTEX_WAKE_WITHIN, std::memory_order_release); + tc.signal_task(2, BTHREAD_TAG_DEFAULT); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_wake_completed, 1, 5000)); + ASSERT_EQ(1, g_state.butex_wake_rc.load(std::memory_order_relaxed)); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_waiter_done_count, 1, 5000)); + ASSERT_EQ(1, g_state.butex_waiter_resume_count.load(std::memory_order_relaxed)); + ASSERT_EQ(waiter_worker, + g_state.butex_waiter_resume_worker_pthread.load( + std::memory_order_relaxed)); + + ASSERT_EQ(0, bthread_join(waiter_tid, NULL)); + g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + g_state.target_hook_worker_pthread.store(0, std::memory_order_relaxed); + QuiesceHookActionsAfterModeIdle(); + g_state.butex_ptr.store(0, std::memory_order_relaxed); + bthread::butex_destroy(butex); +} + +void 
RunPinnedWaiterNotStealableStressCase(int rounds) { + ASSERT_GT(rounds, 0); + for (int i = 0; i < rounds; ++i) { + RunPinnedWithinWakeNoStealCase(); + } +} + +class ActiveTaskTestEnvironment : public ::testing::Environment { +public: + void SetUp() override { + const char* child_mode = getenv("BRPC_ACTIVE_TASK_UT_CHILD_MODE"); + int expected = 0; + if (g_register_once.compare_exchange_strong(expected, 1, + std::memory_order_relaxed)) { + g_register_rc.store(RegisterTestActiveTaskType(), std::memory_order_relaxed); + } + if (child_mode != NULL) { + if (strcmp(child_mode, "register_after_init") == 0) { + _exit(ChildCheckRegisterAfterInitRejected()); + } + if (strcmp(child_mode, "local_worker_init_destroy_and_idle_wait_interval") == 0) { + _exit(ChildCheckLocalWorkerInitDestroyAndIdleWaitInterval()); + } + _exit(100); + } + } +}; + +::testing::Environment* const g_active_task_env = + ::testing::AddGlobalTestEnvironment(new ActiveTaskTestEnvironment); + +} // namespace + +TEST(BthreadActiveTaskTest, register_before_init_succeeds) { + ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); +} + +TEST(BthreadActiveTaskTest, butex_wake_within_outside_hook_is_rejected) { + void* butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), butex); + bthread_active_task_ctx_t fake_ctx; + memset(&fake_ctx, 0, sizeof(fake_ctx)); + fake_ctx.struct_size = sizeof(fake_ctx); + errno = 0; + ASSERT_EQ(-1, bthread_butex_wake_within(&fake_ctx, butex)); + ASSERT_EQ(EPERM, errno); + bthread::butex_destroy(butex); +} + +TEST(BthreadActiveTaskTest, butex_wake_within_local_wakes_waiter) { + RunButexWakeWithinCase(1, 1, 0); +} + +TEST(BthreadActiveTaskTest, butex_wake_within_null_butex_is_rejected_in_hook) { + RunHookOnlyWakeCase(TEST_MODE_BUTEX_WAKE_WITHIN_NULL, NULL, -1, EINVAL); +} + +TEST(BthreadActiveTaskTest, butex_wake_within_no_waiter_returns_zero_in_hook) { + void* butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), butex); + static_cast*>(butex)->store(0, 
butil::memory_order_relaxed); + RunHookOnlyWakeCase(TEST_MODE_BUTEX_WAKE_WITHIN_NO_WAITER, butex, 0, 0); + bthread::butex_destroy(butex); +} + +TEST(BthreadActiveTaskTest, butex_wake_within_multiple_waiters_rejected) { + RunButexWakeWithinCase(2, -1, EINVAL); +} + +TEST(BthreadActiveTaskTest, butex_wake_within_pthread_waiter_rejected) { + RunPthreadWaiterRejectedCase(); +} + +TEST(BthreadActiveTaskTest, busy_periodic_poll_can_wake_waiter_without_parking) { + RunBusyPeriodicPollWakeCase(); +} + +TEST(BthreadActiveTaskTest, request_flow_private_butex_is_passed_and_woken_locally) { + RunRequestFlowCase(false); +} + +TEST(BthreadActiveTaskTest, request_flow_busy_periodic_poll_wakes_waiter) { + RunRequestFlowCase(true); +} + +TEST(BthreadActiveTaskTest, local_worker_init_destroy_and_idle_wait_interval) { + ASSERT_EQ(0, RunChildMode("local_worker_init_destroy_and_idle_wait_interval")); +} + +TEST(BthreadActiveTaskTest, register_after_global_init_is_rejected) { + ASSERT_EQ(0, RunChildMode("register_after_init")); +} + +TEST(BthreadActiveTaskTest, stress_single_waiter_local_wake) { + for (int i = 0; i < 50; ++i) { + RunButexWakeWithinCase(1, 1, 0); + } +} + +TEST(BthreadActiveTaskTest, stress_multiple_waiters_rejected) { + for (int i = 0; i < 10; ++i) { + RunButexWakeWithinCase(2, -1, EINVAL); + } +} + +TEST(BthreadActiveTaskTest, strict_same_worker_rejects_cross_worker_wake) { + RunStrictCrossWorkerRejectCase(); +} + +TEST(BthreadActiveTaskTest, butex_wait_local_outside_bthread_rejected) { + void* butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), butex); + static_cast*>(butex)->store(0, butil::memory_order_relaxed); + errno = 0; + ASSERT_EQ(-1, bthread_butex_wait_local(butex, 0, NULL)); + ASSERT_EQ(EPERM, errno); + bthread::butex_destroy(butex); +} + +TEST(BthreadActiveTaskTest, butex_wait_local_immediate_ewouldblock_uses_implicit_wait_scope_pin) { + RunButexWaitLocalImmediateEwouldblockCase(); +} + +TEST(BthreadActiveTaskTest, 
pinned_waiter_active_task_within_resumes_on_home_worker) { + RunPinnedWithinWakeNoStealCase(); +} + +TEST(BthreadActiveTaskTest, pinned_waiter_active_task_within_not_stealable_stress) { + RunPinnedWaiterNotStealableStressCase(20); +} + +TEST(BthreadActiveTaskTest, pinned_waiter_timeout_resumes_on_home_worker) { + RunPinnedTimeoutCase(); +} + +TEST(BthreadActiveTaskTest, pinned_waiter_interrupt_resumes_on_home_worker) { + RunPinnedInterruptCase(); +} + +TEST(BthreadActiveTaskTest, generic_butex_wake_on_pinned_waiter_routes_to_home_worker) { + RunPinnedGenericWakeCase(); +} + +TEST(BthreadActiveTaskTest, wake_within_returns_zero_after_timeout_competition) { + RunPinnedTimeoutThenWithinWakeReturnsZeroCase(); +} + +TEST(BthreadActiveTaskTest, wake_within_returns_zero_after_interrupt_competition) { + RunPinnedInterruptThenWithinWakeReturnsZeroCase(); +} + +TEST(BthreadActiveTaskTest, wake_within_returns_zero_after_generic_wake_competition) { + RunPinnedGenericWakeThenWithinWakeReturnsZeroCase(); +} From 84aa1d51275ef98ead8fb6697e7f71bb68aa58ee Mon Sep 17 00:00:00 2001 From: MalikHou Date: Fri, 27 Feb 2026 13:31:55 +0800 Subject: [PATCH 2/8] fix --- test/bthread_active_task_unittest.cpp | 104 ++++++++++++++++---------- 1 file changed, 65 insertions(+), 39 deletions(-) diff --git a/test/bthread_active_task_unittest.cpp b/test/bthread_active_task_unittest.cpp index 0bc9834f01..19d56a068d 100644 --- a/test/bthread_active_task_unittest.cpp +++ b/test/bthread_active_task_unittest.cpp @@ -171,10 +171,8 @@ struct PinnedWaitCtx { ActiveTaskTestState g_state; std::atomic g_register_rc(-1); std::atomic g_register_once(0); -bthread::TaskControl* g_shared_single_worker_tc = NULL; -std::atomic g_shared_single_worker_tc_once(0); -bthread::TaskControl* g_shared_two_worker_tc = NULL; -std::atomic g_shared_two_worker_tc_once(0); +bthread::TaskControl* g_shared_tc = NULL; +std::atomic g_shared_tc_once(0); void ResetState() { g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); 
@@ -244,30 +242,28 @@ void PrepareForCase() { ResetState(); } -bthread::TaskControl& GetSharedSingleWorkerTaskControl() { +bthread::TaskControl& GetSharedTaskControl() { int expected = 0; - if (g_shared_single_worker_tc_once.compare_exchange_strong( + if (g_shared_tc_once.compare_exchange_strong( expected, 1, std::memory_order_relaxed)) { - g_shared_single_worker_tc = new bthread::TaskControl(); - CHECK(g_shared_single_worker_tc != NULL); - CHECK_EQ(0, g_shared_single_worker_tc->init(1)); - CHECK(WaitAtomicAtLeast(g_state.init_calls, 1, 5000)); + g_shared_tc = new bthread::TaskControl(); + CHECK(g_shared_tc != NULL); + // Keep one TaskControl instance in this process. Multiple TaskControl + // instances expose fixed-name bvars and conflict in CI builds with + // BRPC_BTHREAD_TRACER enabled. + CHECK_EQ(0, g_shared_tc->init(2)); + CHECK(WaitAtomicAtLeast(g_state.init_calls, 2, 5000)); } - CHECK(g_shared_single_worker_tc != NULL); - return *g_shared_single_worker_tc; + CHECK(g_shared_tc != NULL); + return *g_shared_tc; +} + +bthread::TaskControl& GetSharedSingleWorkerTaskControl() { + return GetSharedTaskControl(); } bthread::TaskControl& GetSharedTwoWorkerTaskControl() { - int expected = 0; - if (g_shared_two_worker_tc_once.compare_exchange_strong( - expected, 1, std::memory_order_relaxed)) { - g_shared_two_worker_tc = new bthread::TaskControl(); - CHECK(g_shared_two_worker_tc != NULL); - CHECK_EQ(0, g_shared_two_worker_tc->init(2)); - CHECK(WaitAtomicAtLeast(g_state.init_calls, 2, 5000)); - } - CHECK(g_shared_two_worker_tc != NULL); - return *g_shared_two_worker_tc; + return GetSharedTaskControl(); } void* TestButexWaitTask(void*) { @@ -329,7 +325,7 @@ void* TestRequestWaitTask(void* arg) { std::memory_order_relaxed); req->waiter_ready.store(1, std::memory_order_release); errno = 0; - const int rc = bthread::butex_wait(req->butex, 0, NULL); + const int rc = bthread_butex_wait_local(req->butex, 0, NULL); const int err = errno; req->wait_rc.store(rc, 
std::memory_order_relaxed); req->wait_errno.store(err, std::memory_order_relaxed); @@ -647,6 +643,8 @@ void RunButexWakeWithinCase(int waiter_count, int expected_wake_errno) { ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); bthread::TaskControl& tc = GetSharedSingleWorkerTaskControl(); + bthread::TaskGroup* tg = tc.choose_one_group(BTHREAD_TAG_DEFAULT); + ASSERT_NE(static_cast(NULL), tg); ASSERT_GT(waiter_count, 0); const bool allow_setup_race_retry = (waiter_count > 1 && expected_wake_rc < 0); const int max_attempts = allow_setup_race_retry ? 10 : 1; @@ -663,15 +661,35 @@ void RunButexWakeWithinCase(int waiter_count, std::vector tids(waiter_count, INVALID_BTHREAD); bool retry_case = false; - g_state.mode.store(TEST_MODE_BUTEX_WAKE_WITHIN, std::memory_order_release); for (int i = 0; i < waiter_count; ++i) { - ASSERT_EQ(0, bthread_start_background(&tids[i], NULL, - TestButexWaitTask, NULL)); + ASSERT_EQ(0, tg->start_background(&tids[i], NULL, + TestButexWaitTask, NULL)); } + tg->flush_nosignal_tasks(); ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_waiter_ready_count, waiter_count, 5000)); - tc.signal_task(1, BTHREAD_TAG_DEFAULT); + if (waiter_count == 1) { + const uint64_t waiter_worker = + g_state.butex_waiter_worker_pthread.load(std::memory_order_relaxed); + ASSERT_NE(0u, waiter_worker); + g_state.target_hook_worker_pthread.store(waiter_worker, + std::memory_order_relaxed); + } + g_state.mode.store(TEST_MODE_BUTEX_WAKE_WITHIN, std::memory_order_release); + tc.signal_task(2, BTHREAD_TAG_DEFAULT); ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_wake_started, 1, 5000)); + int observed_wake_rc = g_state.butex_wake_rc.load(std::memory_order_relaxed); + int observed_wake_errno = g_state.butex_wake_errno.load(std::memory_order_relaxed); + if (expected_wake_rc < 0) { + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_wake_completed, 1, 5000)) + << "waiters=" << waiter_count + << " ready=" << g_state.butex_waiter_ready_count.load(std::memory_order_relaxed) + << " 
started=" << g_state.butex_wake_started.load(std::memory_order_relaxed) + << " rc=" << g_state.butex_wake_rc.load(std::memory_order_relaxed) + << " errno=" << g_state.butex_wake_errno.load(std::memory_order_relaxed); + observed_wake_rc = g_state.butex_wake_rc.load(std::memory_order_relaxed); + observed_wake_errno = g_state.butex_wake_errno.load(std::memory_order_relaxed); + } if (expected_wake_rc < 0) { static_cast*>(butex)->store(1, butil::memory_order_release); for (int kick = 0; @@ -682,12 +700,10 @@ void RunButexWakeWithinCase(int waiter_count, usleep(1000); } } - const int wake_rc = g_state.butex_wake_rc.load(std::memory_order_relaxed); - const int wake_errno = g_state.butex_wake_errno.load(std::memory_order_relaxed); ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_waiter_done_count, waiter_count, 5000)) - << "wake_rc=" << wake_rc - << " wake_errno=" << wake_errno + << "wake_rc=" << observed_wake_rc + << " wake_errno=" << observed_wake_errno << " ready=" << g_state.butex_waiter_ready_count.load(std::memory_order_relaxed) << " resumed=" << g_state.butex_waiter_resume_count.load(std::memory_order_relaxed) << " harvest_calls=" << g_state.harvest_calls.load(std::memory_order_relaxed) @@ -698,16 +714,18 @@ void RunButexWakeWithinCase(int waiter_count, ASSERT_EQ(waiter_count, g_state.butex_waiter_resume_count.load(std::memory_order_relaxed)); g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + g_state.target_hook_worker_pthread.store(0, std::memory_order_relaxed); QuiesceHookActionsAfterModeIdle(); const int final_wake_rc = g_state.butex_wake_rc.load(std::memory_order_relaxed); - const int final_wake_errno = g_state.butex_wake_errno.load(std::memory_order_relaxed); - if (allow_setup_race_retry && final_wake_rc == 1 && attempt < max_attempts) { + if (allow_setup_race_retry && observed_wake_rc == 1 && attempt < max_attempts) { retry_case = true; } else { - ASSERT_EQ(expected_wake_rc, final_wake_rc); if (expected_wake_rc < 0) { - ASSERT_EQ(expected_wake_errno, 
final_wake_errno); + ASSERT_EQ(expected_wake_rc, observed_wake_rc); + ASSERT_EQ(expected_wake_errno, observed_wake_errno); + } else { + ASSERT_EQ(expected_wake_rc, final_wake_rc); } } bthread::butex_destroy(butex); @@ -791,7 +809,8 @@ struct ScopedPollEveryNSwitch { void RunBusyPeriodicPollWakeCase() { ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); bthread::TaskControl& tc = GetSharedSingleWorkerTaskControl(); - (void)tc; + bthread::TaskGroup* tg = tc.choose_one_group(BTHREAD_TAG_DEFAULT); + ASSERT_NE(static_cast(NULL), tg); PrepareForCase(); void* butex = bthread::butex_create(); @@ -804,12 +823,18 @@ void RunBusyPeriodicPollWakeCase() { ScopedPollEveryNSwitch guard(1); bthread_t busy_tid = INVALID_BTHREAD; bthread_t waiter_tid = INVALID_BTHREAD; - g_state.mode.store(TEST_MODE_BUSY_PERIODIC_POLL_WAKE, std::memory_order_release); - ASSERT_EQ(0, bthread_start_background(&busy_tid, NULL, TestBusyYieldTask, NULL)); + ASSERT_EQ(0, tg->start_background(&busy_tid, NULL, TestBusyYieldTask, NULL)); + tg->flush_nosignal_tasks(); ASSERT_TRUE(WaitAtomicAtLeast(g_state.busy_task_started, 1, 5000)); - ASSERT_EQ(0, bthread_start_background(&waiter_tid, NULL, TestButexWaitTask, NULL)); + ASSERT_EQ(0, tg->start_background(&waiter_tid, NULL, TestButexWaitTask, NULL)); + tg->flush_nosignal_tasks(); ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_waiter_ready_count, 1, 5000)); + const uint64_t waiter_worker = + g_state.butex_waiter_worker_pthread.load(std::memory_order_relaxed); + ASSERT_NE(0u, waiter_worker); + g_state.target_hook_worker_pthread.store(waiter_worker, std::memory_order_relaxed); + g_state.mode.store(TEST_MODE_BUSY_PERIODIC_POLL_WAKE, std::memory_order_release); ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_wake_started, 1, 5000)); ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_waiter_done_count, 1, 5000)); @@ -822,6 +847,7 @@ void RunBusyPeriodicPollWakeCase() { ASSERT_EQ(0, bthread_join(waiter_tid, NULL)); ASSERT_EQ(0, bthread_join(busy_tid, NULL)); 
g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + g_state.target_hook_worker_pthread.store(0, std::memory_order_relaxed); QuiesceHookActionsAfterModeIdle(); bthread::butex_destroy(butex); From 3f3c829bd7ee01733927a2460bd7e301e0322bbb Mon Sep 17 00:00:00 2001 From: MalikHou Date: Fri, 27 Feb 2026 21:12:06 +0800 Subject: [PATCH 3/8] fix --- .../install-all-dependencies/action.yml | 19 +- .../install-essential-dependencies/action.yml | 20 +- docs/cn/bthread_active_task.md | 14 +- src/bthread/butex.cpp | 84 ++++++++- src/bthread/butex.h | 8 + src/bthread/task_control.cpp | 3 +- src/bthread/task_group.cpp | 176 +++++++++--------- src/bthread/task_group.h | 20 +- src/bthread/task_group_inl.h | 4 + src/bthread/unstable.h | 4 + test/bthread_active_task_unittest.cpp | 143 ++++++++++---- 11 files changed, 362 insertions(+), 133 deletions(-) diff --git a/.github/actions/install-all-dependencies/action.yml b/.github/actions/install-all-dependencies/action.yml index 179f86cd4f..c400787f0b 100644 --- a/.github/actions/install-all-dependencies/action.yml +++ b/.github/actions/install-all-dependencies/action.yml @@ -2,7 +2,24 @@ runs: using: "composite" steps: - uses: ./.github/actions/install-essential-dependencies - - run: sudo apt-get install -y libunwind-dev libgoogle-glog-dev automake bison flex libboost-all-dev libevent-dev libtool pkg-config libibverbs1 libibverbs-dev + - run: | + set -euo pipefail + retry() { + local max_retries=3 + local attempt=1 + while true; do + if "$@"; then + return 0 + fi + if [ "${attempt}" -ge "${max_retries}" ]; then + return 1 + fi + sleep $((attempt * 10)) + attempt=$((attempt + 1)) + done + } + retry sudo env DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + libunwind-dev libgoogle-glog-dev automake bison flex libboost-all-dev libevent-dev libtool pkg-config libibverbs1 libibverbs-dev shell: bash - run: | wget https://archive.apache.org/dist/thrift/0.11.0/thrift-0.11.0.tar.gz && tar -xf 
thrift-0.11.0.tar.gz && cd thrift-0.11.0/ diff --git a/.github/actions/install-essential-dependencies/action.yml b/.github/actions/install-essential-dependencies/action.yml index 3411b7f7c1..47d542424a 100644 --- a/.github/actions/install-essential-dependencies/action.yml +++ b/.github/actions/install-essential-dependencies/action.yml @@ -3,5 +3,23 @@ runs: steps: - run: ulimit -c unlimited -S && sudo bash -c "echo 'core.%e.%p' > /proc/sys/kernel/core_pattern" shell: bash - - run: sudo apt-get install -y git g++ make libssl-dev libgflags-dev libprotobuf-dev libprotoc-dev protobuf-compiler libleveldb-dev + - run: | + set -euo pipefail + retry() { + local max_retries=3 + local attempt=1 + while true; do + if "$@"; then + return 0 + fi + if [ "${attempt}" -ge "${max_retries}" ]; then + return 1 + fi + sleep $((attempt * 10)) + attempt=$((attempt + 1)) + done + } + retry sudo apt-get -o Acquire::Retries=3 update + retry sudo env DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + git g++ make libssl-dev libgflags-dev libprotobuf-dev libprotoc-dev protobuf-compiler libleveldb-dev shell: bash diff --git a/docs/cn/bthread_active_task.md b/docs/cn/bthread_active_task.md index 4291add01c..81868bc55c 100644 --- a/docs/cn/bthread_active_task.md +++ b/docs/cn/bthread_active_task.md @@ -38,7 +38,7 @@ - `harvest` 返回值语义:`0` 表示正常;`1` 表示本轮 worker loop 跳过一次 `ParkingLot::wait`(立即重试)。 - `bthread_butex_wake_within(ctx, butex)` 只允许在 active-task `harvest` 回调中调用。 - `bthread_butex_wait_local(...)` 内部会对本次 wait 启用 **隐式 wait-scope 本地化 pin**: - - 从进入 wait 到返回(成功/超时/中断/被普通 `butex_wake*` 唤醒)这一段,恢复会被路由回 home worker + - 从进入 wait 到返回(成功/超时/中断)这一段,恢复会被路由回 home worker - 恢复前不会被 steal - 返回后 task 恢复默认调度行为(后续 `yield` 仍可能迁移) - `bthread_butex_wake_within` 只适用于“每请求私有 butex(单 waiter)”模型: @@ -258,7 +258,10 @@ Active-task `harvest` 回调会在 worker 调度循环的多个内部时机被 - active-task `bthread_butex_wake_within(...)` - timeout(TimerThread) - interruption(`bthread_interrupt` / `bthread_stop`) -- 普通 
`butex_wake*`(即使误用普通 wake,也会按 pin 语义回 home worker) +- `bthread_butex_wake_within(...)` 是外部允许的唯一正常唤醒路径(strict 模式) +- timeout / interruption 是 runtime 内部路径 + +strict 模式下,普通 `butex_wake*` 命中 pinned waiter 会返回 `-1/EINVAL`(不做隐式回退)。 因此在这次 `bthread_butex_wait_local(...)` 调用的生命周期内: @@ -293,8 +296,15 @@ Active-task `harvest` 回调会在 worker 调度循环的多个内部时机被 - 检查是否误用了多 waiter butex - 检查是否在 pthread waiter 上使用了 within wake - 检查 tag / `TaskControl` 是否匹配 + - 检查是否对 `bthread_butex_wait_local` 的 waiter 误用了普通 `butex_wake*` - 检查 completion 是否被错误 worker 的 `harvest` 收割(ownership/routing 问题) +可观测计数(累计 bvar): + +- `bthread_butex_strict_reject_count`:普通 `butex_wake*` 命中 pinned waiter 被 strict 拒绝次数 +- `bthread_butex_within_no_waiter_count`:`bthread_butex_wake_within` 返回 `0`(无 waiter)次数 +- `bthread_butex_within_invalid_count`:`bthread_butex_wake_within` 返回 `-1/EINVAL` 次数 + ## 注意事项(务必遵守) - `bthread_register_active_task_type()` 必须在 bthread/brpc 初始化前调用 diff --git a/src/bthread/butex.cpp b/src/bthread/butex.cpp index ffa43f7e5f..46b09708be 100644 --- a/src/bthread/butex.cpp +++ b/src/bthread/butex.cpp @@ -24,11 +24,12 @@ #include "butil/macros.h" #include "butil/containers/flat_map.h" #include "butil/containers/linked_list.h" // LinkNode -#ifdef SHOW_BTHREAD_BUTEX_WAITER_COUNT_IN_VARS #include "butil/memory/singleton_on_pthread_once.h" +#ifdef SHOW_BTHREAD_BUTEX_WAITER_COUNT_IN_VARS #endif #include "butil/logging.h" #include "butil/object_pool.h" +#include "bvar/bvar.h" #include "bthread/errno.h" // EWOULDBLOCK #include "bthread/sys_futex.h" // futex_* #include "bthread/processor.h" // cpu_relax @@ -68,6 +69,33 @@ inline bvar::Adder& butex_waiter_count() { } #endif +struct ButexStrictRejectCount : public bvar::Adder { + ButexStrictRejectCount() + : bvar::Adder("bthread_butex_strict_reject_count") {} +}; + +struct ButexWithinNoWaiterCount : public bvar::Adder { + ButexWithinNoWaiterCount() + : bvar::Adder("bthread_butex_within_no_waiter_count") {} +}; + +struct ButexWithinInvalidCount : public 
bvar::Adder { + ButexWithinInvalidCount() + : bvar::Adder("bthread_butex_within_invalid_count") {} +}; + +inline bvar::Adder& butex_strict_reject_count() { + return *butil::get_leaky_singleton(); +} + +inline bvar::Adder& butex_within_no_waiter_count() { + return *butil::get_leaky_singleton(); +} + +inline bvar::Adder& butex_within_invalid_count() { + return *butil::get_leaky_singleton(); +} + enum WaiterState { WAITER_STATE_NONE, WAITER_STATE_READY, @@ -309,8 +337,34 @@ inline void run_in_local_task_group(TaskGroup* g, TaskMeta* next_meta, bool nosi } } +inline bool is_pinned_waiter(const ButexWaiter* bw) { + if (bw == NULL || bw->tid == 0) { + return false; + } + const ButexBthreadWaiter* bbw = static_cast(bw); + const TaskMeta* meta = bbw->task_meta; + return meta != NULL && meta->local_pin_enabled && meta->local_pin_depth > 0; +} + +template +inline bool reject_if_selected_contains_pinned(ButexWaiterList* waiters, + const ShouldCheck& should_check) { + size_t index = 0; + for (butil::LinkNode* p = waiters->head(); + p != waiters->end(); p = p->next(), ++index) { + ButexWaiter* bw = p->value(); + if (should_check(bw, index) && is_pinned_waiter(bw)) { + butex_strict_reject_count() << 1; + errno = EINVAL; + return true; + } + } + return false; +} + int butex_wake_to_task_group(void* arg, TaskGroup* target_group) { if (arg == NULL || target_group == NULL) { + butex_within_invalid_count() << 1; errno = EINVAL; return -1; } @@ -319,15 +373,18 @@ int butex_wake_to_task_group(void* arg, TaskGroup* target_group) { { BAIDU_SCOPED_LOCK(b->waiter_lock); if (b->waiters.empty()) { + butex_within_no_waiter_count() << 1; return 0; } butil::LinkNode* head = b->waiters.head(); if (head->next() != b->waiters.end()) { + butex_within_invalid_count() << 1; errno = EINVAL; return -1; } ButexWaiter* bw = head->value(); if (bw->tid == 0) { + butex_within_invalid_count() << 1; errno = EINVAL; return -1; } @@ -335,6 +392,7 @@ int butex_wake_to_task_group(void* arg, TaskGroup* 
target_group) { if (bbw->home_group != target_group || bbw->control != target_group->control() || bbw->tag != target_group->tag()) { + butex_within_invalid_count() << 1; errno = EINVAL; return -1; } @@ -356,6 +414,11 @@ int butex_wake(void* arg, bool nosignal) { return 0; } front = b->waiters.head()->value(); + if (is_pinned_waiter(front)) { + butex_strict_reject_count() << 1; + errno = EINVAL; + return -1; + } front->RemoveFromList(); front->container.store(NULL, butil::memory_order_relaxed); } @@ -381,6 +444,13 @@ int butex_wake_n(void* arg, size_t n, bool nosignal) { ButexWaiterList pthread_waiters; { BAIDU_SCOPED_LOCK(b->waiter_lock); + if (reject_if_selected_contains_pinned( + &b->waiters, + [n](const ButexWaiter*, size_t index) { + return n == 0 || index < n; + })) { + return -1; + } for (size_t i = 0; (n == 0 || i < n) && !b->waiters.empty(); ++i) { ButexWaiter* bw = b->waiters.head()->value(); bw->RemoveFromList(); @@ -450,6 +520,13 @@ int butex_wake_except(void* arg, bthread_t excluded_bthread) { { ButexWaiter* excluded_waiter = NULL; BAIDU_SCOPED_LOCK(b->waiter_lock); + if (reject_if_selected_contains_pinned( + &b->waiters, + [excluded_bthread](const ButexWaiter* bw, size_t) { + return bw->tid != 0 && bw->tid != excluded_bthread; + })) { + return -1; + } while (!b->waiters.empty()) { ButexWaiter* bw = b->waiters.head()->value(); bw->RemoveFromList(); @@ -515,6 +592,11 @@ int butex_requeue(void* arg, void* arg2) { if (b->waiters.empty()) { return 0; } + if (reject_if_selected_contains_pinned( + &b->waiters, + [](const ButexWaiter*, size_t) { return true; })) { + return -1; + } front = b->waiters.head()->value(); front->RemoveFromList(); diff --git a/src/bthread/butex.h b/src/bthread/butex.h index 2e7a195d7f..ef2e4eed6b 100644 --- a/src/bthread/butex.h +++ b/src/bthread/butex.h @@ -53,11 +53,15 @@ void butex_destroy(void* butex); // Wake up at most 1 thread waiting on |butex|. // Returns # of threads woken up. 
+// Returns -1 and sets errno=EINVAL when the selected waiter is in +// bthread_butex_wait_local() strict pinned scope. int butex_wake(void* butex, bool nosignal = false); // Wake up all threads waiting on |butex| if n is zero, // Otherwise, wake up at most n thread waiting on |butex|. // Returns # of threads woken up. +// Returns -1 and sets errno=EINVAL when selected waiters include strict +// pinned waiters from bthread_butex_wait_local(). int butex_wake_n(void* butex, size_t n, bool nosignal = false); // Wake up all threads waiting on |butex|. @@ -67,6 +71,8 @@ int butex_wake_all(void* butex, bool nosignal = false); // Wake up all threads waiting on |butex| except a bthread whose identifier // is |excluded_bthread|. This function does not yield. // Returns # of threads woken up. +// Returns -1 and sets errno=EINVAL when selected waiters include strict +// pinned waiters from bthread_butex_wait_local(). int butex_wake_except(void* butex, bthread_t excluded_bthread); // Internal helper used by active-task within wake APIs. Explicitly enqueue the @@ -78,6 +84,8 @@ int butex_wake_to_task_group(void* butex, TaskGroup* target_group); // Wake up at most 1 thread waiting on |butex1|, let all other threads wait // on |butex2| instead. // Returns # of threads woken up. +// Returns -1 and sets errno=EINVAL when source queue contains strict pinned +// waiters from bthread_butex_wait_local(). 
int butex_requeue(void* butex1, void* butex2); // Atomically wait on |butex| if *butex equals |expected_value|, until the diff --git a/src/bthread/task_control.cpp b/src/bthread/task_control.cpp index fbcdedf3e9..d7fdb79a26 100644 --- a/src/bthread/task_control.cpp +++ b/src/bthread/task_control.cpp @@ -662,7 +662,8 @@ int64_t TaskControl::get_cumulated_signal_count() { BAIDU_SCOPED_LOCK(_modify_group_mutex); for_each_task_group([&](TaskGroup* g) { if (g) { - c += g->_nsignaled + g->_remote_nsignaled; + c += g->_nsignaled + g->_remote_nsignaled + + g->_pinned_remote_nsignaled; } }); return c; diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index 785c6c7260..807b254ab9 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -235,6 +235,7 @@ int TaskGroup::butex_wake_within_active_task(const bthread_active_task_ctx_t* ct int TaskGroup::init_active_tasks_for_worker() { _active_task_instances.clear(); + _has_active_task_harvest = false; if (_control == NULL || _control->_active_task_types.empty()) { return 0; } @@ -284,6 +285,9 @@ int TaskGroup::init_active_tasks_for_worker() { inst.worker_local = worker_local; } inst.initialized = true; + if (inst.type.harvest != NULL) { + _has_active_task_harvest = true; + } } return 0; } @@ -306,6 +310,7 @@ void TaskGroup::destroy_active_tasks_for_worker() { inst.worker_local = NULL; } _active_task_instances.clear(); + _has_active_task_harvest = false; } void TaskGroup::run_active_tasks_harvest(bool* skip_park) { @@ -340,7 +345,7 @@ void TaskGroup::run_active_tasks_harvest(bool* skip_park) { } bool TaskGroup::wait_task(bthread_t* tid) { - if (__builtin_expect(_active_task_instances.empty(), 1)) { + if (__builtin_expect(!_has_active_task_harvest, 1)) { do { #ifndef BTHREAD_DONT_SAVE_PARKING_STATE if (_last_pl_state.stopped()) { @@ -364,6 +369,9 @@ bool TaskGroup::wait_task(bthread_t* tid) { } do { + if (_pl->get_state().stopped()) { + return false; + } if (pop_next_task_local_first(tid)) { 
return true; } @@ -374,6 +382,9 @@ bool TaskGroup::wait_task(bthread_t* tid) { } const int64_t idle_wait_ns = FLAGS_bthread_active_task_idle_wait_ns; if (harvest_skip_park || idle_wait_ns == 0) { + if (_pl->get_state().stopped()) { + return false; + } continue; } timespec timeout_ts{}; @@ -429,7 +440,7 @@ void TaskGroup::run_main_task() { if (_cur_meta->tid != _main_tid) { task_runner(1/*skip remained*/); } - if (!_active_task_instances.empty()) { + if (_has_active_task_harvest) { const int every_nswitch = FLAGS_bthread_active_task_poll_every_nswitch; if (every_nswitch > 0 && _nswitch - last_active_task_periodic_poll_nswitch >= @@ -901,6 +912,23 @@ bool TaskGroup::is_locally_pinned_task(const TaskMeta* meta) { meta->local_pin_home_group != NULL; } +bool TaskGroup::route_to_pinned_home(TaskMeta* meta, bool nosignal) { + if (!is_locally_pinned_task(meta)) { + return false; + } + TaskGroup* home = meta->local_pin_home_group; + if (home == NULL) { + LOG(FATAL) << "Pinned task " << meta->tid << " has NULL home_group"; + return true; + } + if (BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group) == home) { + home->ready_to_run_pinned_local(meta, nosignal); + } else { + home->ready_to_run_pinned_remote(meta, nosignal); + } + return true; +} + bool TaskGroup::pop_next_task_local_first(bthread_t* tid) { if (_pinned_rq.pop(tid)) { return true; @@ -1093,14 +1121,7 @@ void TaskGroup::ready_to_run_local_raw(TaskMeta* meta, bool nosignal) { _control->_task_tracer.set_status(TASK_STATUS_READY, meta); #endif // BRPC_BTHREAD_TRACER push_rq(meta->tid); - if (nosignal) { - ++_num_nosignal; - } else { - const int additional_signal = _num_nosignal; - _num_nosignal = 0; - _nsignaled += 1 + additional_signal; - _control->signal_task(1 + additional_signal, _tag); - } + on_local_ready_enqueued(nosignal); } void TaskGroup::ready_to_run_pinned_local(TaskMeta* meta, bool nosignal) { @@ -1108,6 +1129,10 @@ void TaskGroup::ready_to_run_pinned_local(TaskMeta* meta, bool nosignal) { 
_control->_task_tracer.set_status(TASK_STATUS_READY, meta); #endif // BRPC_BTHREAD_TRACER push_pinned_rq(meta->tid); + on_local_ready_enqueued(nosignal); +} + +void TaskGroup::on_local_ready_enqueued(bool nosignal) { if (nosignal) { ++_num_nosignal; } else { @@ -1119,17 +1144,7 @@ void TaskGroup::ready_to_run_pinned_local(TaskMeta* meta, bool nosignal) { } void TaskGroup::ready_to_run(TaskMeta* meta, bool nosignal) { - if (is_locally_pinned_task(meta)) { - TaskGroup* home = meta->local_pin_home_group; - if (home == NULL) { - LOG(FATAL) << "Pinned task " << meta->tid << " has NULL home_group"; - return; - } - if (BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group) == home) { - home->ready_to_run_pinned_local(meta, nosignal); - } else { - home->ready_to_run_pinned_remote(meta, nosignal); - } + if (route_to_pinned_home(meta, nosignal)) { return; } ready_to_run_local_raw(meta, nosignal); @@ -1144,59 +1159,54 @@ void TaskGroup::flush_nosignal_tasks() { } } -void TaskGroup::ready_to_run_remote_raw(TaskMeta* meta, bool nosignal) { +void TaskGroup::on_remote_ready_enqueued( + TaskMeta* meta, + RemoteTaskQueue* rq, + int* num_nosignal, + int* nsignaled, + bool nosignal, + const char* rq_name, + void (TaskGroup::*flush_locked)(butil::Mutex&)) { #ifdef BRPC_BTHREAD_TRACER _control->_task_tracer.set_status(TASK_STATUS_READY, meta); #endif // BRPC_BTHREAD_TRACER - _remote_rq._mutex.lock(); - while (!_remote_rq.push_locked(meta->tid)) { - flush_nosignal_tasks_remote_locked(_remote_rq._mutex); - LOG_EVERY_SECOND(ERROR) << "_remote_rq is full, capacity=" - << _remote_rq.capacity(); + rq->_mutex.lock(); + while (!rq->push_locked(meta->tid)) { + (this->*flush_locked)(rq->_mutex); + LOG_EVERY_SECOND(ERROR) << rq_name << " is full, capacity=" + << rq->capacity(); ::usleep(1000); - _remote_rq._mutex.lock(); + rq->_mutex.lock(); } if (nosignal) { - ++_remote_num_nosignal; - _remote_rq._mutex.unlock(); + ++*num_nosignal; + rq->_mutex.unlock(); } else { - const int additional_signal = 
_remote_num_nosignal; - _remote_num_nosignal = 0; - _remote_nsignaled += 1 + additional_signal; - _remote_rq._mutex.unlock(); + const int additional_signal = *num_nosignal; + *num_nosignal = 0; + *nsignaled += 1 + additional_signal; + rq->_mutex.unlock(); _control->signal_task(1 + additional_signal, _tag); } } +void TaskGroup::ready_to_run_remote_raw(TaskMeta* meta, bool nosignal) { + on_remote_ready_enqueued(meta, &_remote_rq, + &_remote_num_nosignal, &_remote_nsignaled, + nosignal, "_remote_rq", + &TaskGroup::flush_nosignal_tasks_remote_locked); +} + void TaskGroup::ready_to_run_pinned_remote(TaskMeta* meta, bool nosignal) { -#ifdef BRPC_BTHREAD_TRACER - _control->_task_tracer.set_status(TASK_STATUS_READY, meta); -#endif // BRPC_BTHREAD_TRACER - (void)nosignal; // correctness first for pinned cross-thread wakeups. - _pinned_remote_rq._mutex.lock(); - while (!_pinned_remote_rq.push_locked(meta->tid)) { - _pinned_remote_rq._mutex.unlock(); - LOG_EVERY_SECOND(ERROR) << "_pinned_remote_rq is full, capacity=" - << _pinned_remote_rq.capacity(); - ::usleep(1000); - _pinned_remote_rq._mutex.lock(); - } - _pinned_remote_rq._mutex.unlock(); - _control->signal_task(1, _tag); + on_remote_ready_enqueued(meta, &_pinned_remote_rq, + &_pinned_remote_num_nosignal, + &_pinned_remote_nsignaled, + nosignal, "_pinned_remote_rq", + &TaskGroup::flush_nosignal_tasks_pinned_remote_locked); } void TaskGroup::ready_to_run_remote(TaskMeta* meta, bool nosignal) { - if (is_locally_pinned_task(meta)) { - TaskGroup* home = meta->local_pin_home_group; - if (home == NULL) { - LOG(FATAL) << "Pinned task " << meta->tid << " has NULL home_group"; - return; - } - if (BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group) == home) { - home->ready_to_run_pinned_local(meta, nosignal); - } else { - home->ready_to_run_pinned_remote(meta, nosignal); - } + if (route_to_pinned_home(meta, nosignal)) { return; } ready_to_run_remote_raw(meta, nosignal); @@ -1214,24 +1224,28 @@ void 
TaskGroup::flush_nosignal_tasks_remote_locked(butil::Mutex& locked_mutex) { _control->signal_task(val, _tag); } +void TaskGroup::flush_nosignal_tasks_pinned_remote_locked(butil::Mutex& locked_mutex) { + const int val = _pinned_remote_num_nosignal; + if (!val) { + locked_mutex.unlock(); + return; + } + _pinned_remote_num_nosignal = 0; + _pinned_remote_nsignaled += val; + locked_mutex.unlock(); + _control->signal_task(val, _tag); +} + void TaskGroup::ready_to_run_general(TaskMeta* meta, bool nosignal) { - if (is_locally_pinned_task(meta)) { - TaskGroup* home = meta->local_pin_home_group; - if (home == NULL) { - LOG(FATAL) << "Pinned task " << meta->tid << " has NULL home_group"; - return; - } - if (BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group) == home) { - home->ready_to_run_pinned_local(meta, nosignal); - } else { - home->ready_to_run_pinned_remote(meta, nosignal); - } + if (route_to_pinned_home(meta, nosignal)) { return; } if (tls_task_group == this) { - return ready_to_run(meta, nosignal); + // route_to_pinned_home() is already checked above. + return ready_to_run_local_raw(meta, nosignal); } - return ready_to_run_remote(meta, nosignal); + // route_to_pinned_home() is already checked above. 
+ return ready_to_run_remote_raw(meta, nosignal); } void TaskGroup::flush_nosignal_tasks_general() { @@ -1252,23 +1266,13 @@ void TaskGroup::ready_to_run_in_worker_ignoresignal(void* args_in) { } void TaskGroup::ready_to_run_ignoresignal_pinaware(TaskMeta* meta) { + if (route_to_pinned_home(meta, true)) { + return; + } #ifdef BRPC_BTHREAD_TRACER _control->_task_tracer.set_status(TASK_STATUS_READY, meta); #endif // BRPC_BTHREAD_TRACER - if (!is_locally_pinned_task(meta)) { - push_rq(meta->tid); - return; - } - TaskGroup* home = meta->local_pin_home_group; - if (home == NULL) { - LOG(FATAL) << "Pinned task " << meta->tid << " has NULL home_group"; - return; - } - if (this == home) { - push_pinned_rq(meta->tid); - return; - } - home->ready_to_run_pinned_remote(meta, true); + push_rq(meta->tid); } void TaskGroup::priority_to_run(void* args_in) { diff --git a/src/bthread/task_group.h b/src/bthread/task_group.h index bd86af58f3..d6eca3d8c6 100644 --- a/src/bthread/task_group.h +++ b/src/bthread/task_group.h @@ -191,6 +191,7 @@ class TaskGroup { // Push a bthread into the runqueue from another non-worker thread. 
void ready_to_run_remote(TaskMeta* meta, bool nosignal = false); void flush_nosignal_tasks_remote_locked(butil::Mutex& locked_mutex); + void flush_nosignal_tasks_pinned_remote_locked(butil::Mutex& locked_mutex); void flush_nosignal_tasks_remote(); // Automatically decide the caller is remote or local, and call @@ -331,10 +332,19 @@ friend class TaskControl; static void ready_to_run_in_worker(void*); static void ready_to_run_in_worker_ignoresignal(void*); static void priority_to_run(void*); + void on_local_ready_enqueued(bool nosignal); + void on_remote_ready_enqueued(TaskMeta* meta, + RemoteTaskQueue* rq, + int* num_nosignal, + int* nsignaled, + bool nosignal, + const char* rq_name, + void (TaskGroup::*flush_locked)(butil::Mutex&)); void ready_to_run_local_raw(TaskMeta* meta, bool nosignal); void ready_to_run_remote_raw(TaskMeta* meta, bool nosignal); void ready_to_run_pinned_local(TaskMeta* meta, bool nosignal); void ready_to_run_pinned_remote(TaskMeta* meta, bool nosignal); + bool route_to_pinned_home(TaskMeta* meta, bool nosignal); void ready_to_run_ignoresignal_pinaware(TaskMeta* meta); static bool is_locally_pinned_task(const TaskMeta* meta); @@ -350,13 +360,6 @@ friend class TaskControl; return _control->steal_task(tid, &_steal_seed, _steal_offset); } - bool steal_task(bthread_t* tid) { - if (_remote_rq.pop(tid)) { - return true; - } - return steal_task_from_others(tid); - } - void set_tag(bthread_tag_t tag) { _tag = tag; } void set_pl(ParkingLot* pl) { _pl = pl; } @@ -408,6 +411,8 @@ friend class TaskControl; RemoteTaskQueue _pinned_remote_rq; int _remote_num_nosignal{0}; int _remote_nsignaled{0}; + int _pinned_remote_num_nosignal{0}; + int _pinned_remote_nsignaled{0}; int _sched_recursive_guard{0}; // tag of this taskgroup @@ -417,6 +422,7 @@ friend class TaskControl; pthread_t _tid{}; uint32_t _worker_index{0}; int32_t _bound_cpu{-1}; + bool _has_active_task_harvest{false}; std::vector _active_task_instances; }; diff --git a/src/bthread/task_group_inl.h 
b/src/bthread/task_group_inl.h index 8f42c4e267..5425809a61 100644 --- a/src/bthread/task_group_inl.h +++ b/src/bthread/task_group_inl.h @@ -115,6 +115,10 @@ inline void TaskGroup::flush_nosignal_tasks_remote() { _remote_rq._mutex.lock(); flush_nosignal_tasks_remote_locked(_remote_rq._mutex); } + if (_pinned_remote_num_nosignal) { + _pinned_remote_rq._mutex.lock(); + flush_nosignal_tasks_pinned_remote_locked(_pinned_remote_rq._mutex); + } } } // namespace bthread diff --git a/src/bthread/unstable.h b/src/bthread/unstable.h index 2000910b89..89676bcce5 100644 --- a/src/bthread/unstable.h +++ b/src/bthread/unstable.h @@ -169,6 +169,10 @@ extern int bthread_butex_wake_within(const bthread_active_task_ctx_t* ctx, // resumed bthread is routed back to the home worker and is not stealable before // bthread_butex_wait_local() returns. // +// Strict mode: external wakeup for this waiter must use +// bthread_butex_wake_within(). Generic butex_wake* APIs do not provide a +// fallback path for pinned waiters and may fail with EINVAL. +// // Returns 0 on success, -1 otherwise and errno is set. 
// - EPERM: not running inside a normal bthread worker task extern int bthread_butex_wait_local(void* butex, int expected_value, diff --git a/test/bthread_active_task_unittest.cpp b/test/bthread_active_task_unittest.cpp index 19d56a068d..48f0acc2b1 100644 --- a/test/bthread_active_task_unittest.cpp +++ b/test/bthread_active_task_unittest.cpp @@ -59,6 +59,13 @@ enum TestMode { TEST_MODE_BUTEX_WAKE_WITHIN_STRICT_CROSS_WORKER_REJECT = 9, }; +enum GenericWakeVariant { + GENERIC_WAKE = 0, + GENERIC_WAKE_N = 1, + GENERIC_WAKE_EXCEPT = 2, + GENERIC_REQUEUE = 3, +}; + struct PerWorkerState { }; @@ -1102,7 +1109,25 @@ void WaitJoinPinnedWaitTaskAndAssert(PinnedWaitCtx* ctx, ctx->resume_worker_pthread.load(std::memory_order_relaxed)); } -void RunPinnedGenericWakeCase() { +int CallGenericWakeVariant(GenericWakeVariant variant, void* butex, + void* requeue_target) { + switch (variant) { + case GENERIC_WAKE: + return bthread::butex_wake(butex, true); + case GENERIC_WAKE_N: + return bthread::butex_wake_n(butex, 1, true); + case GENERIC_WAKE_EXCEPT: + return bthread::butex_wake_except(butex, INVALID_BTHREAD); + case GENERIC_REQUEUE: + CHECK(requeue_target != NULL); + return bthread::butex_requeue(butex, requeue_target); + default: + CHECK(false) << "unknown generic wake variant=" << variant; + return -1; + } +} + +void RunPinnedGenericWakeRejectedCase(GenericWakeVariant variant) { ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); bthread::TaskControl& tc = GetSharedTwoWorkerTaskControl(); bthread::TaskGroup* g1 = NULL; @@ -1114,26 +1139,46 @@ void RunPinnedGenericWakeCase() { ctx.butex = bthread::butex_create(); ASSERT_NE(static_cast(NULL), ctx.butex); static_cast*>(ctx.butex)->store(0, butil::memory_order_relaxed); + void* requeue_butex = NULL; + if (variant == GENERIC_REQUEUE) { + requeue_butex = bthread::butex_create(); + ASSERT_NE(static_cast(NULL), requeue_butex); + static_cast*>(requeue_butex)->store( + 0, butil::memory_order_relaxed); + } bthread_t tid = 
INVALID_BTHREAD; uint64_t home = 0; StartPinnedWaitTaskAndWaitReady(g1, &ctx, &tid, &home); - bool woke = false; + bool rejected = false; for (int i = 0; i < 50; ++i) { - const int rc = bthread::butex_wake(ctx.butex, true); - ASSERT_GE(rc, 0); - if (rc == 1) { - woke = true; + errno = 0; + const int rc = CallGenericWakeVariant(variant, ctx.butex, requeue_butex); + const int err = errno; + if (rc == -1 && err == EINVAL) { + rejected = true; break; } - if (ctx.done.load(std::memory_order_acquire) == 1) { + ASSERT_EQ(0, rc); + usleep(1000); + } + ASSERT_TRUE(rejected); + ASSERT_EQ(0, ctx.done.load(std::memory_order_acquire)); + + bool interrupt_sent = false; + for (int i = 0; i < 20 && ctx.done.load(std::memory_order_acquire) == 0; ++i) { + ASSERT_EQ(0, bthread_interrupt(tid)); + interrupt_sent = true; + if (WaitAtomicAtLeast(ctx.done, 1, 50)) { break; } - usleep(1000); } - ASSERT_TRUE(woke) << "pinned generic butex_wake did not observe waiter in queue"; - WaitJoinPinnedWaitTaskAndAssert(&ctx, tid, 0, 0, home); + ASSERT_TRUE(interrupt_sent); + WaitJoinPinnedWaitTaskAndAssert(&ctx, tid, -1, EINTR, home); + if (requeue_butex != NULL) { + bthread::butex_destroy(requeue_butex); + } bthread::butex_destroy(ctx.butex); } @@ -1233,7 +1278,7 @@ void RunPinnedTimeoutThenWithinWakeReturnsZeroCase() { bthread::butex_destroy(ctx.butex); } -void RunPinnedGenericWakeThenWithinWakeReturnsZeroCase() { +void RunPinnedGenericWakeRejectedThenWithinWakeCase() { ASSERT_EQ(0, g_register_rc.load(std::memory_order_relaxed)); bthread::TaskControl& tc = GetSharedTwoWorkerTaskControl(); bthread::TaskGroup* g1 = NULL; @@ -1241,32 +1286,50 @@ void RunPinnedGenericWakeThenWithinWakeReturnsZeroCase() { ASSERT_TRUE(ChooseTwoDistinctGroups(tc, &g1, &g2)); PrepareForCase(); - PinnedWaitCtx ctx; - ctx.butex = bthread::butex_create(); - ASSERT_NE(static_cast(NULL), ctx.butex); - static_cast*>(ctx.butex)->store(0, butil::memory_order_relaxed); + void* butex = bthread::butex_create(); + 
ASSERT_NE(static_cast(NULL), butex); + static_cast*>(butex)->store(0, butil::memory_order_relaxed); + g_state.butex_ptr.store(reinterpret_cast(butex), std::memory_order_relaxed); + g_state.butex_expected_waiters.store(1, std::memory_order_relaxed); - bthread_t tid = INVALID_BTHREAD; - uint64_t home = 0; - StartPinnedWaitTaskAndWaitReady(g1, &ctx, &tid, &home); - bool woke = false; + bthread_t waiter_tid = INVALID_BTHREAD; + StartTaskOnGroupAndFlush(g1, &waiter_tid, TestPinnedButexLocalWaitTask, NULL); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_waiter_ready_count, 1, 5000)); + const uint64_t waiter_worker = + g_state.butex_waiter_worker_pthread.load(std::memory_order_relaxed); + ASSERT_NE(0u, waiter_worker); + + bool rejected = false; for (int i = 0; i < 50; ++i) { - const int rc = bthread::butex_wake(ctx.butex, true); - ASSERT_GE(rc, 0); - if (rc == 1) { - woke = true; - break; - } - if (ctx.done.load(std::memory_order_acquire) == 1) { + errno = 0; + const int rc = bthread::butex_wake(butex, true); + const int err = errno; + if (rc == -1 && err == EINVAL) { + rejected = true; break; } + ASSERT_EQ(0, rc); usleep(1000); } - ASSERT_TRUE(woke); - WaitJoinPinnedWaitTaskAndAssert(&ctx, tid, 0, 0, home); + ASSERT_TRUE(rejected); + ASSERT_EQ(0, g_state.butex_waiter_done_count.load(std::memory_order_relaxed)); - RunHookOnlyWakeCase(TEST_MODE_BUTEX_WAKE_WITHIN_NO_WAITER, ctx.butex, 0, 0); - bthread::butex_destroy(ctx.butex); + g_state.target_hook_worker_pthread.store(waiter_worker, std::memory_order_relaxed); + g_state.mode.store(TEST_MODE_BUTEX_WAKE_WITHIN, std::memory_order_release); + tc.signal_task(2, BTHREAD_TAG_DEFAULT); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_wake_completed, 1, 5000)); + ASSERT_EQ(1, g_state.butex_wake_rc.load(std::memory_order_relaxed)); + ASSERT_TRUE(WaitAtomicAtLeast(g_state.butex_waiter_done_count, 1, 5000)); + ASSERT_EQ(1, g_state.butex_waiter_resume_count.load(std::memory_order_relaxed)); + ASSERT_EQ(waiter_worker, + 
g_state.butex_waiter_resume_worker_pthread.load(std::memory_order_relaxed)); + + ASSERT_EQ(0, bthread_join(waiter_tid, NULL)); + g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + g_state.target_hook_worker_pthread.store(0, std::memory_order_relaxed); + QuiesceHookActionsAfterModeIdle(); + g_state.butex_ptr.store(0, std::memory_order_relaxed); + bthread::butex_destroy(butex); } void RunPinnedInterruptThenWithinWakeReturnsZeroCase() { @@ -1479,8 +1542,20 @@ TEST(BthreadActiveTaskTest, pinned_waiter_interrupt_resumes_on_home_worker) { RunPinnedInterruptCase(); } -TEST(BthreadActiveTaskTest, generic_butex_wake_on_pinned_waiter_routes_to_home_worker) { - RunPinnedGenericWakeCase(); +TEST(BthreadActiveTaskTest, generic_butex_wake_on_pinned_waiter_is_rejected) { + RunPinnedGenericWakeRejectedCase(GENERIC_WAKE); +} + +TEST(BthreadActiveTaskTest, generic_butex_wake_n_on_pinned_waiter_is_rejected) { + RunPinnedGenericWakeRejectedCase(GENERIC_WAKE_N); +} + +TEST(BthreadActiveTaskTest, generic_butex_wake_except_on_pinned_waiter_is_rejected) { + RunPinnedGenericWakeRejectedCase(GENERIC_WAKE_EXCEPT); +} + +TEST(BthreadActiveTaskTest, generic_butex_requeue_on_pinned_waiter_is_rejected) { + RunPinnedGenericWakeRejectedCase(GENERIC_REQUEUE); } TEST(BthreadActiveTaskTest, wake_within_returns_zero_after_timeout_competition) { @@ -1491,6 +1566,6 @@ TEST(BthreadActiveTaskTest, wake_within_returns_zero_after_interrupt_competition RunPinnedInterruptThenWithinWakeReturnsZeroCase(); } -TEST(BthreadActiveTaskTest, wake_within_returns_zero_after_generic_wake_competition) { - RunPinnedGenericWakeThenWithinWakeReturnsZeroCase(); +TEST(BthreadActiveTaskTest, within_wake_succeeds_after_generic_wake_rejected) { + RunPinnedGenericWakeRejectedThenWithinWakeCase(); } From 6a53a7f248d94611c6c0d3c0117a97be12673e42 Mon Sep 17 00:00:00 2001 From: MalikHou Date: Fri, 27 Feb 2026 21:17:48 +0800 Subject: [PATCH 4/8] fix --- .../install-all-dependencies/action.yml | 19 +----------------- 
.../install-essential-dependencies/action.yml | 20 +------------------ 2 files changed, 2 insertions(+), 37 deletions(-) diff --git a/.github/actions/install-all-dependencies/action.yml b/.github/actions/install-all-dependencies/action.yml index c400787f0b..179f86cd4f 100644 --- a/.github/actions/install-all-dependencies/action.yml +++ b/.github/actions/install-all-dependencies/action.yml @@ -2,24 +2,7 @@ runs: using: "composite" steps: - uses: ./.github/actions/install-essential-dependencies - - run: | - set -euo pipefail - retry() { - local max_retries=3 - local attempt=1 - while true; do - if "$@"; then - return 0 - fi - if [ "${attempt}" -ge "${max_retries}" ]; then - return 1 - fi - sleep $((attempt * 10)) - attempt=$((attempt + 1)) - done - } - retry sudo env DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - libunwind-dev libgoogle-glog-dev automake bison flex libboost-all-dev libevent-dev libtool pkg-config libibverbs1 libibverbs-dev + - run: sudo apt-get install -y libunwind-dev libgoogle-glog-dev automake bison flex libboost-all-dev libevent-dev libtool pkg-config libibverbs1 libibverbs-dev shell: bash - run: | wget https://archive.apache.org/dist/thrift/0.11.0/thrift-0.11.0.tar.gz && tar -xf thrift-0.11.0.tar.gz && cd thrift-0.11.0/ diff --git a/.github/actions/install-essential-dependencies/action.yml b/.github/actions/install-essential-dependencies/action.yml index 47d542424a..3411b7f7c1 100644 --- a/.github/actions/install-essential-dependencies/action.yml +++ b/.github/actions/install-essential-dependencies/action.yml @@ -3,23 +3,5 @@ runs: steps: - run: ulimit -c unlimited -S && sudo bash -c "echo 'core.%e.%p' > /proc/sys/kernel/core_pattern" shell: bash - - run: | - set -euo pipefail - retry() { - local max_retries=3 - local attempt=1 - while true; do - if "$@"; then - return 0 - fi - if [ "${attempt}" -ge "${max_retries}" ]; then - return 1 - fi - sleep $((attempt * 10)) - attempt=$((attempt + 1)) - done - } - retry sudo 
apt-get -o Acquire::Retries=3 update - retry sudo env DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - git g++ make libssl-dev libgflags-dev libprotobuf-dev libprotoc-dev protobuf-compiler libleveldb-dev + - run: sudo apt-get install -y git g++ make libssl-dev libgflags-dev libprotobuf-dev libprotoc-dev protobuf-compiler libleveldb-dev shell: bash From 05dd76f4b6903fa144d67172facb98e9c36980bd Mon Sep 17 00:00:00 2001 From: MalikHou Date: Sat, 28 Feb 2026 17:11:17 +0800 Subject: [PATCH 5/8] fix --- docs/cn/bthread_active_task.md | 5 +- src/bthread/butex.cpp | 8 + src/bthread/task_group.cpp | 4 + src/bthread/task_group.h | 4 + src/bthread/unstable.h | 2 + test/bthread_active_task_unittest.cpp | 206 +++++++++++++++++++++++++- 6 files changed, 227 insertions(+), 2 deletions(-) diff --git a/docs/cn/bthread_active_task.md b/docs/cn/bthread_active_task.md index 81868bc55c..ea99099403 100644 --- a/docs/cn/bthread_active_task.md +++ b/docs/cn/bthread_active_task.md @@ -44,6 +44,7 @@ - `bthread_butex_wake_within` 只适用于“每请求私有 butex(单 waiter)”模型: - 0 waiter -> 返回 `0` - 1 waiter(同 `TaskControl`、同 tag 的 bthread waiter)-> 返回 `1` + - 当前 worker 的 pinned local runqueue 已满 -> 返回 `-1` 且 `errno=EAGAIN`(应在下一轮 `harvest` 重试) - 否则返回 `-1` 且 `errno=EINVAL`(多 waiter / pthread waiter / 跨 tag / 跨 `TaskControl`) ## 快速上手(推荐接入顺序) @@ -214,12 +215,14 @@ static int IoHarvest( - 返回 `0`:当前 butex 上没有 waiter(例如 timeout/取消竞争后已无人等待) - 返回 `-1`: - `EPERM`:不在 active-task `harvest` 回调里调用 + - `EAGAIN`:当前 worker 的 pinned local runqueue 满,当前轮无法安全入队,应下一轮重试 - `EINVAL`:butex 不满足 within 语义(多 waiter / pthread waiter / 跨 tag / 跨 `TaskControl`),或 wrong-worker invariant 被触发 建议: - `wake_rc == 0` 当作**合法分支**处理(不是异常) -- `wake_rc < 0` 视为**用法/所有权错误**并记录错误日志或计数 +- `wake_rc < 0 && errno == EAGAIN` 视为**背压信号**,本轮放弃,下一轮 `harvest` 重试 +- 其他 `wake_rc < 0` 视为**用法/所有权错误**并记录错误日志或计数 ## 调用时机与可调间隔(busy/idle) diff --git a/src/bthread/butex.cpp b/src/bthread/butex.cpp index 46b09708be..0a102dc130 100644 --- 
a/src/bthread/butex.cpp +++ b/src/bthread/butex.cpp @@ -396,6 +396,14 @@ int butex_wake_to_task_group(void* arg, TaskGroup* target_group) { errno = EINVAL; return -1; } + // wake_within() runs on target_group's owner worker. For pinned waiters, + // if owner-local pinned runqueue is already full, report EAGAIN and + // keep waiter on butex list for next harvest retry, instead of + // blocking/spinning here. + if (is_pinned_waiter(bw) && target_group->pinned_rq_full()) { + errno = EAGAIN; + return -1; + } bw->RemoveFromList(); bw->container.store(NULL, butil::memory_order_relaxed); } diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index 807b254ab9..c82786e886 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -912,6 +912,10 @@ bool TaskGroup::is_locally_pinned_task(const TaskMeta* meta) { meta->local_pin_home_group != NULL; } +bool TaskGroup::pinned_rq_full() const { + return _pinned_rq.volatile_size() >= _pinned_rq.capacity(); +} + bool TaskGroup::route_to_pinned_home(TaskMeta* meta, bool nosignal) { if (!is_locally_pinned_task(meta)) { return false; diff --git a/src/bthread/task_group.h b/src/bthread/task_group.h index d6eca3d8c6..865704dbab 100644 --- a/src/bthread/task_group.h +++ b/src/bthread/task_group.h @@ -221,6 +221,10 @@ class TaskGroup { size_t rq_size() const { return _rq.volatile_size(); } + // Returns true when owner-local pinned runqueue is full. + // This is intended for non-blocking wake paths that must not spin + // inside active-task hook callbacks. 
+ bool pinned_rq_full() const; bthread_tag_t tag() const { return _tag; } diff --git a/src/bthread/unstable.h b/src/bthread/unstable.h index 89676bcce5..ebac28ef9c 100644 --- a/src/bthread/unstable.h +++ b/src/bthread/unstable.h @@ -157,6 +157,8 @@ extern int bthread_register_active_task_type( // - 1 waiter and it is a same-TaskControl/same-tag bthread waiter: return 1 // - otherwise (multiple waiters / pthread waiter / cross-tag / cross-control): // return -1 and set errno=EINVAL +// - owner-local pinned runqueue is full: return -1 and set errno=EAGAIN. +// Caller should retry in a later harvest round. // // Calling this API outside active-task harvest callbacks returns -1 and sets // errno=EPERM. diff --git a/test/bthread_active_task_unittest.cpp b/test/bthread_active_task_unittest.cpp index 48f0acc2b1..516cf1a671 100644 --- a/test/bthread_active_task_unittest.cpp +++ b/test/bthread_active_task_unittest.cpp @@ -43,6 +43,7 @@ namespace bthread { DECLARE_int32(bthread_active_task_poll_every_nswitch); DECLARE_int64(bthread_active_task_idle_wait_ns); } +DECLARE_int32(task_group_runqueue_capacity); namespace { @@ -57,6 +58,7 @@ enum TestMode { TEST_MODE_SCENARIO_REQ_WAKE = 7, TEST_MODE_SCENARIO_REQ_WAKE_BUSY_PERIODIC = 8, TEST_MODE_BUTEX_WAKE_WITHIN_STRICT_CROSS_WORKER_REJECT = 9, + TEST_MODE_BUTEX_WAKE_WITHIN_EAGAIN_WHEN_PINNED_RQ_FULL = 10, }; enum GenericWakeVariant { @@ -107,6 +109,8 @@ struct ActiveTaskTestState { , destroy_calls(0) , harvest_calls(0) , butex_ptr(0) + , butex_ptr_aux1(0) + , butex_ptr_aux2(0) , pending_req_ptr(0) , target_hook_worker_pthread(0) , butex_expected_waiters(0) @@ -114,6 +118,10 @@ struct ActiveTaskTestState { , butex_wake_completed(0) , butex_wake_rc(0) , butex_wake_errno(0) + , butex_wake_rc_aux1(0) + , butex_wake_errno_aux1(0) + , butex_wake_rc_aux2(0) + , butex_wake_errno_aux2(0) , hook_wake_harvest_calls(0) , hook_action_inflight(0) , butex_waiter_ready_count(0) @@ -131,6 +139,8 @@ struct ActiveTaskTestState { std::atomic 
destroy_calls; std::atomic harvest_calls; std::atomic butex_ptr; + std::atomic butex_ptr_aux1; + std::atomic butex_ptr_aux2; std::atomic pending_req_ptr; std::atomic target_hook_worker_pthread; std::atomic butex_expected_waiters; @@ -138,6 +148,10 @@ struct ActiveTaskTestState { std::atomic butex_wake_completed; std::atomic butex_wake_rc; std::atomic butex_wake_errno; + std::atomic butex_wake_rc_aux1; + std::atomic butex_wake_errno_aux1; + std::atomic butex_wake_rc_aux2; + std::atomic butex_wake_errno_aux2; std::atomic hook_wake_harvest_calls; std::atomic hook_action_inflight; std::atomic butex_waiter_ready_count; @@ -187,6 +201,8 @@ void ResetState() { g_state.destroy_calls.store(0, std::memory_order_relaxed); g_state.harvest_calls.store(0, std::memory_order_relaxed); g_state.butex_ptr.store(0, std::memory_order_relaxed); + g_state.butex_ptr_aux1.store(0, std::memory_order_relaxed); + g_state.butex_ptr_aux2.store(0, std::memory_order_relaxed); g_state.pending_req_ptr.store(0, std::memory_order_relaxed); g_state.target_hook_worker_pthread.store(0, std::memory_order_relaxed); g_state.butex_expected_waiters.store(0, std::memory_order_relaxed); @@ -194,6 +210,10 @@ void ResetState() { g_state.butex_wake_completed.store(0, std::memory_order_relaxed); g_state.butex_wake_rc.store(0, std::memory_order_relaxed); g_state.butex_wake_errno.store(0, std::memory_order_relaxed); + g_state.butex_wake_rc_aux1.store(0, std::memory_order_relaxed); + g_state.butex_wake_errno_aux1.store(0, std::memory_order_relaxed); + g_state.butex_wake_rc_aux2.store(0, std::memory_order_relaxed); + g_state.butex_wake_errno_aux2.store(0, std::memory_order_relaxed); g_state.hook_wake_harvest_calls.store(0, std::memory_order_relaxed); g_state.butex_waiter_ready_count.store(0, std::memory_order_relaxed); g_state.butex_waiter_done_count.store(0, std::memory_order_relaxed); @@ -233,6 +253,23 @@ bool WaitAtomicEqual(const std::atomic& value, int expected, int timeout_ms return 
value.load(std::memory_order_relaxed) == expected; } +bool WaitPinnedButexQueued(void* butex, int timeout_ms) { + for (int i = 0; i < timeout_ms; ++i) { + errno = 0; + const int rc = bthread::butex_wake(butex, true); + const int err = errno; + if (rc == -1 && err == EINVAL) { + return true; + } + if (rc == 0) { + usleep(1000); + continue; + } + return false; + } + return false; +} + void DrainHookActions() { ASSERT_TRUE(WaitAtomicEqual(g_state.hook_action_inflight, 0, 5000)); } @@ -414,7 +451,8 @@ bool MaybeRunWithinWakeFromHook(const bthread_active_task_ctx_t* ctx, mode != TEST_MODE_BUSY_PERIODIC_POLL_WAKE && mode != TEST_MODE_SCENARIO_REQ_WAKE && mode != TEST_MODE_SCENARIO_REQ_WAKE_BUSY_PERIODIC && - mode != TEST_MODE_BUTEX_WAKE_WITHIN_STRICT_CROSS_WORKER_REJECT) { + mode != TEST_MODE_BUTEX_WAKE_WITHIN_STRICT_CROSS_WORKER_REJECT && + mode != TEST_MODE_BUTEX_WAKE_WITHIN_EAGAIN_WHEN_PINNED_RQ_FULL) { return false; } @@ -430,6 +468,7 @@ bool MaybeRunWithinWakeFromHook(const bthread_active_task_ctx_t* ctx, if (mode == TEST_MODE_BUTEX_WAKE_WITHIN || mode == TEST_MODE_BUTEX_WAKE_WITHIN_STRICT_CROSS_WORKER_REJECT || + mode == TEST_MODE_BUTEX_WAKE_WITHIN_EAGAIN_WHEN_PINNED_RQ_FULL || mode == TEST_MODE_BUSY_PERIODIC_POLL_WAKE) { const int expected_waiters = g_state.butex_expected_waiters.load(std::memory_order_relaxed); @@ -461,6 +500,42 @@ bool MaybeRunWithinWakeFromHook(const bthread_active_task_ctx_t* ctx, void* butex = NULL; MockReqCtx* req = NULL; + if (mode == TEST_MODE_BUTEX_WAKE_WITHIN_EAGAIN_WHEN_PINNED_RQ_FULL) { + void* butex0 = reinterpret_cast( + g_state.butex_ptr.load(std::memory_order_relaxed)); + void* butex1 = reinterpret_cast( + g_state.butex_ptr_aux1.load(std::memory_order_relaxed)); + void* butex2 = reinterpret_cast( + g_state.butex_ptr_aux2.load(std::memory_order_relaxed)); + errno = 0; + const int rc0 = bthread_butex_wake_within(ctx, butex0); + const int err0 = errno; + errno = 0; + const int rc1 = bthread_butex_wake_within(ctx, butex1); + const int 
err1 = errno; + errno = 0; + const int rc2 = bthread_butex_wake_within(ctx, butex2); + const int err2 = errno; + + g_state.butex_wake_rc.store(rc0, std::memory_order_relaxed); + g_state.butex_wake_errno.store(err0, std::memory_order_relaxed); + g_state.butex_wake_rc_aux1.store(rc1, std::memory_order_relaxed); + g_state.butex_wake_errno_aux1.store(err1, std::memory_order_relaxed); + g_state.butex_wake_rc_aux2.store(rc2, std::memory_order_relaxed); + g_state.butex_wake_errno_aux2.store(err2, std::memory_order_relaxed); + + if (!(rc0 == 1 && rc1 == 1 && rc2 == -1 && err2 == EAGAIN)) { + // setup race: retry in next harvest round. + g_state.butex_wake_started.store(0, std::memory_order_relaxed); + return true; + } + g_state.butex_wake_completed.fetch_add(1, std::memory_order_relaxed); + if (skip_park_out) { + *skip_park_out = true; + } + return true; + } + if (is_scenario_req_wake) { req = reinterpret_cast( g_state.pending_req_ptr.load(std::memory_order_acquire)); @@ -618,6 +693,128 @@ int ChildCheckLocalWorkerInitDestroyAndIdleWaitInterval() { return (g_state.destroy_calls.load(std::memory_order_relaxed) == 2 ? 
0 : 24); } +int ChildCheckWakeWithinEagainWhenPinnedRqFull() { + if (g_register_rc.load(std::memory_order_relaxed) != 0) { + return 30; + } + const int32_t old_runqueue_capacity = FLAGS_task_group_runqueue_capacity; + FLAGS_task_group_runqueue_capacity = 2; + + int ret = 0; + PrepareForCase(); + { + bthread::TaskControl tc; + if (tc.init(1) != 0) { + ret = 31; + } else if (!WaitAtomicAtLeast(g_state.init_calls, 1, 5000)) { + ret = 32; + } else { + bthread::TaskGroup* tg = tc.choose_one_group(BTHREAD_TAG_DEFAULT); + if (tg == NULL) { + ret = 33; + } else { + PinnedWaitCtx ctx[3]; + bthread_t tids[3] = { + INVALID_BTHREAD, INVALID_BTHREAD, INVALID_BTHREAD }; + void* butexes[3] = { NULL, NULL, NULL }; + for (int i = 0; i < 3; ++i) { + butexes[i] = bthread::butex_create(); + if (butexes[i] == NULL) { + ret = 34; + break; + } + static_cast*>(butexes[i])->store( + 0, butil::memory_order_relaxed); + ctx[i].butex = butexes[i]; + if (tg->start_background(&tids[i], NULL, + TestPinnedWaitTask, &ctx[i]) != 0) { + ret = 35; + break; + } + tg->flush_nosignal_tasks(); + if (!WaitAtomicAtLeast(ctx[i].ready, 1, 5000)) { + ret = 36; + break; + } + if (!WaitPinnedButexQueued(butexes[i], 5000)) { + ret = 37; + break; + } + } + + if (ret == 0) { + const uint64_t home = ctx[0].pinned_worker_pthread.load( + std::memory_order_relaxed); + if (home == 0) { + ret = 38; + } else { + g_state.butex_ptr.store(reinterpret_cast(butexes[0]), + std::memory_order_relaxed); + g_state.butex_ptr_aux1.store(reinterpret_cast(butexes[1]), + std::memory_order_relaxed); + g_state.butex_ptr_aux2.store(reinterpret_cast(butexes[2]), + std::memory_order_relaxed); + g_state.butex_expected_waiters.store(0, std::memory_order_relaxed); + g_state.target_hook_worker_pthread.store(home, + std::memory_order_relaxed); + g_state.mode.store( + TEST_MODE_BUTEX_WAKE_WITHIN_EAGAIN_WHEN_PINNED_RQ_FULL, + std::memory_order_release); + tc.signal_task(1, BTHREAD_TAG_DEFAULT); + if (!WaitAtomicAtLeast(g_state.butex_wake_completed, 
1, 5000)) { + ret = 39; + } else if (g_state.butex_wake_rc.load(std::memory_order_relaxed) != 1 || + g_state.butex_wake_rc_aux1.load(std::memory_order_relaxed) != 1 || + g_state.butex_wake_rc_aux2.load(std::memory_order_relaxed) != -1 || + g_state.butex_wake_errno_aux2.load(std::memory_order_relaxed) != EAGAIN) { + ret = 40; + } else { + g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + QuiesceHookActionsAfterModeIdle(); + g_state.butex_wake_started.store(0, std::memory_order_relaxed); + g_state.butex_wake_completed.store(0, std::memory_order_relaxed); + g_state.butex_wake_rc.store(0, std::memory_order_relaxed); + g_state.butex_wake_errno.store(0, std::memory_order_relaxed); + g_state.target_hook_worker_pthread.store(home, + std::memory_order_relaxed); + g_state.butex_ptr.store( + reinterpret_cast(butexes[2]), + std::memory_order_relaxed); + g_state.mode.store(TEST_MODE_BUTEX_WAKE_WITHIN, + std::memory_order_release); + tc.signal_task(1, BTHREAD_TAG_DEFAULT); + if (!WaitAtomicAtLeast(g_state.butex_wake_completed, 1, 5000)) { + ret = 41; + } else if (g_state.butex_wake_rc.load(std::memory_order_relaxed) != 1) { + ret = 42; + } + } + } + } + + g_state.mode.store(TEST_MODE_IDLE, std::memory_order_release); + g_state.target_hook_worker_pthread.store(0, std::memory_order_relaxed); + QuiesceHookActionsAfterModeIdle(); + for (int i = 0; i < 3; ++i) { + if (tids[i] != INVALID_BTHREAD) { + if (bthread_join(tids[i], NULL) != 0 && ret == 0) { + ret = 43; + } + } + if (butexes[i] != NULL) { + bthread::butex_destroy(butexes[i]); + } + } + g_state.butex_ptr.store(0, std::memory_order_relaxed); + g_state.butex_ptr_aux1.store(0, std::memory_order_relaxed); + g_state.butex_ptr_aux2.store(0, std::memory_order_relaxed); + } + } + } + FLAGS_task_group_runqueue_capacity = old_runqueue_capacity; + return ret; +} + int RunChildMode(const char* mode) { pid_t pid = fork(); if (pid < 0) { @@ -1426,6 +1623,9 @@ class ActiveTaskTestEnvironment : public ::testing::Environment { 
if (strcmp(child_mode, "local_worker_init_destroy_and_idle_wait_interval") == 0) { _exit(ChildCheckLocalWorkerInitDestroyAndIdleWaitInterval()); } + if (strcmp(child_mode, "wake_within_eagain_when_pinned_rq_full") == 0) { + _exit(ChildCheckWakeWithinEagainWhenPinnedRqFull()); + } _exit(100); } } @@ -1468,6 +1668,10 @@ TEST(BthreadActiveTaskTest, butex_wake_within_no_waiter_returns_zero_in_hook) { bthread::butex_destroy(butex); } +TEST(BthreadActiveTaskTest, butex_wake_within_returns_eagain_when_pinned_rq_full) { + ASSERT_EQ(0, RunChildMode("wake_within_eagain_when_pinned_rq_full")); +} + TEST(BthreadActiveTaskTest, butex_wake_within_multiple_waiters_rejected) { RunButexWakeWithinCase(2, -1, EINVAL); } From e0ee64dd04de6c85d47f21d477ae5d89549d85f3 Mon Sep 17 00:00:00 2001 From: MalikHou Date: Mon, 2 Mar 2026 01:08:33 +0800 Subject: [PATCH 6/8] fix comment --- docs/cn/bthread_active_task.md | 71 ++++++++++++++++++++++++--- src/bthread/bthread.cpp | 1 + src/bthread/butex.cpp | 36 +++++++++++++- test/bthread_active_task_unittest.cpp | 42 ++++++++++++++-- 4 files changed, 137 insertions(+), 13 deletions(-) diff --git a/docs/cn/bthread_active_task.md b/docs/cn/bthread_active_task.md index ea99099403..3c6a367a75 100644 --- a/docs/cn/bthread_active_task.md +++ b/docs/cn/bthread_active_task.md @@ -14,9 +14,9 @@ 1. worker 初始化时创建本地 reactor/ring。 2. 提交异步 IO 后,在私有 `butex` 上通过 `bthread_butex_wait_local` 挂起。 -4. worker 的 active-task hook 收割 completion。 -5. hook 内调用 `bthread_butex_wake_within(ctx, req->butex)` 唤醒 waiter。 -6. waiter bthread 在同一个 worker 上恢复执行(不会被 steal)。 +3. worker 的 active-task hook 收割 completion。 +4. hook 内调用 `bthread_butex_wake_within(ctx, req->butex)` 唤醒 waiter。 +5. 
waiter bthread 在同一个 worker 上恢复执行(不会被 steal)。 ## 当前提供的接口(UNSTABLE) @@ -302,11 +302,68 @@ strict 模式下,普通 `butex_wake*` 命中 pinned waiter 会返回 `-1/EINVA - 检查是否对 `bthread_butex_wait_local` 的 waiter 误用了普通 `butex_wake*` - 检查 completion 是否被错误 worker 的 `harvest` 收割(ownership/routing 问题) -可观测计数(累计 bvar): +## 监控与排障(建议上线前梳理) + +### 监控入口 + +- `/vars`:查看单个或批量 bvar + - 例如:`/vars/bthread_butex_within*` +- `/brpc_metrics`:Prometheus 抓取入口 + - 例如抓取后查询 `bthread_butex_within_success_count` + +### 可观测计数(累计 bvar) + +以下指标均为**累计计数器**(进程内单调递增,进程重启后归零): + +- `bthread_butex_within_success_count` + - `bthread_butex_wake_within` 成功唤醒并入队 waiter(返回 `1`)次数 +- `bthread_butex_within_pinned_success_count` + - `within_success_count` 的 pinned waiter 子集(同样返回 `1`) +- `bthread_butex_within_eagain_count` + - 因 pinned local runqueue 已满而返回 `-1/EAGAIN` 次数(应在后续 `harvest` 重试) +- `bthread_butex_within_no_waiter_count` + - `bthread_butex_wake_within` 返回 `0`(当前无 waiter)次数 +- `bthread_butex_within_invalid_count` + - `bthread_butex_wake_within` 返回 `-1/EINVAL` 次数(多 waiter / pthread waiter / 跨 tag / 跨 `TaskControl` / wrong-worker invariant 等) +- `bthread_butex_strict_reject_count` + - 普通 `butex_wake*` 命中 pinned waiter 被 strict 拒绝次数 + +### 判读建议(推荐看速率和比例) + +建议在监控系统里看 `rate()/increase()`,不要直接按绝对值判定健康度。 + +- `within_success_qps = rate(bthread_butex_within_success_count[1m])` + - 反映 `within` 主路径吞吐;长期为 0 通常说明未走该路径或流量不足 +- `within_eagain_qps = rate(bthread_butex_within_eagain_count[1m])` + - 反映 pinned 队列背压;持续升高说明局部拥塞在加重 +- `within_eagain_ratio = rate(bthread_butex_within_eagain_count[5m]) / clamp_min(rate(bthread_butex_within_success_count[5m]), 1)` + - 建议重点观察;上升代表“每次成功唤醒对应的重试压力”在变大 +- `within_pinned_share = rate(bthread_butex_within_pinned_success_count[5m]) / clamp_min(rate(bthread_butex_within_success_count[5m]), 1)` + - 反映 pinned 路径占比;在 per-worker 本地 reactor 场景通常应较高 +- `within_invalid_qps = rate(bthread_butex_within_invalid_count[5m])` + - 该值应接近 0;升高优先排查接口使用与 ownership/routing + +### 告警起点(可按业务再收敛) + +- `within_invalid_qps 
> 0` 持续一段时间(例如 5~10 分钟)可作为高优先级告警 +- `within_eagain_ratio` 持续抬升可作为容量/背压预警(结合延迟和业务重试率判断) +- `within_success_qps` 在有流量时异常跌到接近 0,可作为路径失效告警信号 + +### Prometheus 查询示例 + +```promql +sum(rate(bthread_butex_within_success_count[1m])) +sum(rate(bthread_butex_within_eagain_count[1m])) +sum(rate(bthread_butex_within_eagain_count[5m])) / clamp_min(sum(rate(bthread_butex_within_success_count[5m])), 1) +sum(rate(bthread_butex_within_pinned_success_count[5m])) / clamp_min(sum(rate(bthread_butex_within_success_count[5m])), 1) +sum(rate(bthread_butex_within_invalid_count[5m])) +``` + +### 基本关系校验(用于自检) -- `bthread_butex_strict_reject_count`:普通 `butex_wake*` 命中 pinned waiter 被 strict 拒绝次数 -- `bthread_butex_within_no_waiter_count`:`bthread_butex_wake_within` 返回 `0`(无 waiter)次数 -- `bthread_butex_within_invalid_count`:`bthread_butex_wake_within` 返回 `-1/EINVAL` 次数 +- `bthread_butex_within_pinned_success_count <= bthread_butex_within_success_count` +- `eagain` 上升时通常会看到业务重试路径变活跃(同一请求在后续 `harvest` 成功唤醒) +- `within_no_waiter_count` 小幅增长通常是 timeout/取消竞争下的正常现象,不应单独作为故障依据 ## 注意事项(务必遵守) diff --git a/src/bthread/bthread.cpp b/src/bthread/bthread.cpp index 44f206e0e3..4f7af3bfb4 100644 --- a/src/bthread/bthread.cpp +++ b/src/bthread/bthread.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "butil/macros.h" // BAIDU_CASSERT #include "butil/logging.h" diff --git a/src/bthread/butex.cpp b/src/bthread/butex.cpp index 0a102dc130..a803ab0c9d 100644 --- a/src/bthread/butex.cpp +++ b/src/bthread/butex.cpp @@ -84,6 +84,21 @@ struct ButexWithinInvalidCount : public bvar::Adder { : bvar::Adder("bthread_butex_within_invalid_count") {} }; +struct ButexWithinSuccessCount : public bvar::Adder { + ButexWithinSuccessCount() + : bvar::Adder("bthread_butex_within_success_count") {} +}; + +struct ButexWithinEagainCount : public bvar::Adder { + ButexWithinEagainCount() + : bvar::Adder("bthread_butex_within_eagain_count") {} +}; + +struct ButexWithinPinnedSuccessCount : public bvar::Adder { + 
ButexWithinPinnedSuccessCount() + : bvar::Adder<int64_t>("bthread_butex_within_pinned_success_count") {} +}; + inline bvar::Adder<int64_t>& butex_strict_reject_count() { return *butil::get_leaky_singleton<ButexStrictRejectCount>(); } @@ -96,6 +111,18 @@ inline bvar::Adder<int64_t>& butex_within_invalid_count() { return *butil::get_leaky_singleton<ButexWithinInvalidCount>(); } +inline bvar::Adder<int64_t>& butex_within_success_count() { + return *butil::get_leaky_singleton<ButexWithinSuccessCount>(); +} + +inline bvar::Adder<int64_t>& butex_within_eagain_count() { + return *butil::get_leaky_singleton<ButexWithinEagainCount>(); +} + +inline bvar::Adder<int64_t>& butex_within_pinned_success_count() { + return *butil::get_leaky_singleton<ButexWithinPinnedSuccessCount>(); +} + enum WaiterState { WAITER_STATE_NONE, WAITER_STATE_READY, @@ -370,6 +397,7 @@ int butex_wake_to_task_group(void* arg, TaskGroup* target_group) { } Butex* b = container_of(static_cast<butil::atomic<int>*>(arg), Butex, value); ButexBthreadWaiter* bbw = NULL; + bool pinned_waiter = false; { BAIDU_SCOPED_LOCK(b->waiter_lock); if (b->waiters.empty()) { @@ -396,11 +424,13 @@ int butex_wake_to_task_group(void* arg, TaskGroup* target_group) { errno = EINVAL; return -1; } + pinned_waiter = is_pinned_waiter(bw); // wake_within() runs on target_group's owner worker. For pinned waiters, // if owner-local pinned runqueue is already full, report EAGAIN and // keep waiter on butex list for next harvest retry, instead of // blocking/spinning here.
- if (is_pinned_waiter(bw) && target_group->pinned_rq_full()) { + if (pinned_waiter && target_group->pinned_rq_full()) { + butex_within_eagain_count() << 1; errno = EAGAIN; return -1; } @@ -410,6 +440,10 @@ unsleep_if_necessary(bbw, get_global_timer_thread()); target_group->ready_to_run(bbw->task_meta, true); + butex_within_success_count() << 1; + if (pinned_waiter) { + butex_within_pinned_success_count() << 1; + } return 1; } diff --git a/test/bthread_active_task_unittest.cpp b/test/bthread_active_task_unittest.cpp index 516cf1a671..6cc07947c3 100644 --- a/test/bthread_active_task_unittest.cpp +++ b/test/bthread_active_task_unittest.cpp @@ -32,6 +32,7 @@ #include #include "butil/atomicops.h" +#include "butil/process_util.h" #include "butil/time.h" #include "bthread/bthread.h" #include "bthread/butex.h" @@ -815,6 +816,30 @@ int ChildCheckWakeWithinEagainWhenPinnedRqFull() { return ret; } +bool GetSelfExecutablePath(char* buf, size_t len) { + if (buf == NULL || len == 0) { + return false; + } + const ssize_t n = butil::GetProcessAbsolutePath(buf, len); + if (n <= 0 || static_cast<size_t>(n) >= len) { + return false; + } + buf[n] = '\0'; + return true; +} + +bool GetArgv0Fallback(char* buf, size_t len) { + if (buf == NULL || len == 0) { + return false; + } + const ssize_t n = butil::ReadCommandLine(buf, len, false); + if (n <= 0 || static_cast<size_t>(n) >= len) { + return false; + } + buf[n] = '\0'; + return true; +} + int RunChildMode(const char* mode) { pid_t pid = fork(); if (pid < 0) { @@ -822,14 +847,21 @@ int RunChildMode(const char* mode) { } if (pid == 0) { char self_path[PATH_MAX]; - const ssize_t n = readlink("/proc/self/exe", self_path, sizeof(self_path) - 1); - if (n <= 0) { + char argv0[PATH_MAX]; + const bool have_self_path = GetSelfExecutablePath(self_path, sizeof(self_path)); + const bool have_argv0 = GetArgv0Fallback(argv0, sizeof(argv0)); + if (!have_self_path && !have_argv0) { _exit(4); } -
self_path[n] = '\0'; setenv("BRPC_ACTIVE_TASK_UT_CHILD_MODE", mode, 1); - char* const argv[] = { self_path, NULL }; - execv(self_path, argv); + if (have_self_path) { + char* const argv[] = { self_path, NULL }; + execv(self_path, argv); + } + if (have_argv0) { + char* const argv[] = { argv0, NULL }; + execvp(argv0, argv); + } _exit(5); } int status = 0; From 50d604b3a2d9bb448b06775568aa7b4f7fe9b5c6 Mon Sep 17 00:00:00 2001 From: MalikHou Date: Tue, 3 Mar 2026 21:59:25 +0800 Subject: [PATCH 7/8] fix doc --- docs/cn/threading_overview.md | 25 +++++++++++++++++++++++++ docs/en/threading_overview.md | 25 +++++++++++++++++++++++++ src/bthread/butex.h | 15 +++++++++++++++ 3 files changed, 65 insertions(+) diff --git a/docs/cn/threading_overview.md b/docs/cn/threading_overview.md index 8d494ddec2..e084730ca5 100644 --- a/docs/cn/threading_overview.md +++ b/docs/cn/threading_overview.md @@ -43,3 +43,28 @@ 异步编程中的流程控制对于专家也充满了陷阱。任何挂起操作,如sleep一会儿或等待某事完成,都意味着用户需要显式地保存状态,并在回调函数中恢复状态。异步代码往往得写成状态机的形式。当挂起较少时,这有点麻烦,但还是可把握的。问题在于一旦挂起发生在条件判断、循环、子函数中,写出这样的状态机并能被很多人理解和维护,几乎是不可能的,而这在分布式系统中又很常见,因为一个节点往往要与多个节点同时交互。另外如果唤醒可由多种事件触发(比如fd有数据或超时了),挂起和恢复的过程容易出现race condition,对多线程编码能力要求很高。语法糖(比如lambda)可以让编码不那么“麻烦”,但无法降低难度。 共享指针在异步编程中很普遍,这看似方便,但也使内存的ownership变得难以捉摸,如果内存泄漏了,很难定位哪里没有释放;如果segment fault了,也不知道哪里多释放了一下。大量使用引用计数的用户代码很难控制代码质量,容易长期在内存问题上耗费时间。如果引用计数还需要手动维护,保持质量就更难了,维护者也不会愿意改进。没有上下文会使得[RAII](http://en.wikipedia.org/wiki/Resource_Acquisition_Is_Initialization)无法充分发挥作用, 有时需要在callback之外lock,callback之内unlock,实践中很容易出错。 + +## butex wait/wake 顺序规则(实用) + +直接使用 `butex_wait`/`butex_wake*` 时,务必遵守: + +1. 唤醒方先写结果/状态,再调用 `butex_wake*`。 +2. 
等待方在每次 `butex_wait` 返回后都要重检谓词条件。 + +`butex_wait` 返回 `0` 只表示“从 butex 等待队列被唤醒”,不代表“业务条件已经满足”。 + +常见写法: + +```cpp +// 唤醒方 +state.store(new_value, butil::memory_order_release); +bthread::butex_wake(&state); + +// 等待方 +while (state.load(butil::memory_order_acquire) == expected_value) { + if (bthread::butex_wait(&state, expected_value, NULL) < 0 && + errno != EWOULDBLOCK && errno != EINTR) { + // 处理超时/中断/停止等错误 + } +} +``` diff --git a/docs/en/threading_overview.md b/docs/en/threading_overview.md index d67d14d9d6..656f629b51 100644 --- a/docs/en/threading_overview.md +++ b/docs/en/threading_overview.md @@ -43,3 +43,28 @@ When an event dispatcher passes a task to a worker thread, the user code probabl Flow controls in asynchronous programming are even difficult for experts. Any suspending operation such as sleeping for a while or waiting for something to finish, implies that users have to save states explicitly and restore states in callbacks. Asynchronous code is often written as state machines. A few suspensions are troublesome, but still handleable. The problem is that once the suspension occurs inside a condition, loop or sub-function, it's almost impossible to write such a state machine being understood and maintained by many people, although the scenario is quite common in distributed systems where a node often needs to interact with multiple nodes simultaneously. In addition, if the wakeup can be triggered by more than one events (such as either fd has data or timeout is reached), the suspension and resuming are prone to race conditions, which require good multi-threaded programming skills to solve. Syntactic sugars(such as lambda) just make coding less troublesome rather than reducing difficulty. Shared pointers are common in asynchronous programming, which seems convenient, but also makes ownerships of memory elusive. 
If the memory is leaked, it's difficult to locate the code that forgot to release; if segment fault happens, where the double-free occurs is also unknown. Code with a lot of referential countings is hard to remain good-quality and may waste a lot of time on debugging memory related issues. If references are even counted manually, keeping quality of the code is harder and the maintainers are less willing to modify the code. [RAII](http://en.wikipedia.org/wiki/Resource_Acquisition_Is_Initialization) cannot be used in many scenarios in asynchronous programming, sometimes resources need to be locked before a callback and unlocked inside the callback, which is very error-prone in practice. + +## Butex wait/wake ordering (practical rule) + +When using `butex_wait`/`butex_wake*` directly, follow this rule strictly: + +1. Waker writes result/state first, then calls `butex_wake*`. +2. Waiter always re-checks predicate after every `butex_wait` return. + +`butex_wait` returning `0` only means "woken from butex queue", not "predicate is true". + +Typical pattern: + +```cpp +// waker +state.store(new_value, butil::memory_order_release); +bthread::butex_wake(&state); + +// waiter +while (state.load(butil::memory_order_acquire) == expected_value) { + if (bthread::butex_wait(&state, expected_value, NULL) < 0 && + errno != EWOULDBLOCK && errno != EINTR) { + // handle timeout/stop/etc. + } +} +``` diff --git a/src/bthread/butex.h b/src/bthread/butex.h index ef2e4eed6b..b71e52f865 100644 --- a/src/bthread/butex.h +++ b/src/bthread/butex.h @@ -51,6 +51,18 @@ template T* butex_create_checked() { // Destroy the butex. void butex_destroy(void* butex); +// Usage contract for butex wait/wake on the same state variable: +// 1) The waker must publish state transition first, then call butex_wake*(). +// Typical pattern: store(..., memory_order_release) -> butex_wake*(). +// 2) A successful wakeup does NOT mean the user predicate is true. 
+// Waiters must re-check predicate after every butex_wait() return. +// +// Example: +// while (state.load(memory_order_acquire) == expected) { +// if (butex_wait(&state, expected, NULL) < 0 && errno != EWOULDBLOCK && +// errno != EINTR) { ... } +// } +// // Wake up at most 1 thread waiting on |butex|. // Returns # of threads woken up. // Returns -1 and sets errno=EINVAL when the selected waiter is in @@ -91,6 +103,9 @@ int butex_requeue(void* butex1, void* butex2); // Atomically wait on |butex| if *butex equals |expected_value|, until the // butex is woken up by butex_wake*, or CLOCK_REALTIME reached |abstime| if // abstime is not NULL. +// IMPORTANT: +// Returning 0 only means the waiter was woken from butex queue. +// It does NOT imply user predicate is satisfied. // About |abstime|: // Different from FUTEX_WAIT, butex_wait uses absolute time. // About |prepend|: From ca6e844467ca77780fed9f114b4e82880b43c1db Mon Sep 17 00:00:00 2001 From: MalikHou Date: Tue, 3 Mar 2026 22:07:52 +0800 Subject: [PATCH 8/8] fix --- docs/cn/bthread_active_task.md | 15 +++++++++++---- src/bthread/unstable.h | 10 ++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/docs/cn/bthread_active_task.md b/docs/cn/bthread_active_task.md index 3c6a367a75..d7c3ffc9c9 100644 --- a/docs/cn/bthread_active_task.md +++ b/docs/cn/bthread_active_task.md @@ -152,9 +152,14 @@ SubmitAsyncIo(&req); // 挂起当前 bthread,等待 active-task hook 唤醒。 // bthread_butex_wait_local 内部会对本次 wait 启用 wait-scope 本地化 pin, // 保证恢复在同一个 worker 上,且恢复前不会被 steal。 -int rc = bthread_butex_wait_local(req.done_butex, 0, NULL); -if (rc != 0 && errno != EWOULDBLOCK) { - // 处理错误/中断/超时(如果设置了超时) +// 注意:返回 0 只表示“被唤醒”,不代表条件一定满足;要按谓词循环等待。 +while (static_cast*>(req.done_butex) + ->load(butil::memory_order_acquire) == 0) { + const int rc = bthread_butex_wait_local(req.done_butex, 0, NULL); + if (rc != 0 && errno != EWOULDBLOCK && errno != EINTR) { + // 处理错误/中断/超时(如果设置了超时) + break; + } } // 被唤醒后继续执行(返回点在同一个 worker 上) @@ -167,7 
+172,7 @@ bthread::butex_destroy(req.done_butex); 核心点: - 在 hook 中通过 completion 找回 `ReqCtx*` -- 写结果 +- 先写结果/状态(包括 butex 状态位),再 wake - 调 `bthread_butex_wake_within(ctx, req->done_butex)`,显式本地唤醒 ```cpp @@ -187,6 +192,8 @@ static bool HarvestCompletions( // “做点什么事”：写 completion 结果 req->result = 123; // 示例 + static_cast<butil::atomic<int>*>(req->done_butex) + ->store(1, butil::memory_order_release); req->done.store(1, std::memory_order_release); errno = 0; diff --git a/src/bthread/unstable.h b/src/bthread/unstable.h index ebac28ef9c..dd5701fafb 100644 --- a/src/bthread/unstable.h +++ b/src/bthread/unstable.h @@ -160,6 +160,12 @@ extern int bthread_register_active_task_type( // - owner-local pinned runqueue is full: return -1 and set errno=EAGAIN. // Caller should retry in a later harvest round. // +// Ordering rule: +// publish result/state first, then call bthread_butex_wake_within(). +// Typical pattern: +// ((butil::atomic<int>*)butex)->store(new_value, butil::memory_order_release); +// bthread_butex_wake_within(ctx, butex); +// // Calling this API outside active-task harvest callbacks returns -1 and sets // errno=EPERM. extern int bthread_butex_wake_within(const bthread_active_task_ctx_t* ctx, @@ -175,6 +181,10 @@ extern int bthread_butex_wake_within(const bthread_active_task_ctx_t* ctx, // bthread_butex_wake_within(). Generic butex_wake* APIs do not provide a // fallback path for pinned waiters and may fail with EINVAL. // +// IMPORTANT: +// returning 0 only means the waiter was woken from butex queue. +// Caller still needs to re-check user predicate after each return. +// // Returns 0 on success, -1 otherwise and errno is set. // - EPERM: not running inside a normal bthread worker task extern int bthread_butex_wait_local(void* butex, int expected_value,