From a23acdc022512c218d7e8959588488bb33347b4a Mon Sep 17 00:00:00 2001 From: Coldwings Date: Thu, 26 Mar 2026 14:19:24 +0800 Subject: [PATCH] Huge change for vthread-stack --- examples/async_file_io.cpp | 12 +- examples/autoscaler_example.cpp | 4 +- examples/benchmark.cpp | 90 +-- examples/debug_test.cpp | 42 +- examples/dynamic_threads.cpp | 3 +- examples/http_server.cpp | 4 +- examples/io_benchmark.cpp | 5 +- examples/microbench.cpp | 42 +- examples/parallel_tasks.cpp | 3 +- examples/quick_benchmark.cpp | 9 +- examples/rpc_client_example.cpp | 6 +- examples/rpc_server_example.cpp | 6 +- examples/scalability_test.cpp | 3 +- examples/signal_handling.cpp | 9 +- examples/sse_server.cpp | 7 +- examples/tcp_echo_client.cpp | 3 +- examples/tcp_echo_server.cpp | 11 +- examples/thread_affinity.cpp | 6 +- examples/uds_echo_client.cpp | 3 +- examples/uds_echo_server.cpp | 11 +- examples/websocket_server.cpp | 2 +- include/elio/coro/frame_allocator.hpp | 329 ----------- include/elio/coro/promise_base.hpp | 71 ++- include/elio/coro/task.hpp | 208 +++---- include/elio/coro/task_handle.hpp | 119 ++-- include/elio/coro/vthread_stack.hpp | 171 ++++++ include/elio/elio.hpp | 4 + include/elio/http/http_server.hpp | 10 +- include/elio/http/websocket_server.hpp | 10 +- include/elio/net/resolve.hpp | 197 +++---- include/elio/rpc/rpc_client.hpp | 64 +-- include/elio/rpc/rpc_server.hpp | 6 +- include/elio/runtime/async_main.hpp | 90 ++- include/elio/runtime/blocking_pool.hpp | 89 +++ include/elio/runtime/scheduler.hpp | 148 ++++- include/elio/runtime/serve.hpp | 50 +- include/elio/runtime/spawn.hpp | 95 +++ include/elio/runtime/spawn_blocking.hpp | 107 ++++ include/elio/sync/primitives.hpp | 26 +- tests/CMakeLists.txt | 1 + tests/integration/test_dynamic_threads.cpp | 39 +- .../test_exception_propagation.cpp | 24 +- tests/integration/test_parallel_tasks.cpp | 24 +- .../test_scheduler_integration.cpp | 18 +- tests/unit/test_affinity.cpp | 54 +- tests/unit/test_awaitable_base.cpp | 30 +- 
tests/unit/test_frame_allocator.cpp | 93 --- tests/unit/test_io.cpp | 132 ++--- tests/unit/test_scheduler.cpp | 26 +- tests/unit/test_signalfd.cpp | 16 +- tests/unit/test_sync.cpp | 109 ++-- tests/unit/test_task.cpp | 168 +++--- tests/unit/test_timer.cpp | 36 +- tests/unit/test_vthread_stack.cpp | 543 ++++++++++++++++++ 54 files changed, 2063 insertions(+), 1325 deletions(-) delete mode 100644 include/elio/coro/frame_allocator.hpp create mode 100644 include/elio/coro/vthread_stack.hpp create mode 100644 include/elio/runtime/blocking_pool.hpp create mode 100644 include/elio/runtime/spawn.hpp create mode 100644 include/elio/runtime/spawn_blocking.hpp delete mode 100644 tests/unit/test_frame_allocator.cpp create mode 100644 tests/unit/test_vthread_stack.cpp diff --git a/examples/async_file_io.cpp b/examples/async_file_io.cpp index e797300..9efd73b 100644 --- a/examples/async_file_io.cpp +++ b/examples/async_file_io.cpp @@ -270,8 +270,10 @@ int main(int argc, char* argv[]) { done = true; }; + coro::detail::heap_alloc_guard guard; auto t = run(); - sched.spawn(t.release()); + auto handle = coro::detail::task_access::release(t); + sched.spawn(handle); } else if (mode == "--read") { std::vector files; for (int i = 2; i < argc; ++i) { @@ -283,8 +285,10 @@ int main(int argc, char* argv[]) { done = true; }; + coro::detail::heap_alloc_guard guard; auto t = run(); - sched.spawn(t.release()); + auto handle = coro::detail::task_access::release(t); + sched.spawn(handle); } else if (argc >= 3) { // File copy mode std::string src = argv[1]; @@ -296,8 +300,10 @@ int main(int argc, char* argv[]) { done = true; }; + coro::detail::heap_alloc_guard guard; auto t = run(); - sched.spawn(t.release()); + auto handle = coro::detail::task_access::release(t); + sched.spawn(handle); } else { std::cerr << "Invalid arguments" << std::endl; return 1; diff --git a/examples/autoscaler_example.cpp b/examples/autoscaler_example.cpp index 26208cb..0e3e0a9 100644 --- a/examples/autoscaler_example.cpp 
+++ b/examples/autoscaler_example.cpp @@ -51,7 +51,7 @@ int main() { // Submit heavy workload for (int i = 0; i < 2000; ++i) { - sched.spawn(workload_task(completed).release()); + sched.go([&completed]() { return workload_task(completed); }); } std::cout << "Phase 1: High load - expecting scale-up..." << std::endl; @@ -80,7 +80,7 @@ int main() { // Submit even heavier workload for (int i = 0; i < 3000; ++i) { - sched.spawn(workload_task(completed2).release()); + sched.go([&completed2]() { return workload_task(completed2); }); } std::cout << "Phase 2: Higher load - expecting more scale-up..." << std::endl; diff --git a/examples/benchmark.cpp b/examples/benchmark.cpp index 323fc64..687b731 100644 --- a/examples/benchmark.cpp +++ b/examples/benchmark.cpp @@ -70,16 +70,22 @@ void benchmark_spawn_overhead() { while (duration_cast(high_resolution_clock::now() - bench_start) < MIN_BENCH_DURATION) { runtime::scheduler sched(4); sched.start(); - + + std::atomic completed(0); + auto batch_start = high_resolution_clock::now(); - + + auto taskdef = [&completed]() -> coro::task { + completed.fetch_add(1, std::memory_order_release); + co_return; + }; + for (int i = 0; i < batch_size; ++i) { - auto t = empty_task(); - sched.spawn(t.release()); + sched.go(taskdef); } - + // Wait for all to complete - while (sched.pending_tasks() > 0) { + while (completed.load(std::memory_order_acquire) < batch_size) { std::this_thread::sleep_for(microseconds(1)); } @@ -127,24 +133,23 @@ void benchmark_context_switch() { runtime::scheduler sched(4); sched.start(); - std::atomic completed{0}; - - auto task_with_await = [&]() -> coro::task { - for (int i = 0; i < awaits_per_task; ++i) { - int value = co_await compute_task(i); + std::atomic completed(0); + + auto taskdef = [&completed]() -> coro::task { + for (int j = 0; j < awaits_per_task; ++j) { + int value = co_await compute_task(j); (void)value; } completed.fetch_add(1, std::memory_order_relaxed); co_return; }; - + auto batch_start = 
high_resolution_clock::now(); - + for (int i = 0; i < batch_size; ++i) { - auto t = task_with_await(); - sched.spawn(t.release()); + sched.go(taskdef); } - + while (completed.load(std::memory_order_relaxed) < batch_size) { std::this_thread::sleep_for(microseconds(1)); } @@ -199,12 +204,11 @@ void benchmark_yield() { runtime::scheduler sched(1); // Single worker thread sched.start(); - std::atomic completed{0}; - std::atomic end_time_ns{0}; // Last task records end timestamp - - // Each vthread yields multiple times - auto yield_task = [&]() -> coro::task { - for (int i = 0; i < yields_per_vthread; ++i) { + std::atomic completed(0); + std::atomic end_time_ns(0); // Last task records end timestamp + + auto taskdef = [&completed, &end_time_ns, num_vthreads]() -> coro::task { + for (int j = 0; j < yields_per_vthread; ++j) { co_await time::yield(); } // Last task to complete records the end timestamp @@ -215,17 +219,17 @@ void benchmark_yield() { } co_return; }; - + + // Capture start time in main thread auto start_time_ns = duration_cast( steady_clock::now().time_since_epoch()).count(); - + // Spawn all vthreads for (int i = 0; i < num_vthreads; ++i) { - auto t = yield_task(); - sched.spawn(t.release()); + sched.go(taskdef); } - + // Wait for end_time_ns to be set (spin-wait for accuracy) while (end_time_ns.load(std::memory_order_acquire) == 0) { // Spin without yielding for accurate measurement @@ -272,32 +276,31 @@ void benchmark_work_stealing() { runtime::scheduler sched(4); sched.start(); - std::atomic completed{0}; - + std::atomic completed(0); + // Record initial per-worker task counts std::vector initial_counts(4); for (size_t i = 0; i < 4; ++i) { initial_counts[i] = sched.worker_tasks_executed(i); } - - auto heavy_task = [&]() -> coro::task { + + auto taskdef = [&completed]() -> coro::task { volatile int sum = 0; - for (int i = 0; i < 10000; ++i) { - sum = sum + i * i; + for (int j = 0; j < 10000; ++j) { + sum = sum + j * j; } (void)sum; completed.fetch_add(1, 
std::memory_order_relaxed); co_return; }; - + auto batch_start = high_resolution_clock::now(); - + // Spawn ALL tasks to worker 0 to test work stealing for (int i = 0; i < batch_size; ++i) { - auto t = heavy_task(); - sched.spawn_to(0, t.release()); + sched.go_to(0, taskdef); } - + while (completed.load(std::memory_order_relaxed) < batch_size) { std::this_thread::sleep_for(microseconds(1)); } @@ -364,13 +367,13 @@ void benchmark_scalability() { runtime::scheduler sched(num_threads); sched.start(); - std::atomic completed{0}; + std::atomic completed(0); - auto task_func = [&]() -> coro::task { + auto taskdef = [&completed]() -> coro::task { // Larger CPU-bound work to minimize scheduling overhead ratio volatile int sum = 0; - for (int i = 0; i < work_iterations; ++i) { - sum = sum + i * i; + for (int j = 0; j < work_iterations; ++j) { + sum = sum + j * j; } (void)sum; completed.fetch_add(1, std::memory_order_relaxed); @@ -381,8 +384,7 @@ void benchmark_scalability() { // Distribute tasks evenly across workers for true parallel scaling test for (int i = 0; i < batch_size; ++i) { - auto t = task_func(); - sched.spawn(t.release()); // Round-robin distribution + sched.go(taskdef); } while (completed.load(std::memory_order_relaxed) < batch_size) { diff --git a/examples/debug_test.cpp b/examples/debug_test.cpp index 8b9fdde..a3b9810 100644 --- a/examples/debug_test.cpp +++ b/examples/debug_test.cpp @@ -45,7 +45,7 @@ coro::task signal_handler_task() { } while(0) // Helper awaitable to get promise reference -namespace detail { +namespace debug_detail { struct get_promise { bool await_ready() const noexcept { return false; } @@ -66,7 +66,7 @@ struct get_promise { // Level 3: Leaf coroutine that does some work coro::task compute_value(int x) { // Set debug location - auto& p = co_await detail::get_promise{}; + auto& p = co_await debug_detail::get_promise{}; p.set_location(__FILE__, "compute_value", __LINE__); p.set_state(coro::coroutine_state::running); @@ -79,7 +79,7 @@ 
coro::task compute_value(int x) { // Level 2: Middle coroutine coro::task process_data(int id) { - auto& p = co_await detail::get_promise{}; + auto& p = co_await debug_detail::get_promise{}; p.set_location(__FILE__, "process_data", __LINE__); p.set_state(coro::coroutine_state::running); @@ -90,7 +90,7 @@ coro::task process_data(int id) { // Level 1: Outer coroutine (worker) coro::task worker_task(int worker_id) { - auto& p = co_await detail::get_promise{}; + auto& p = co_await debug_detail::get_promise{}; p.set_location(__FILE__, "worker_task", __LINE__); p.set_state(coro::coroutine_state::running); @@ -107,7 +107,7 @@ coro::task worker_task(int worker_id) { // Long-running task for debugging coro::task long_running_task([[maybe_unused]] int id) { - auto& p = co_await detail::get_promise{}; + auto& p = co_await debug_detail::get_promise{}; p.set_location(__FILE__, "long_running_task", __LINE__); p.set_state(coro::coroutine_state::running); @@ -141,19 +141,7 @@ coro::task async_main(int argc, char* argv[]) { std::cout << std::endl; } - // Spawn some worker tasks - std::vector> workers; - for (int i = 0; i < 4; ++i) { - workers.push_back(worker_task(i)); - } - - // Spawn long-running tasks for debugging - std::vector> long_tasks; - for (int i = 0; i < 2; ++i) { - long_tasks.push_back(long_running_task(i)); - } - - // Get scheduler and spawn tasks + // Get scheduler auto* sched = runtime::scheduler::current(); if (!sched) { std::cerr << "Error: No scheduler" << std::endl; @@ -161,17 +149,19 @@ coro::task async_main(int argc, char* argv[]) { } // Spawn signal handler coroutine - auto sig_handler = signal_handler_task(); - sched->spawn(sig_handler.release()); + sched->go(signal_handler_task); - for (auto& w : workers) { - sched->spawn(w.release()); + // Spawn some worker tasks + for (int i = 0; i < 4; ++i) { + sched->go([i]() { return worker_task(i); }); } - for (auto& t : long_tasks) { - sched->spawn(t.release()); + + // Spawn long-running tasks for debugging + for 
(int i = 0; i < 2; ++i) { + sched->go([i]() { return long_running_task(i); }); } - std::cout << "Spawned " << workers.size() + long_tasks.size() << " tasks" << std::endl; + std::cout << "Spawned " << 4 + 2 << " tasks" << std::endl; std::cout << std::endl; if (pause_mode) { @@ -198,5 +188,5 @@ int main(int argc, char* argv[]) { sigs.block_all_threads(); // Use elio::run() with the async_main coroutine - return elio::run(async_main(argc, argv)); + return elio::run([&]() { return async_main(argc, argv); }); } diff --git a/examples/dynamic_threads.cpp b/examples/dynamic_threads.cpp index 512f722..35fe153 100644 --- a/examples/dynamic_threads.cpp +++ b/examples/dynamic_threads.cpp @@ -21,8 +21,7 @@ void run_batch(runtime::scheduler& sched, int num_tasks, const std::string& labe // Spawn tasks for (int i = 0; i < num_tasks; ++i) { - auto t = simple_task(completed); - sched.spawn(t.release()); + sched.go([&completed]() { return simple_task(completed); }); } // Wait for completion diff --git a/examples/http_server.cpp b/examples/http_server.cpp index 954f74b..a83f55a 100644 --- a/examples/http_server.cpp +++ b/examples/http_server.cpp @@ -231,14 +231,14 @@ coro::task async_main(int argc, char* argv[]) { try { auto tls_ctx = tls::tls_context::make_server(cert_file, key_file); ELIO_LOG_INFO("Starting HTTPS server on {}", bind_addr.to_string()); - co_await elio::serve(srv, srv.listen_tls(bind_addr, tls_ctx, opts)); + co_await elio::serve(srv, [&]() { return srv.listen_tls(bind_addr, tls_ctx, opts); }); } catch (const std::exception& e) { ELIO_LOG_ERROR("Failed to start HTTPS server: {}", e.what()); co_return 1; } } else { ELIO_LOG_INFO("Starting HTTP server on {}", bind_addr.to_string()); - co_await elio::serve(srv, srv.listen(bind_addr, opts)); + co_await elio::serve(srv, [&]() { return srv.listen(bind_addr, opts); }); } co_return 0; diff --git a/examples/io_benchmark.cpp b/examples/io_benchmark.cpp index 0c5611d..133f7dd 100644 --- a/examples/io_benchmark.cpp +++ 
b/examples/io_benchmark.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -48,7 +49,7 @@ void benchmark_file_io() { }; auto start = high_resolution_clock::now(); - io_task().go(); + elio::go(io_task); while (completed.load(std::memory_order_acquire) == 0) { std::this_thread::sleep_for(microseconds(100)); @@ -104,7 +105,7 @@ void benchmark_concurrent_file_io() { auto start = high_resolution_clock::now(); for (int i = 0; i < NUM_TASKS; ++i) { - io_task().go(); + elio::go(io_task); } while (completed.load(std::memory_order_acquire) < NUM_TASKS) { diff --git a/examples/microbench.cpp b/examples/microbench.cpp index 09c09d0..a557a0f 100644 --- a/examples/microbench.cpp +++ b/examples/microbench.cpp @@ -19,6 +19,10 @@ int main() { constexpr int N = 100000; + // NOTE: Tests 1-2 below use detail::task_access to directly measure + // coroutine frame allocation overhead. This is intentional for low-level + // performance analysis and requires manual handle management. + // 1. Measure coroutine frame allocation (cold - first time) { std::vector> handles; @@ -27,7 +31,7 @@ int main() { auto start = high_resolution_clock::now(); for (int i = 0; i < N; ++i) { auto t = empty_task(); - handles.push_back(t.release()); + handles.push_back(coro::detail::task_access::release(t)); } auto end = high_resolution_clock::now(); auto ns = duration_cast(end - start).count(); @@ -46,7 +50,7 @@ int main() { auto start = high_resolution_clock::now(); for (int i = 0; i < N; ++i) { auto t = empty_task(); - handles.push_back(t.release()); + handles.push_back(coro::detail::task_access::release(t)); } auto end = high_resolution_clock::now(); auto ns = duration_cast(end - start).count(); @@ -119,23 +123,14 @@ int main() { close(fd); } - // 7. Full spawn path (with running scheduler) - cold + // 7. 
Full spawn path (with running scheduler) - includes alloc + spawn { runtime::scheduler sched(4); sched.start(); - std::vector> handles; - handles.reserve(N); - - // Pre-create tasks - for (int i = 0; i < N; ++i) { - auto t = empty_task(); - handles.push_back(t.release()); - } - auto start = high_resolution_clock::now(); - for (auto h : handles) { - sched.spawn(h); + for (int i = 0; i < N; ++i) { + sched.go(empty_task); } auto end = high_resolution_clock::now(); @@ -145,12 +140,12 @@ int main() { } auto ns = duration_cast(end - start).count(); - std::cout << "spawn() only (pre-alloc): " << (ns / N) << " ns/spawn" << std::endl; + std::cout << "sched.go() full path: " << (ns / N) << " ns/go" << std::endl; sched.shutdown(); } - // 8. Measure idle worker overhead + // 8. Measure warmed-up worker overhead { runtime::scheduler sched(4); sched.start(); @@ -158,18 +153,9 @@ int main() { // Let workers warm up std::this_thread::sleep_for(std::chrono::milliseconds(50)); - std::vector> handles; - handles.reserve(N); - - // Pre-create tasks - for (int i = 0; i < N; ++i) { - auto t = empty_task(); - handles.push_back(t.release()); - } - auto start = high_resolution_clock::now(); - for (auto h : handles) { - sched.spawn(h); + for (int i = 0; i < N; ++i) { + sched.go(empty_task); } auto end = high_resolution_clock::now(); @@ -179,7 +165,7 @@ int main() { } auto ns = duration_cast(end - start).count(); - std::cout << "spawn() only (workers idle): " << (ns / N) << " ns/spawn" << std::endl; + std::cout << "sched.go() (workers warmed): " << (ns / N) << " ns/go" << std::endl; sched.shutdown(); } diff --git a/examples/parallel_tasks.cpp b/examples/parallel_tasks.cpp index 35a55d5..6671392 100644 --- a/examples/parallel_tasks.cpp +++ b/examples/parallel_tasks.cpp @@ -64,8 +64,7 @@ coro::task async_main([[maybe_unused]] int argc, [[maybe_unused]] char* arg for (int i = 0; i < num_tasks; ++i) { // Vary work amount: some tasks do more work than others int work_amount = 10 + (i % 20); - auto 
t = worker_task(i, work_amount, completed); - sched->spawn(t.release()); + sched->go([i, work_amount, &completed]() { return worker_task(i, work_amount, completed); }); } // Monitor progress using yield diff --git a/examples/quick_benchmark.cpp b/examples/quick_benchmark.cpp index 5b7fff6..9ed7fd1 100644 --- a/examples/quick_benchmark.cpp +++ b/examples/quick_benchmark.cpp @@ -69,8 +69,7 @@ void benchmark_spawn_overhead() { auto batch_start = high_resolution_clock::now(); for (int i = 0; i < batch_size; ++i) { - auto t = empty_task(); - sched.spawn(t.release()); + sched.go(empty_task); } while (sched.pending_tasks() > 0) { @@ -125,8 +124,7 @@ void benchmark_context_switch() { auto batch_start = high_resolution_clock::now(); for (int i = 0; i < batch_size; ++i) { - auto t = task_with_await(); - sched.spawn(t.release()); + sched.go(task_with_await); } while (completed.load(std::memory_order_relaxed) < batch_size) { @@ -188,8 +186,7 @@ void benchmark_yield() { steady_clock::now().time_since_epoch()).count(); for (int i = 0; i < num_vthreads; ++i) { - auto t = yield_task(); - sched.spawn(t.release()); + sched.go(yield_task); } while (end_time_ns.load(std::memory_order_acquire) == 0) {} diff --git a/examples/rpc_client_example.cpp b/examples/rpc_client_example.cpp index c1c8ffb..e1b31e6 100644 --- a/examples/rpc_client_example.cpp +++ b/examples/rpc_client_example.cpp @@ -268,8 +268,7 @@ task run_demo(tcp_rpc_client::ptr client) { auto* sched = scheduler::current(); if (sched) { - auto t = call_task(); - sched->spawn(t.release()); + sched->go(call_task); } } @@ -369,8 +368,7 @@ int main(int argc, char* argv[]) { sched.start(); // Run client - auto client = client_main(host, port); - sched.spawn(client.release()); + sched.go([&]() { return client_main(host, port); }); // Wait for completion std::this_thread::sleep_for(std::chrono::seconds(5)); diff --git a/examples/rpc_server_example.cpp b/examples/rpc_server_example.cpp index ca33657..d6a704d 100644 --- 
a/examples/rpc_server_example.cpp +++ b/examples/rpc_server_example.cpp @@ -321,12 +321,10 @@ int main(int argc, char* argv[]) { sched.start(); // Spawn signal handler coroutine - auto sig_handler = signal_handler_task(); - sched.spawn(sig_handler.release()); + sched.go(signal_handler_task); // Run server - auto server = server_main(port, sched); - sched.spawn(server.release()); + sched.go([port, &sched]() { return server_main(port, sched); }); // Wait for shutdown while (g_running) { diff --git a/examples/scalability_test.cpp b/examples/scalability_test.cpp index 1596dee..afadf46 100644 --- a/examples/scalability_test.cpp +++ b/examples/scalability_test.cpp @@ -52,8 +52,7 @@ int main() { // Distribute tasks evenly via spawn (round-robin) for (int i = 0; i < batch_size; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); } while (completed.load(std::memory_order_relaxed) < batch_size) { diff --git a/examples/signal_handling.cpp b/examples/signal_handling.cpp index dfba213..162567f 100644 --- a/examples/signal_handling.cpp +++ b/examples/signal_handling.cpp @@ -105,14 +105,12 @@ task main_task(scheduler& sched) { ELIO_LOG_INFO("Starting application with PID {}", getpid()); // Spawn the signal handler - auto sig_handler = signal_handler_task(sched); - sched.spawn(sig_handler.release()); + sched.go([&sched]() { return signal_handler_task(sched); }); // Spawn some worker coroutines constexpr int num_workers = 3; for (int i = 0; i < num_workers; ++i) { - auto worker = worker_task(i); - sched.spawn(worker.release()); + sched.go([i]() { return worker_task(i); }); } ELIO_LOG_INFO("All workers started"); @@ -142,8 +140,7 @@ int main() { sched.start(); // Spawn main task - auto main = main_task(sched); - sched.spawn(main.release()); + sched.go([&sched]() { return main_task(sched); }); // Run until shutdown while (g_running) { diff --git a/examples/sse_server.cpp b/examples/sse_server.cpp index 70723cf..6ae36a1 100644 --- 
a/examples/sse_server.cpp +++ b/examples/sse_server.cpp @@ -129,8 +129,9 @@ class sse_http_server { continue; } - auto handler = handle_connection(std::move(*stream_result)); - sched->spawn(handler.release()); + sched->go([this, stream = std::move(*stream_result)]() mutable { + return handle_connection(std::move(stream)); + }); } } @@ -367,7 +368,7 @@ coro::task async_main(int argc, char* argv[]) { // Start server and wait for shutdown signal // elio::serve() handles signal waiting and graceful shutdown automatically - co_await elio::serve(srv, srv.listen(bind_addr)); + co_await elio::serve(srv, [&]() { return srv.listen(bind_addr); }); co_return 0; } diff --git a/examples/tcp_echo_client.cpp b/examples/tcp_echo_client.cpp index f6c6c22..f6131b8 100644 --- a/examples/tcp_echo_client.cpp +++ b/examples/tcp_echo_client.cpp @@ -209,8 +209,7 @@ int main(int argc, char* argv[]) { done = true; }; - auto client = run_client(); - sched.spawn(client.release()); + sched.go(run_client); // Wait for completion while (!done) { diff --git a/examples/tcp_echo_server.cpp b/examples/tcp_echo_server.cpp index 0cb25db..bbaea48 100644 --- a/examples/tcp_echo_server.cpp +++ b/examples/tcp_echo_server.cpp @@ -133,8 +133,9 @@ task server_main(const socket_address& bind_addr, const tcp_options& opts, // Spawn handler coroutine for this client int client_id = ++client_counter; - auto handler = handle_client(std::move(*stream_result), client_id); - sched.spawn(handler.release()); + sched.go([stream = std::move(*stream_result), client_id]() mutable { + return handle_client(std::move(stream), client_id); + }); } ELIO_LOG_INFO("Server shutting down..."); @@ -199,12 +200,10 @@ int main(int argc, char* argv[]) { sched.start(); // Spawn signal handler coroutine - auto sig_handler = signal_handler_task(); - sched.spawn(sig_handler.release()); + sched.go(signal_handler_task); // Run server - auto server = server_main(bind_addr, opts, sched); - sched.spawn(server.release()); + sched.go([&bind_addr, 
&opts, &sched]() { return server_main(bind_addr, opts, sched); }); // Wait until interrupted while (g_running) { diff --git a/examples/thread_affinity.cpp b/examples/thread_affinity.cpp index c173f6c..81d8617 100644 --- a/examples/thread_affinity.cpp +++ b/examples/thread_affinity.cpp @@ -106,8 +106,7 @@ coro::task thread_local_state_example() { // Spawn multiple tasks bound to different workers for (int i = 0; i < 8; ++i) { size_t target = i % std::min(num_workers, size_t(2)); // Distribute across 2 workers - auto t = thread_local_state_task(i, target); - sched->spawn(t.release()); + sched->go([i, target]() { return thread_local_state_task(i, target); }); } // Give tasks time to complete @@ -175,8 +174,7 @@ coro::task multi_worker_example() { std::cout << "Spawning " << num_workers << " tasks, one per worker..." << std::endl; for (size_t i = 0; i < num_workers; ++i) { - auto t = worker_task(i, counters[i]); - sched->spawn(t.release()); + sched->go([i, &counters]() { return worker_task(i, counters[i]); }); } // Wait for completion diff --git a/examples/uds_echo_client.cpp b/examples/uds_echo_client.cpp index 45a3436..4ae05a9 100644 --- a/examples/uds_echo_client.cpp +++ b/examples/uds_echo_client.cpp @@ -188,8 +188,7 @@ int main(int argc, char* argv[]) { done = true; }; - auto client = run_client(); - sched.spawn(client.release()); + sched.go(run_client); // Wait for completion while (!done) { diff --git a/examples/uds_echo_server.cpp b/examples/uds_echo_server.cpp index 994fff8..c2b7c2f 100644 --- a/examples/uds_echo_server.cpp +++ b/examples/uds_echo_server.cpp @@ -126,8 +126,9 @@ task server_main(const unix_address& addr, scheduler& sched) { // Spawn handler coroutine for this client int client_id = ++client_counter; - auto handler = handle_client(std::move(*stream_result), client_id); - sched.spawn(handler.release()); + sched.go([stream = std::move(*stream_result), client_id]() mutable { + return handle_client(std::move(stream), client_id); + }); } 
ELIO_LOG_INFO("Server shutting down..."); @@ -162,12 +163,10 @@ int main(int argc, char* argv[]) { sched.start(); // Spawn signal handler coroutine - auto sig_handler = signal_handler_task(); - sched.spawn(sig_handler.release()); + sched.go(signal_handler_task); // Run server - auto server = server_main(addr, sched); - sched.spawn(server.release()); + sched.go([&addr, &sched]() { return server_main(addr, sched); }); // Wait until interrupted while (g_running) { diff --git a/examples/websocket_server.cpp b/examples/websocket_server.cpp index 01ada1a..7a4583a 100644 --- a/examples/websocket_server.cpp +++ b/examples/websocket_server.cpp @@ -297,7 +297,7 @@ coro::task async_main(int argc, char* argv[]) { // Start server and wait for shutdown signal // elio::serve() handles signal waiting and graceful shutdown automatically - co_await elio::serve(srv, srv.listen(bind_addr)); + co_await elio::serve(srv, [&]() { return srv.listen(bind_addr); }); co_return 0; } diff --git a/include/elio/coro/frame_allocator.hpp b/include/elio/coro/frame_allocator.hpp deleted file mode 100644 index f98d93a..0000000 --- a/include/elio/coro/frame_allocator.hpp +++ /dev/null @@ -1,329 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -// Architecture-specific CPU pause/yield hint for tight spin loops. -// Reduces power consumption and allows the HT sibling to run. -#if defined(__x86_64__) || defined(__i386__) -# define ELIO_CPU_PAUSE() __builtin_ia32_pause() -#elif defined(__aarch64__) || defined(__arm__) -# define ELIO_CPU_PAUSE() __asm__ __volatile__("yield" ::: "memory") -#else -# include -# define ELIO_CPU_PAUSE() std::this_thread::yield() -#endif - -namespace elio::coro { - -/// Thread-local free-list based frame allocator for small coroutine frames -/// Dramatically reduces allocation overhead for frequently created/destroyed coroutines -/// -/// Design: Each allocated frame has a hidden header storing the source pool ID and size class. 
-/// When deallocated on a different thread, the frame is returned via an MPSC queue -/// to its source pool. This handles work-stealing scenarios where coroutines -/// are allocated on thread A but deallocated on thread B. -/// -/// Size Classes: Multiple pools for different frame sizes (32, 64, 128, 256 bytes) -/// reduce memory waste for small frames while maintaining allocation performance. -/// -/// Note: Under sanitizers, pooling is disabled to allow proper leak/error detection. -class frame_allocator { -public: - // Size classes for different frame sizes - static constexpr size_t SIZE_CLASSES[] = {32, 64, 128, 256}; - static constexpr size_t NUM_SIZE_CLASSES = 4; - static constexpr size_t POOL_SIZE = 512; // Per size class - static constexpr size_t REMOTE_QUEUE_BATCH = 64; // Process remote returns in batches - -// Detect sanitizers: GCC uses __SANITIZE_*, Clang uses __has_feature -#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__) -#define ELIO_SANITIZER_ACTIVE 1 -#elif defined(__has_feature) -#if __has_feature(address_sanitizer) || __has_feature(thread_sanitizer) -#define ELIO_SANITIZER_ACTIVE 1 -#endif -#endif - -#ifdef ELIO_SANITIZER_ACTIVE - // Under sanitizers, bypass pooling entirely for accurate leak detection - static void* allocate(size_t size) { - return ::operator new(size); - } - - static void deallocate(void* ptr, [[maybe_unused]] size_t size) noexcept { - ::operator delete(ptr); - } -#else - static void* allocate(size_t size) { - size_t sc = find_size_class(size); - if (sc < NUM_SIZE_CLASSES) { - auto& alloc = instance(); - - // First try to reclaim remote returns periodically - alloc.reclaim_remote_returns(); - - if (alloc.free_count_[sc] > 0) { - void* block = alloc.pool_[sc][--alloc.free_count_[sc]]; - // Update header to reflect current pool ownership - auto* header = static_cast(block); - header->source_pool_id = alloc.pool_id_; - header->size_class = static_cast(sc); - return block_to_user(block); - } - - // Allocate new 
block with header - void* block = ::operator new(alloc_block_size(sc)); - auto* header = static_cast(block); - header->source_pool_id = alloc.pool_id_; - header->size_class = static_cast(sc); - header->next.store(nullptr, std::memory_order_relaxed); - return block_to_user(block); - } - // Fall back to standard allocation for large frames - return ::operator new(size); - } - - static void deallocate(void* ptr, size_t size) noexcept { - size_t sc = find_size_class(size); - if (sc < NUM_SIZE_CLASSES) { - void* block = user_to_block(ptr); - auto* header = static_cast(block); - auto& alloc = instance(); - - // Fast path: same thread - return directly to local pool - if (header->source_pool_id == alloc.pool_id_) { - if (alloc.free_count_[sc] < POOL_SIZE) { - alloc.pool_[sc][alloc.free_count_[sc]++] = block; - return; - } - // Pool full, delete the block - ::operator delete(block); - return; - } else { - // Cross-thread deallocation: push to source pool's remote queue - frame_allocator* source = get_pool_by_id(header->source_pool_id); - if (source) { - source->push_remote_return(block); - return; - } - // Source pool no longer exists (thread exited), delete the block - ::operator delete(block); - return; - } - } - // Large allocation - was allocated without header - ::operator delete(ptr); - } -#endif - -private: - // Block header stored before user data - struct block_header { - uint32_t source_pool_id; // ID of the pool that allocated this block - uint8_t size_class; // Size class index (0-3) - std::atomic next; // For MPSC queue linkage - }; - - // Header size - static constexpr size_t HEADER_SIZE = sizeof(block_header); - - // Find size class index for requested size - static size_t find_size_class(size_t size) noexcept { - for (size_t i = 0; i < NUM_SIZE_CLASSES; ++i) { - if (size <= SIZE_CLASSES[i]) { - return i; - } - } - return NUM_SIZE_CLASSES; // Not found (for sizes > 256) - } - - // Get actual size for a size class - static size_t size_class_size(size_t idx) 
noexcept { - return SIZE_CLASSES[idx]; - } - - // Total block size including header for a given size class - static size_t alloc_block_size(size_t size_class_idx) noexcept { - return HEADER_SIZE + SIZE_CLASSES[size_class_idx]; - } - - // Convert between block (with header) and user pointer - static void* block_to_user(void* block) noexcept { - return static_cast(block) + HEADER_SIZE; - } - - static void* user_to_block(void* user) noexcept { - return static_cast(user) - HEADER_SIZE; - } - - frame_allocator() - : pool_id_(next_pool_id_.fetch_add(1, std::memory_order_relaxed)) - , remote_head_{} - , remote_tail_(&remote_head_) { - // Initialize remote_head_ fields after default construction - remote_head_.source_pool_id = 0; - remote_head_.size_class = 0; - remote_head_.next.store(nullptr, std::memory_order_relaxed); - // Initialize free counts to 0 - for (size_t i = 0; i < NUM_SIZE_CLASSES; ++i) { - free_count_[i] = 0; - } - // Register this pool for cross-thread access - register_pool(this); - } - - ~frame_allocator() { - // Unregister before cleanup - unregister_pool(this); - - // Reclaim any remaining remote returns - reclaim_all_remote_returns(); - - // Free all cached frames when thread exits - for (size_t sc = 0; sc < NUM_SIZE_CLASSES; ++sc) { - for (size_t i = 0; i < free_count_[sc]; ++i) { - ::operator delete(pool_[sc][i]); - } - } - } - - // MPSC queue: push from any thread (producers), pop from owner only (consumer) - void push_remote_return(void* block) noexcept { - auto* header = static_cast(block); - header->next.store(nullptr, std::memory_order_relaxed); - - // Atomic push to MPSC queue (lock-free) - block_header* prev = remote_tail_.exchange(header, std::memory_order_acq_rel); - prev->next.store(header, std::memory_order_release); - } - - // Called by owner thread to reclaim remote returns for all size classes - void reclaim_remote_returns() noexcept { - // Quick check without full synchronization - block_header* head = 
remote_head_.next.load(std::memory_order_acquire); - if (!head) return; - - size_t count = 0; - while (head && count < REMOTE_QUEUE_BATCH) { - block_header* next = head->next.load(std::memory_order_acquire); - - // If next is null but tail points elsewhere, the producer is in the - // middle of push() (has done the tail exchange but not yet written - // prev->next). Spin briefly with a CPU pause hint. - if (!next && remote_tail_.load(std::memory_order_acquire) != head) { - for (int i = 0; i < 16; ++i) { - ELIO_CPU_PAUSE(); - next = head->next.load(std::memory_order_acquire); - if (next) break; - } - // If the link still isn't ready, stop without consuming 'head'. - // Consuming it would leave the queue in a broken state because - // the producer would later write through a recycled pointer. - if (!next) break; - } - - // Add to appropriate size class pool - size_t sc = head->size_class; - if (sc < NUM_SIZE_CLASSES && free_count_[sc] < POOL_SIZE) { - pool_[sc][free_count_[sc]++] = head; - remote_head_.next.store(next, std::memory_order_release); - ++count; - } else if (sc >= NUM_SIZE_CLASSES) { - // Invalid size class - delete the block - ::operator delete(head); - remote_head_.next.store(next, std::memory_order_release); - } else { - // Pool full - leave it in the queue for later - break; - } - head = next; - } - } - - // Called during destruction to reclaim all - void reclaim_all_remote_returns() noexcept { - block_header* head = remote_head_.next.load(std::memory_order_acquire); - while (head) { - block_header* next = head->next.load(std::memory_order_acquire); - - // Same safe spin pattern as reclaim_remote_returns(), but with more - // retries because we're in teardown and really want to drain the queue. 
- if (!next && remote_tail_.load(std::memory_order_acquire) != head) { - for (int i = 0; i < 32; ++i) { - ELIO_CPU_PAUSE(); - next = head->next.load(std::memory_order_acquire); - if (next) break; - } - // Stop safely rather than risk corrupting a partially-linked node. - if (!next) break; - } - - size_t sc = head->size_class; - if (sc < NUM_SIZE_CLASSES && free_count_[sc] < POOL_SIZE) { - pool_[sc][free_count_[sc]++] = head; - } else { - ::operator delete(head); - } - head = next; - } - remote_head_.next.store(nullptr, std::memory_order_release); - remote_tail_.store(&remote_head_, std::memory_order_release); - } - - static frame_allocator& instance() { - static thread_local frame_allocator alloc; - return alloc; - } - - // Pool registry for cross-thread access - static constexpr size_t MAX_POOLS = 256; - - // Registry entries - atomic for lock-free reads, protected by mutex for writes - static inline std::atomic pool_registry_[MAX_POOLS]{}; - static inline std::mutex registry_mutex_; // Protects unregister operations - - static void register_pool(frame_allocator* pool) noexcept { - uint32_t id = pool->pool_id_; - if (id < MAX_POOLS) { - pool_registry_[id].store(pool, std::memory_order_release); - } - } - - static void unregister_pool(frame_allocator* pool) noexcept { - uint32_t id = pool->pool_id_; - if (id < MAX_POOLS) { - // Use mutex to ensure no concurrent lookups during unregister - // This prevents the race where a lookup sees a valid pointer - // but the pool is being destroyed - std::lock_guard lock(registry_mutex_); - pool_registry_[id].store(nullptr, std::memory_order_release); - } - } - - // Get pool by ID - returns nullptr if pool was unregistered - static frame_allocator* get_pool_by_id(uint32_t id) noexcept { - if (id < MAX_POOLS) { - return pool_registry_[id].load(std::memory_order_acquire); - } - return nullptr; - } - - std::array, NUM_SIZE_CLASSES> pool_; - std::array free_count_; - uint32_t pool_id_; - - // MPSC queue for remote returns (dummy 
head node pattern) - block_header remote_head_; // Dummy node - next points to actual head - std::atomic remote_tail_; - - // Global pool ID counter - static inline std::atomic next_pool_id_{0}; -}; - -} // namespace elio::coro diff --git a/include/elio/coro/promise_base.hpp b/include/elio/coro/promise_base.hpp index 113c296..5b317b5 100644 --- a/include/elio/coro/promise_base.hpp +++ b/include/elio/coro/promise_base.hpp @@ -5,6 +5,8 @@ #include #include +#include "vthread_stack.hpp" + namespace elio::coro { /// Constant indicating no affinity (vthread can migrate freely) @@ -94,12 +96,38 @@ class promise_base { , debug_id_(0) // Lazy allocation - only allocated when id() is called #endif , affinity_(NO_AFFINITY) + , vstack_(current_frame_ ? current_frame_->vstack() : nullptr) + , owns_vstack_(false) { current_frame_ = this; } ~promise_base() noexcept { current_frame_ = parent_; + if (owns_vstack_) { + // Clear current_ before deleting vstack. When operator delete later + // calls tagged_dealloc() -> vthread_stack::deallocate(), it will find + // current_ is nullptr and correctly no-op (memory already freed by vstack). + auto* vs = vstack_.exchange(nullptr, std::memory_order_acq_rel); + if (vthread_stack::current() == vs) { + vthread_stack::set_current(nullptr); + } + delete vs; + } + } + + /// Detach this frame from the current thread's frame chain. + /// Call this before spawning a coroutine to another thread to avoid + /// use-after-free when the original thread creates another coroutine. + void detach_from_parent() noexcept { + if (current_frame_ == this) { + // Set to nullptr instead of parent_ to avoid use-after-free. + // parent_ may have been spawned to another thread and destroyed. 
+ current_frame_ = nullptr; + } + parent_ = nullptr; + // Ensure all writes before detach are visible to the thread that will execute this coroutine + std::atomic_thread_fence(std::memory_order_release); } promise_base(const promise_base&) = delete; @@ -126,6 +154,10 @@ class promise_base { return current_frame_; } + static void set_current_frame(promise_base* frame) noexcept { + current_frame_ = frame; + } + // Debug accessors (available only when debug metadata is enabled) #if ELIO_ENABLE_DEBUG_METADATA [[nodiscard]] uint64_t frame_magic() const noexcept { return frame_magic_; } @@ -172,17 +204,41 @@ class promise_base { // Affinity accessors /// Get the current thread affinity for this vthread /// @return Worker ID this vthread is bound to, or NO_AFFINITY if unbound - [[nodiscard]] size_t affinity() const noexcept { return affinity_; } + [[nodiscard]] size_t affinity() const noexcept { + return affinity_.load(std::memory_order_acquire); + } /// Set thread affinity for this vthread /// @param worker_id Worker ID to bind to, or NO_AFFINITY to clear - void set_affinity(size_t worker_id) noexcept { affinity_ = worker_id; } + void set_affinity(size_t worker_id) noexcept { + affinity_.store(worker_id, std::memory_order_release); + } /// Check if this vthread has affinity set - [[nodiscard]] bool has_affinity() const noexcept { return affinity_ != NO_AFFINITY; } + [[nodiscard]] bool has_affinity() const noexcept { + return affinity_.load(std::memory_order_acquire) != NO_AFFINITY; + } /// Clear thread affinity, allowing this vthread to migrate freely - void clear_affinity() noexcept { affinity_ = NO_AFFINITY; } + void clear_affinity() noexcept { + affinity_.store(NO_AFFINITY, std::memory_order_release); + } + + // vthread_stack accessors + [[nodiscard]] vthread_stack* vstack() const noexcept { + return vstack_.load(std::memory_order_acquire); + } + + void set_vstack(vthread_stack* vs) noexcept { + vstack_.store(vs, std::memory_order_release); + } + + void 
set_vstack_owner(vthread_stack* vs) noexcept { + vstack_.store(vs, std::memory_order_release); + owns_vstack_ = true; + } + + [[nodiscard]] bool owns_vstack() const noexcept { return owns_vstack_; } private: // Magic number at start for debugger validation @@ -201,7 +257,12 @@ class promise_base { #endif // Thread affinity: NO_AFFINITY means can migrate freely - size_t affinity_; + // Must be atomic to avoid data races in work-stealing scenarios + std::atomic affinity_; + + // vthread_stack support + std::atomic vstack_{nullptr}; + bool owns_vstack_ = false; static inline thread_local promise_base* current_frame_ = nullptr; }; diff --git a/include/elio/coro/task.hpp b/include/elio/coro/task.hpp index ceff525..f5b0a10 100644 --- a/include/elio/coro/task.hpp +++ b/include/elio/coro/task.hpp @@ -1,7 +1,7 @@ #pragma once #include "promise_base.hpp" -#include "frame_allocator.hpp" +#include "vthread_stack.hpp" #include #include #include @@ -47,6 +47,54 @@ struct final_awaiter { void await_resume() const noexcept {} }; +// Allocation modes +enum class alloc_mode : uint8_t { stack = 0, heap = 1 }; +inline thread_local alloc_mode current_alloc_mode_ = alloc_mode::stack; + +// RAII guard: temporarily switch to heap allocation +struct heap_alloc_guard { + heap_alloc_guard() noexcept { current_alloc_mode_ = alloc_mode::heap; } + ~heap_alloc_guard() noexcept { current_alloc_mode_ = alloc_mode::stack; } + heap_alloc_guard(const heap_alloc_guard&) = delete; + heap_alloc_guard& operator=(const heap_alloc_guard&) = delete; +}; + +// Tagged allocation +static constexpr size_t TAG_OFFSET = alignof(std::max_align_t); + +inline void* tagged_alloc(size_t size, alloc_mode tag) { + void* raw = (tag == alloc_mode::heap) + ? 
::operator new(size + TAG_OFFSET) + : vthread_stack::allocate(size + TAG_OFFSET); + *static_cast(raw) = tag; + return static_cast(raw) + TAG_OFFSET; +} + +inline void tagged_dealloc(void* ptr, size_t size) noexcept { + void* raw = static_cast(ptr) - TAG_OFFSET; + auto tag = *static_cast(raw); + if (tag == alloc_mode::heap) + ::operator delete(raw); + else + vthread_stack::deallocate(raw, size + TAG_OFFSET); +} + +// Friend accessor: extract handle from immovable task +struct task_access { + template + static auto release(TaskT& t) noexcept { + if (t.handle_) { + t.handle_.promise().detached_ = true; + } + return std::exchange(t.handle_, nullptr); + } + // Get handle without transferring ownership (for testing) + template + static auto handle(TaskT& t) noexcept { + return t.handle_; + } +}; + /// Shared state for join_handle - stores result and waiter template struct join_state { @@ -240,96 +288,62 @@ class join_handle { /// Primary template for task where T is not void template class task { + friend struct detail::task_access; public: + using value_type = T; + struct promise_type : promise_base { std::optional value_; std::coroutine_handle<> continuation_; bool detached_ = false; - // Join state for spawn() - only used when task is spawned std::shared_ptr> join_state_; - promise_type() noexcept = default; + void* operator new(size_t size) { + return detail::tagged_alloc(size, detail::current_alloc_mode_); + } + void operator delete(void* ptr, size_t size) noexcept { + detail::tagged_dealloc(ptr, size); + } [[nodiscard]] task get_return_object() noexcept { return task{std::coroutine_handle::from_promise(*this)}; } - [[nodiscard]] std::suspend_always initial_suspend() noexcept { return {}; } [[nodiscard]] detail::final_awaiter final_suspend() noexcept { return {}; } template void return_value(U&& value) { value_.emplace(std::forward(value)); - // Notify join state if present - if (join_state_) { - join_state_->set_value(std::move(*value_)); - } + if (join_state_) 
join_state_->set_value(std::move(*value_)); } void unhandled_exception() noexcept { promise_base::unhandled_exception(); - // Notify join state if present - if (join_state_) { - join_state_->set_exception(exception()); - } - } - - // Custom allocator for coroutine frames - void* operator new(size_t size) { - return frame_allocator::allocate(size); - } - - void operator delete(void* ptr, size_t size) noexcept { - frame_allocator::deallocate(ptr, size); + if (join_state_) join_state_->set_exception(exception()); } }; using handle_type = std::coroutine_handle; - explicit task(handle_type handle) noexcept : handle_(handle) {} - task(task&& other) noexcept : handle_(std::exchange(other.handle_, nullptr)) {} - - task& operator=(task&& other) noexcept { - if (this != &other) { - if (handle_) handle_.destroy(); - handle_ = std::exchange(other.handle_, nullptr); - } - return *this; - } - - ~task() { if (handle_) handle_.destroy(); } + explicit task(handle_type h) noexcept : handle_(h) {} + // Non-copyable, non-movable task(const task&) = delete; task& operator=(const task&) = delete; + task(task&&) = delete; + task& operator=(task&&) = delete; - [[nodiscard]] handle_type handle() const noexcept { return handle_; } - [[nodiscard]] handle_type release() noexcept { - if (handle_) handle_.promise().detached_ = true; - return std::exchange(handle_, nullptr); - } - - /// Spawn this task on the current scheduler (fire-and-forget) - /// The task will run asynchronously and self-destruct when complete - void go() { - runtime::schedule_handle(release()); - } - - /// Spawn this task and return a join_handle for awaiting the result - /// Usage: auto handle = some_task().spawn(); T result = co_await handle; - [[nodiscard]] join_handle spawn(); + ~task() { if (handle_) handle_.destroy(); } + // co_await interface [[nodiscard]] bool await_ready() const noexcept { return false; } - [[nodiscard]] std::coroutine_handle<> await_suspend(std::coroutine_handle<> awaiter) noexcept { 
handle_.promise().continuation_ = awaiter; return handle_; } - T await_resume() { auto& promise = handle_.promise(); - if (promise.exception()) { - std::rethrow_exception(promise.exception()); - } + if (promise.exception()) std::rethrow_exception(promise.exception()); return std::move(*promise.value_); } @@ -340,115 +354,63 @@ class task { /// Specialization for task template<> class task { + friend struct detail::task_access; public: + using value_type = void; + struct promise_type : promise_base { std::coroutine_handle<> continuation_; bool detached_ = false; - // Join state for spawn() - only used when task is spawned std::shared_ptr> join_state_; - promise_type() noexcept = default; + void* operator new(size_t size) { + return detail::tagged_alloc(size, detail::current_alloc_mode_); + } + void operator delete(void* ptr, size_t size) noexcept { + detail::tagged_dealloc(ptr, size); + } [[nodiscard]] task get_return_object() noexcept { return task{std::coroutine_handle::from_promise(*this)}; } - [[nodiscard]] std::suspend_always initial_suspend() noexcept { return {}; } [[nodiscard]] detail::final_awaiter final_suspend() noexcept { return {}; } void return_void() noexcept { - // Notify join state if present - if (join_state_) { - join_state_->set_value(); - } + if (join_state_) join_state_->set_value(); } void unhandled_exception() noexcept { promise_base::unhandled_exception(); - // Notify join state if present - if (join_state_) { - join_state_->set_exception(exception()); - } - } - - // Custom allocator for coroutine frames - void* operator new(size_t size) { - return frame_allocator::allocate(size); - } - - void operator delete(void* ptr, size_t size) noexcept { - frame_allocator::deallocate(ptr, size); + if (join_state_) join_state_->set_exception(exception()); } }; using handle_type = std::coroutine_handle; - explicit task(handle_type handle) noexcept : handle_(handle) {} - task(task&& other) noexcept : handle_(std::exchange(other.handle_, nullptr)) {} - - 
task& operator=(task&& other) noexcept { - if (this != &other) { - if (handle_) handle_.destroy(); - handle_ = std::exchange(other.handle_, nullptr); - } - return *this; - } - - ~task() { if (handle_) handle_.destroy(); } + explicit task(handle_type h) noexcept : handle_(h) {} + // Non-copyable, non-movable task(const task&) = delete; task& operator=(const task&) = delete; + task(task&&) = delete; + task& operator=(task&&) = delete; - [[nodiscard]] handle_type handle() const noexcept { return handle_; } - [[nodiscard]] handle_type release() noexcept { - if (handle_) handle_.promise().detached_ = true; - return std::exchange(handle_, nullptr); - } - - /// Spawn this task on the current scheduler (fire-and-forget) - /// The task will run asynchronously and self-destruct when complete - void go() { - runtime::schedule_handle(release()); - } - - /// Spawn this task and return a join_handle for awaiting completion - /// Usage: auto handle = some_task().spawn(); co_await handle; - [[nodiscard]] join_handle spawn(); + ~task() { if (handle_) handle_.destroy(); } + // co_await interface [[nodiscard]] bool await_ready() const noexcept { return false; } - [[nodiscard]] std::coroutine_handle<> await_suspend(std::coroutine_handle<> awaiter) noexcept { handle_.promise().continuation_ = awaiter; return handle_; } - void await_resume() { auto& promise = handle_.promise(); - if (promise.exception()) { - std::rethrow_exception(promise.exception()); - } + if (promise.exception()) std::rethrow_exception(promise.exception()); } private: handle_type handle_; }; -// Out-of-line definitions for spawn() methods -template -join_handle task::spawn() { - // Create join state and attach to task's promise - auto state = std::make_shared>(); - handle_.promise().join_state_ = state; - // Release and schedule - the promise will notify join state on completion - runtime::schedule_handle(release()); - return join_handle(std::move(state)); -} - -inline join_handle task::spawn() { - auto state = 
std::make_shared>(); - handle_.promise().join_state_ = state; - runtime::schedule_handle(release()); - return join_handle(std::move(state)); -} - } // namespace elio::coro diff --git a/include/elio/coro/task_handle.hpp b/include/elio/coro/task_handle.hpp index 33743ce..c7fc6c1 100644 --- a/include/elio/coro/task_handle.hpp +++ b/include/elio/coro/task_handle.hpp @@ -2,7 +2,6 @@ #include "promise_base.hpp" #include "cancel_token.hpp" -#include "frame_allocator.hpp" #include #include #include @@ -23,28 +22,28 @@ void schedule_handle(std::coroutine_handle<> handle) noexcept; namespace elio::coro { -/// 任务执行状态 +/// Task execution status enum class task_status { - pending, ///< 尚未开始或正在执行 - completed, ///< 正常完成(成功) - logic_failed, ///< 业务失败(显式失败,非异常) - exception, ///< 异常失败(抛出异常) - cancelled ///< 被取消 + pending, ///< Not started or currently executing + completed, ///< Normal completion (success) + logic_failed, ///< Business failure (explicit failure, not exception) + exception, ///< Exception failure (exception thrown) + cancelled ///< Cancelled }; -/// 任务结果状态(用于 task_result / awaitable_result) +/// Task result status (for task_result / awaitable_result) enum class result_status { - completed, ///< 正常完成 - logic_failed, ///< 业务失败 - timeout, ///< 超时 - cancelled, ///< 被取消 - exception ///< 异常失败 + completed, ///< Normal completion + logic_failed, ///< Business failure + timeout, ///< Timeout + cancelled, ///< Cancelled + exception ///< Exception failure }; -/// 失败信息(业务失败,非异常) +/// Failure info (business failure, not exception) struct failure { - int code = 0; ///< 错误码 - std::string message; ///< 错误信息 + int code = 0; ///< Error code + std::string message; ///< Error message failure() = default; failure(int c, std::string msg) : code(c), message(std::move(msg)) {} @@ -52,8 +51,8 @@ struct failure { explicit failure(std::string msg) : code(0), message(std::move(msg)) {} }; -/// 辅助函数:创建 failure(用于 co_return,仅适用于非 void task) -/// 用法: co_return coro::fail(404, "not found"); +/// 
Helper function: create failure (for co_return, only for non-void task) +/// Usage: co_return coro::fail(404, "not found"); inline failure fail(int code, std::string message) { return failure{code, std::move(message)}; } @@ -64,20 +63,20 @@ inline failure fail(std::string message) { namespace detail { -/// 内部共享状态 +/// Internal shared state template struct task_state { - // 状态与结果存储 + // State and result storage std::atomic status_{task_status::pending}; std::optional value_; failure failure_; std::exception_ptr exception_; - // 等待者管理 + // Waiter management std::atomic waiter_{nullptr}; std::mutex mutex_; - // 取消控制 + // Cancel control std::atomic cancel_requested_{false}; void set_value(T&& val) { @@ -158,7 +157,7 @@ struct task_state { } }; -/// void 特化 +/// void specialization template<> struct task_state { std::atomic status_{task_status::pending}; @@ -248,7 +247,7 @@ struct task_state { } // namespace detail // ============================================================================ -// task_result - 结果包装器 +// task_result - Result wrapper // ============================================================================ template @@ -258,34 +257,34 @@ class task_result { task_result() = default; - /// 构造成功结果 + /// Construct success result explicit task_result(T value) : status_(result_status::completed) , value_(std::move(value)) {} - /// 构造业务失败结果 + /// Construct business failure result explicit task_result(result_status status, failure f) : status_(status) , failure_(std::move(f)) {} - /// 构造异常结果 + /// Construct exception result explicit task_result(result_status status, std::exception_ptr ep) : status_(status) , exception_(std::move(ep)) {} - /// 构造 timeout/cancelled 结果 + /// Construct timeout/cancelled result explicit task_result(result_status status) : status_(status) {} - // 移动语义 + // Move semantics task_result(task_result&&) = default; task_result& operator=(task_result&&) = default; - // 不支持拷贝 + // No copy support task_result(const task_result&) = delete; 
task_result& operator=(const task_result&) = delete; - // ===== 状态查询 ===== + // ===== Status query ===== [[nodiscard]] bool has_value() const noexcept { return status_ == result_status::completed; } @@ -314,7 +313,7 @@ class task_result { return status_ == result_status::logic_failed; } - // ===== 值访问 ===== + // ===== Value access ===== T& value() & { return *value_; } @@ -337,7 +336,7 @@ class task_result { return has_value() ? std::move(value()) : static_cast(std::forward(default_value)); } - // ===== 结果访问 ===== + // ===== Result access ===== const failure& failure_info() const { return failure_; } @@ -360,7 +359,7 @@ class task_result { } } - // ===== 隐式转换 ===== + // ===== Implicit conversion ===== explicit operator bool() const noexcept { return has_value(); } @@ -372,7 +371,7 @@ class task_result { std::exception_ptr exception_; }; -// ===== void 特化 ===== +// ===== void specialization ===== template<> class task_result { public: @@ -380,28 +379,28 @@ class task_result { task_result() = default; - /// 构造成功/timeout/cancelled 结果 + /// Construct success/timeout/cancelled result explicit task_result(result_status status) : status_(status) {} - /// 构造业务失败结果 + /// Construct business failure result explicit task_result(result_status status, failure f) : status_(status) , failure_(std::move(f)) {} - /// 构造异常结果 + /// Construct exception result explicit task_result(result_status status, std::exception_ptr ep) : status_(status) , exception_(std::move(ep)) {} - // 移动语义 + // Move semantics task_result(task_result&&) = default; task_result& operator=(task_result&&) = default; task_result(const task_result&) = delete; task_result& operator=(const task_result&) = delete; - // ===== 状态查询 ===== + // ===== Status query ===== [[nodiscard]] bool has_value() const noexcept { return status_ == result_status::completed; } @@ -430,7 +429,7 @@ class task_result { return status_ == result_status::logic_failed; } - // ===== 结果访问 ===== + // ===== Result access ===== const failure& 
failure_info() const { return failure_; } @@ -464,7 +463,7 @@ class task_result { }; // ============================================================================ -// task_handle - 任务句柄 +// task_handle - Task handle // ============================================================================ template @@ -479,15 +478,15 @@ class task_handle { ~task_handle() = default; - // 移动语义 + // Move semantics task_handle(task_handle&&) noexcept = default; task_handle& operator=(task_handle&&) noexcept = default; - // 不支持拷贝 + // No copy support task_handle(const task_handle&) = delete; task_handle& operator=(const task_handle&) = delete; - // ===== 有效性检查 ===== + // ===== Validity check ===== [[nodiscard]] bool valid() const noexcept { return state_ != nullptr; } @@ -496,7 +495,7 @@ class task_handle { return valid(); } - // ===== 状态查询 ===== + // ===== Status query ===== [[nodiscard]] task_status status() const noexcept { if (!state_) return task_status::exception; return state_->status_.load(std::memory_order_acquire); @@ -527,7 +526,7 @@ class task_handle { return status() == task_status::pending; } - // ===== 显式结果获取 ===== + // ===== Explicit result retrieval ===== bool try_get(T& out) const { if (!state_) return false; std::lock_guard lock(state_->mutex_); @@ -569,7 +568,7 @@ class task_handle { return static_cast(std::forward(default_value)); } - // ===== 获取完整结果 ===== + // ===== Get complete result ===== task_result get_result() const { if (!state_) { return task_result(result_status::exception, @@ -598,7 +597,7 @@ class task_handle { } } - // ===== 同步等待 ===== + // ===== Synchronous wait ===== task_status wait() { if (!state_) return task_status::exception; @@ -619,7 +618,7 @@ class task_handle { std::unique_lock lock(state_->mutex_); while (!state_->is_done()) { if (std::chrono::steady_clock::now() >= deadline) { - return status(); // 可能仍为 pending + return status(); // May still be pending } lock.unlock(); std::this_thread::yield(); @@ -628,7 +627,7 @@ class task_handle 
{ return status(); } - // ===== 取消控制 ===== + // ===== Cancel control ===== void request_cancel() { if (!state_) return; state_->request_cancel(); @@ -639,7 +638,7 @@ class task_handle { return state_->is_cancellation_requested(); } - // ===== 协程等待(返回 task_result,不抛异常)===== + // ===== Coroutine await (returns task_result, no exception thrown) ===== auto operator co_await() const { struct awaiter { std::shared_ptr> state; @@ -688,7 +687,7 @@ class task_handle { }; // ============================================================================ -// task_handle - void 特化 +// task_handle - void specialization // ============================================================================ template<> @@ -709,7 +708,7 @@ class task_handle { task_handle(const task_handle&) = delete; task_handle& operator=(const task_handle&) = delete; - // ===== 有效性检查 ===== + // ===== Validity check ===== [[nodiscard]] bool valid() const noexcept { return state_ != nullptr; } @@ -718,7 +717,7 @@ class task_handle { return valid(); } - // ===== 状态查询 ===== + // ===== Status query ===== [[nodiscard]] task_status status() const noexcept { if (!state_) return task_status::exception; return state_->status_.load(std::memory_order_acquire); @@ -749,7 +748,7 @@ class task_handle { return status() == task_status::pending; } - // ===== 显式结果获取 ===== + // ===== Explicit result retrieval ===== bool try_get(failure& out) const { if (!state_) return false; std::lock_guard lock(state_->mutex_); @@ -770,7 +769,7 @@ class task_handle { return false; } - // ===== 获取完整结果 ===== + // ===== Get complete result ===== task_result get_result() const { if (!state_) { return task_result(result_status::exception, @@ -795,7 +794,7 @@ class task_handle { } } - // ===== 同步等待 ===== + // ===== Synchronous wait ===== task_status wait() { if (!state_) return task_status::exception; @@ -825,7 +824,7 @@ class task_handle { return status(); } - // ===== 取消控制 ===== + // ===== Cancel control ===== void request_cancel() { if 
(!state_) return; state_->request_cancel(); @@ -836,7 +835,7 @@ class task_handle { return state_->is_cancellation_requested(); } - // ===== 协程等待(返回 task_result,不抛异常)===== + // ===== Coroutine await (returns task_result, no exception thrown) ===== auto operator co_await() const { struct awaiter { std::shared_ptr> state; diff --git a/include/elio/coro/vthread_stack.hpp b/include/elio/coro/vthread_stack.hpp new file mode 100644 index 0000000..ce4ec7a --- /dev/null +++ b/include/elio/coro/vthread_stack.hpp @@ -0,0 +1,171 @@ +#pragma once + +#include +#include +#include +#include + +// Sanitizer detection for vthread_stack +#ifndef ELIO_SANITIZER_ACTIVE +#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__) +#define ELIO_SANITIZER_ACTIVE 1 +#elif defined(__has_feature) +#if __has_feature(address_sanitizer) || __has_feature(thread_sanitizer) +#define ELIO_SANITIZER_ACTIVE 1 +#endif +#endif +#endif + +namespace elio::coro { + +/// Segmented bump-pointer stack allocator for vthread coroutine frames. +/// +/// Each vthread maintains its own stack allocator. Coroutine frames are +/// allocated in LIFO order within stack segments. When a segment is exhausted, +/// a new segment is allocated and linked. When all frames in a segment are +/// freed, the segment is released. +/// +/// This allocator provides significant performance improvements over +/// general-purpose allocation for coroutines that follow strict LIFO +/// allocation/deallocation patterns (which is natural for nested coroutines). 
+class vthread_stack { +public: + // Static interface — for promise_type::operator new/delete +#ifdef ELIO_SANITIZER_ACTIVE + static void* allocate(size_t size) { + return ::operator new(size); + } + + static void deallocate(void* ptr, [[maybe_unused]] size_t size) noexcept { + ::operator delete(ptr); + } +#else + static void* allocate(size_t size) { + if (current_ != nullptr) { + return current_->push(size); + } + // No vthread context, use global new directly + return ::operator new(size); + } + + static void deallocate(void* ptr, size_t size) noexcept { + if (current_ != nullptr) { + current_->pop(ptr, size); + } else { + // When current_ is nullptr and we get here via tagged_dealloc() with + // a vstack tag, it means the vstack that owned this memory has been + // deleted (its destructor clears current_ and frees all segments). + // The memory pointed to by ptr is now invalid (already freed by vstack's + // destructor), so we must NOT try to free it again. + // + // This is a no-op: the memory was already freed when the vstack was deleted. + // Note: calling ::operator delete(ptr) here would be wrong + // because the memory was already freed by the owning vthread_stack. + // + // This situation occurs when: + // 1. A coroutine owns its vstack (owns_vstack_ = true) + // 2. The coroutine completes and its promise destructor runs + // 3. Promise destructor deletes the vstack (freeing all segment memory) + // 4. Then operator delete calls tagged_dealloc() -> vthread_stack::deallocate() + // 5. 
 But current_ is now nullptr because the vstack was just deleted + } + } +#endif + + // thread-local current vthread_stack management + static vthread_stack* current() noexcept { + return current_; + } + + static void set_current(vthread_stack* s) noexcept { + current_ = s; + } + + // Instance lifecycle + vthread_stack() = default; + + ~vthread_stack() { + free_segments(); + } + + vthread_stack(const vthread_stack&) = delete; + vthread_stack& operator=(const vthread_stack&) = delete; + + // Instance allocation interface + void* push(size_t size) { + size_t aligned_size = align_up(size); + + // Check if current segment has enough space + if (current_segment_ == nullptr || + current_segment_->used + aligned_size > current_segment_->capacity) { + allocate_segment(aligned_size); + } + + void* ptr = current_segment_->data() + current_segment_->used; + current_segment_->used += aligned_size; + return ptr; + } + + void pop([[maybe_unused]] void* ptr, size_t size) noexcept { + size_t aligned_size = align_up(size); + + assert(current_segment_ != nullptr && "pop called with no segment"); + assert(current_segment_->used >= aligned_size && "pop size exceeds used"); + assert(ptr == current_segment_->data() + current_segment_->used - aligned_size && + "pop ptr does not match expected position"); + + current_segment_->used -= aligned_size; + + // If current segment is empty and has a previous segment, free current segment and backtrack + if (current_segment_->used == 0 && current_segment_->prev != nullptr) { + segment* old = current_segment_; + current_segment_ = current_segment_->prev; + ::operator delete(old); + } + } + +private: + struct segment { + segment* prev; + size_t capacity; + size_t used; + + // Flexible array member workaround: compute data pointer from end of struct + char* data() noexcept { + return reinterpret_cast<char*>(this + 1); + } + const char* data() const noexcept { + return reinterpret_cast<const char*>(this + 1); + } + }; + + segment* current_segment_ = nullptr; + static
constexpr size_t DEFAULT_SEGMENT_SIZE = 16384; // 16KB + static constexpr size_t ALIGNMENT = alignof(std::max_align_t); + + static constexpr size_t align_up(size_t n) noexcept { + return (n + ALIGNMENT - 1) & ~(ALIGNMENT - 1); + } + + void allocate_segment(size_t min_payload) { + size_t payload = min_payload > DEFAULT_SEGMENT_SIZE ? min_payload : DEFAULT_SEGMENT_SIZE; + void* mem = ::operator new(sizeof(segment) + payload); + segment* seg = static_cast(mem); + seg->prev = current_segment_; + seg->capacity = payload; + seg->used = 0; + current_segment_ = seg; + } + + void free_segments() { + while (current_segment_ != nullptr) { + segment* prev = current_segment_->prev; + ::operator delete(current_segment_); + current_segment_ = prev; + } + } + + static inline thread_local vthread_stack* current_ = nullptr; +}; + +} // namespace elio::coro diff --git a/include/elio/elio.hpp b/include/elio/elio.hpp index 86157a7..2496f07 100644 --- a/include/elio/elio.hpp +++ b/include/elio/elio.hpp @@ -18,6 +18,7 @@ #include "coro/awaitable_base.hpp" #include "coro/frame.hpp" #include "coro/cancel_token.hpp" +#include "coro/vthread_stack.hpp" // Runtime scheduler #include "runtime/scheduler.hpp" @@ -26,6 +27,9 @@ #include "runtime/async_main.hpp" #include "runtime/affinity.hpp" #include "runtime/serve.hpp" +#include "runtime/spawn.hpp" +#include "runtime/blocking_pool.hpp" +#include "runtime/spawn_blocking.hpp" #include "runtime/autoscaler_config.hpp" #include "runtime/autoscaler_triggers.hpp" #include "runtime/autoscaler_actions.hpp" diff --git a/include/elio/http/http_server.hpp b/include/elio/http/http_server.hpp index 552a303..fe84ff6 100644 --- a/include/elio/http/http_server.hpp +++ b/include/elio/http/http_server.hpp @@ -270,8 +270,9 @@ class server { } // Spawn connection handler - auto handler = handle_connection(std::move(*stream_result)); - sched->spawn(handler.release()); + sched->go([this, s = std::move(*stream_result)]() mutable { + return 
handle_connection(std::move(s)); + }); } } @@ -306,8 +307,9 @@ class server { } // Create TLS stream and spawn handler - auto handler = handle_tls_connection(std::move(*stream_result), tls_ctx); - sched->spawn(handler.release()); + sched->go([this, s = std::move(*stream_result), &tls_ctx]() mutable { + return handle_tls_connection(std::move(s), tls_ctx); + }); } } diff --git a/include/elio/http/websocket_server.hpp b/include/elio/http/websocket_server.hpp index b0b88bd..997e102 100644 --- a/include/elio/http/websocket_server.hpp +++ b/include/elio/http/websocket_server.hpp @@ -455,8 +455,9 @@ class ws_server { } // Spawn connection handler - auto handler = handle_connection(std::move(*stream_result)); - sched->spawn(handler.release()); + sched->go([this, s = std::move(*stream_result)]() mutable { + return handle_connection(std::move(s)); + }); } } @@ -490,8 +491,9 @@ class ws_server { } // Spawn TLS connection handler - auto handler = handle_tls_connection(std::move(*stream_result), tls_ctx); - sched->spawn(handler.release()); + sched->go([this, s = std::move(*stream_result), &tls_ctx]() mutable { + return handle_tls_connection(std::move(s), tls_ctx); + }); } } diff --git a/include/elio/net/resolve.hpp b/include/elio/net/resolve.hpp index 57e9de8..1570856 100644 --- a/include/elio/net/resolve.hpp +++ b/include/elio/net/resolve.hpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include @@ -11,13 +11,10 @@ #include #include #include -#include -#include #include #include #include #include -#include #include #include @@ -186,30 +183,6 @@ inline resolve_options default_cached_resolve_options() { return opts; } -struct resolve_waiter_state { - std::vector results; - int error = 0; - runtime::scheduler* scheduler = nullptr; - std::coroutine_handle<> handle; - size_t saved_affinity = coro::NO_AFFINITY; - void* handle_address = nullptr; - - void restore_affinity() const noexcept { - if (!handle_address) { - return; - } - auto* promise = 
coro::get_promise_base(handle_address); - if (!promise) { - return; - } - if (saved_affinity == coro::NO_AFFINITY) { - promise->clear_affinity(); - } else { - promise->set_affinity(saved_affinity); - } - } -}; - inline bool try_parse_ipv4_literal(std::string_view host, uint16_t port, std::vector& out) { struct in_addr addr{}; @@ -249,124 +222,92 @@ inline bool try_parse_ipv6_literal(std::string_view host, uint16_t port, return true; } -class resolve_all_awaitable { -public: - resolve_all_awaitable(std::string_view host, uint16_t port, resolve_options options) - : host_(host) - , key_{std::string(host), port} - , options_(options) - , state_(std::make_shared()) { - if (host.empty() || host == "::" || host == "0.0.0.0") { - state_->results.push_back(socket_address(host, port)); - return; - } +inline coro::task> resolve_all( + std::string_view host, + uint16_t port, + resolve_options options = {}) { - if (host.find(':') != std::string_view::npos) { - try_parse_ipv6_literal(host, port, state_->results); - return; - } + std::vector results; - try_parse_ipv4_literal(host, port, state_->results); + // Handle empty host or wildcard addresses + if (host.empty() || host == "::" || host == "0.0.0.0") { + results.push_back(socket_address(host, port)); + co_return results; } - bool await_ready() const noexcept { - if (!state_->results.empty()) { - return true; + // Try parsing as IPv6 literal + if (host.find(':') != std::string_view::npos) { + if (try_parse_ipv6_literal(host, port, results)) { + co_return results; } + } - if (!options_.use_cache) { - return false; - } + // Try parsing as IPv4 literal + if (try_parse_ipv4_literal(host, port, results)) { + co_return results; + } - resolve_cache* cache = options_.cache ? options_.cache : &default_resolve_cache(); - if (cache->try_get(key_, state_->results)) { - return true; + // Check cache if enabled + resolve_cache_key key{std::string(host), port}; + if (options.use_cache) { + resolve_cache* cache = options.cache ? 
options.cache : &default_resolve_cache(); + if (cache->try_get(key, results)) { + co_return results; } - cache->record_miss(); - return false; } - template - bool await_suspend(std::coroutine_handle awaiter) { - state_->handle = awaiter; - state_->scheduler = runtime::scheduler::current(); - state_->handle_address = awaiter.address(); - - if constexpr (std::is_base_of_v) { - state_->saved_affinity = awaiter.promise().affinity(); - auto* worker = runtime::worker_thread::current(); - if (worker) { - awaiter.promise().set_affinity(worker->worker_id()); - } - } - - auto host = host_; - auto key = key_; - auto options = options_; - auto state = state_; - - std::thread([host = std::move(host), key = std::move(key), options, state]() mutable { - struct addrinfo hints{}; - struct addrinfo* result = nullptr; - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - - std::string service = std::to_string(key.port); - int rc = getaddrinfo(host.c_str(), service.c_str(), &hints, &result); - if (rc == 0 && result) { - for (auto* current = result; current != nullptr; current = current->ai_next) { - if (current->ai_family == AF_INET6) { - auto* sa = reinterpret_cast(current->ai_addr); - state->results.push_back(socket_address(ipv6_address(*sa))); - } else if (current->ai_family == AF_INET) { - auto* sa = reinterpret_cast(current->ai_addr); - state->results.push_back(socket_address(ipv4_address(*sa))); - } - } - freeaddrinfo(result); - } - - if (state->results.empty()) { - state->error = (rc == EAI_SYSTEM) ? errno : EHOSTUNREACH; - if (options.use_cache) { - resolve_cache* cache = options.cache ? 
options.cache : &default_resolve_cache(); - cache->store(key, {}, options.negative_ttl); + // Perform blocking DNS resolution via spawn_blocking + std::string host_str(host); + auto dns_result = co_await elio::spawn_blocking([host_str, port]() { + struct resolve_result { + std::vector addresses; + int error = 0; + }; + + resolve_result result; + struct addrinfo hints{}; + struct addrinfo* ai_result = nullptr; + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + + std::string service = std::to_string(port); + int rc = getaddrinfo(host_str.c_str(), service.c_str(), &hints, &ai_result); + if (rc == 0 && ai_result) { + for (auto* current = ai_result; current != nullptr; current = current->ai_next) { + if (current->ai_family == AF_INET6) { + auto* sa = reinterpret_cast(current->ai_addr); + result.addresses.push_back(socket_address(ipv6_address(*sa))); + } else if (current->ai_family == AF_INET) { + auto* sa = reinterpret_cast(current->ai_addr); + result.addresses.push_back(socket_address(ipv4_address(*sa))); } - } else if (options.use_cache) { - resolve_cache* cache = options.cache ? options.cache : &default_resolve_cache(); - cache->store(key, state->results, options.positive_ttl); - } - - if (state->scheduler && state->scheduler->is_running()) { - state->scheduler->spawn(state->handle); - } else { - runtime::schedule_handle(state->handle); } - }).detach(); - - return true; - } + freeaddrinfo(ai_result); + } - std::vector await_resume() { - state_->restore_affinity(); - if (state_->results.empty()) { - errno = state_->error; + if (result.addresses.empty()) { + result.error = (rc == EAI_SYSTEM) ? errno : EHOSTUNREACH; + } + return result; + }); + + // Update cache based on result + if (options.use_cache) { + resolve_cache* cache = options.cache ? 
options.cache : &default_resolve_cache(); + if (dns_result.addresses.empty()) { + cache->store(key, {}, options.negative_ttl); + } else { + cache->store(key, dns_result.addresses, options.positive_ttl); } - return state_->results; } -private: - std::string host_; - resolve_cache_key key_; - resolve_options options_; - std::shared_ptr state_; -}; + // Set errno on failure + if (dns_result.addresses.empty()) { + errno = dns_result.error; + } -inline auto resolve_all(std::string_view host, - uint16_t port, - resolve_options options = {}) { - return resolve_all_awaitable(host, port, options); + co_return dns_result.addresses; } inline coro::task> resolve_hostname(std::string_view host, diff --git a/include/elio/rpc/rpc_client.hpp b/include/elio/rpc/rpc_client.hpp index e0aabc6..97635de 100644 --- a/include/elio/rpc/rpc_client.hpp +++ b/include/elio/rpc/rpc_client.hpp @@ -296,29 +296,26 @@ class rpc_client : public std::enable_shared_from_this> { } // Wait for response with timeout - // Start a timeout coroutine - auto self = this->shared_from_this(); - auto timeout_task = [](std::chrono::milliseconds ms, - std::shared_ptr pending, - coro::cancel_token tok) - -> coro::task - { - auto result = co_await time::sleep_for(ms, tok); - - // Only timeout if sleep completed normally (not cancelled) - if (result == coro::cancel_result::completed && pending->try_complete()) { - pending->timed_out = true; - pending->error = rpc_error::timeout; - pending->completion_event.set(); - } - }; - // Spawn timeout watcher auto* sched = runtime::scheduler::current(); if (sched) { - auto task = timeout_task( - std::chrono::duration_cast(timeout), pending, token); - sched->spawn(task.release()); + sched->go([ms = std::chrono::duration_cast(timeout), + p = pending, tok = token]() mutable { + return [](std::chrono::milliseconds ms, + std::shared_ptr pending, + coro::cancel_token tok) + -> coro::task + { + auto result = co_await time::sleep_for(ms, tok); + + // Only timeout if sleep completed 
normally (not cancelled) + if (result == coro::cancel_result::completed && pending->try_complete()) { + pending->timed_out = true; + pending->error = rpc_error::timeout; + pending->completion_event.set(); + } + }(ms, p, std::move(tok)); + }); } // Wait for completion (either response, timeout, or cancellation) @@ -405,22 +402,18 @@ class rpc_client : public std::enable_shared_from_this> { } // Setup timeout - auto self = this->shared_from_this(); - auto timeout_task = [](std::chrono::milliseconds ms, - std::shared_ptr pending) - -> coro::task - { - co_await time::sleep_for(ms); - if (pending->try_complete()) { - pending->timed_out = true; - pending->completion_event.set(); - } - }; - auto* sched = runtime::scheduler::current(); if (sched) { - auto task = timeout_task(timeout, pending); - sched->spawn(task.release()); + sched->go([ms = timeout, p = pending]() { + return [](std::chrono::milliseconds ms, std::shared_ptr p) + -> coro::task { + co_await time::sleep_for(ms); + if (p->try_complete()) { + p->timed_out = true; + p->completion_event.set(); + } + }(ms, p); + }); } // Wait for pong @@ -448,8 +441,7 @@ class rpc_client : public std::enable_shared_from_this> { auto self = this->shared_from_this(); auto* sched = runtime::scheduler::current(); if (sched) { - auto task = receive_loop(self); - sched->spawn(task.release()); + sched->go([s = self]() { return receive_loop(s); }); } } diff --git a/include/elio/rpc/rpc_server.hpp b/include/elio/rpc/rpc_server.hpp index 0f3282e..dc886ef 100644 --- a/include/elio/rpc/rpc_server.hpp +++ b/include/elio/rpc/rpc_server.hpp @@ -445,8 +445,7 @@ class rpc_server { // Spawn session handler auto* sched = runtime::scheduler::current(); if (sched) { - auto task = run_session(session); - sched->spawn(task.release()); + sched->go([this, s = session]() { return run_session(s); }); } } @@ -482,8 +481,7 @@ class rpc_server { // Spawn session handler auto* sched = runtime::scheduler::current(); if (sched) { - auto task = 
run_session(session); - sched->spawn(task.release()); + sched->go([this, s = session]() { return run_session(s); }); } } diff --git a/include/elio/runtime/async_main.hpp b/include/elio/runtime/async_main.hpp index d41b232..c88ba05 100644 --- a/include/elio/runtime/async_main.hpp +++ b/include/elio/runtime/async_main.hpp @@ -2,8 +2,10 @@ #include "scheduler.hpp" #include +#include #include #include +#include #include #include #include @@ -17,10 +19,15 @@ namespace elio::runtime { struct run_config { /// Number of worker threads (0 = hardware concurrency) size_t num_threads = 0; + /// Blocking thread pool size (0 = fallback to std::thread per task) + size_t blocking_threads = 4; }; namespace detail { +/// Type alias using definitions from scheduler.hpp +template using task_value_t = typename task_value::type; + /// Completion signal for async_main template struct completion_signal { @@ -84,15 +91,14 @@ struct completion_signal { }; /// Wrapper task that signals completion -template -coro::task completion_wrapper(coro::task inner, completion_signal* signal) { +template +coro::task completion_wrapper(F f, completion_signal* signal) { try { if constexpr (std::is_void_v) { - co_await std::move(inner); + co_await std::invoke(std::move(f)); signal->set_result(); } else { - T result = co_await std::move(inner); - signal->set_result(std::move(result)); + signal->set_result(co_await std::invoke(std::move(f))); } } catch (...) { signal->set_exception(std::current_exception()); @@ -101,13 +107,13 @@ coro::task completion_wrapper(coro::task inner, completion_signal* s } // namespace detail -/// Run a coroutine task to completion and return its result +/// Run a callable that returns a coroutine task to completion /// /// This function creates a scheduler, runs the given task, waits for /// completion, and returns the result. It's the recommended way to /// run async code from a synchronous context (like main()). 
/// -/// @param task The coroutine task to run +/// @param f The callable that returns a coroutine task /// @param config Configuration (threads) /// @return The result of the task /// @@ -119,27 +125,41 @@ coro::task completion_wrapper(coro::task inner, completion_signal* s /// } /// /// int main() { -/// return elio::run(async_main()); +/// return elio::run(async_main); /// } /// @endcode -template -T run(coro::task task, const run_config& config = {}) { + +/// Overload 1: no-arg callable + optional config +template + requires (std::invocable && detail::is_task_v>) +auto run(F&& f, const run_config& config = {}) + -> detail::task_value_t> +{ + using T = detail::task_value_t>; detail::completion_signal signal; - + size_t threads = config.num_threads; if (threads == 0) { threads = std::thread::hardware_concurrency(); if (threads == 0) threads = 1; } - - scheduler sched(threads); + + scheduler sched(threads, wait_strategy::blocking(), + config.blocking_threads); sched.start(); - - // Create wrapper that signals completion - auto wrapper = detail::completion_wrapper(std::move(task), &signal); - sched.spawn(wrapper.release()); - - // Wait for completion + + // Wrap user function + auto bound = [&f]() { return std::invoke(std::forward(f)); }; + + { + coro::detail::heap_alloc_guard guard; + auto wrapper = detail::completion_wrapper(std::move(bound), &signal); + auto handle = coro::detail::task_access::release(wrapper); + auto* root_vstack = new coro::vthread_stack(); + handle.promise().set_vstack_owner(root_vstack); + sched.spawn(handle); + } + if constexpr (std::is_void_v) { signal.wait(); sched.shutdown(); @@ -150,10 +170,26 @@ T run(coro::task task, const run_config& config = {}) { } } -/// Run a coroutine task with specified number of threads -template -T run(coro::task task, size_t num_threads) { - return run(std::move(task), run_config{.num_threads = num_threads}); +/// Overload 2: (func, args...) 
with config first +template + requires (sizeof...(Args) > 0 && std::invocable && detail::is_task_v>) +auto run(const run_config& config, F&& f, Args&&... args) + -> detail::task_value_t> +{ + auto bound = [f = std::forward(f), + ...args = std::forward(args)]() mutable { + return std::invoke(std::move(f), std::move(args)...); + }; + return run(std::move(bound), config); +} + +/// Overload 3: (func, args...) without config +template + requires (!std::is_same_v, run_config> && std::invocable && detail::is_task_v>) +auto run(F&& f, Arg0&& arg0, Args&&... args) + -> detail::task_value_t> +{ + return run(run_config{}, std::forward(f), std::forward(arg0), std::forward(args)...); } } // namespace elio::runtime @@ -188,7 +224,7 @@ using runtime::run_config; /// @endcode #define ELIO_ASYNC_MAIN(async_main_func) \ int main(int argc, char* argv[]) { \ - return elio::run(async_main_func(argc, argv)); \ + return elio::run(async_main_func, argc, argv); \ } /// Macro for async_main that returns void (exits with 0) @@ -197,7 +233,7 @@ using runtime::run_config; /// coro::task async_main(int argc, char* argv[]) #define ELIO_ASYNC_MAIN_VOID(async_main_func) \ int main(int argc, char* argv[]) { \ - elio::run(async_main_func(argc, argv)); \ + elio::run(async_main_func, argc, argv); \ return 0; \ } @@ -207,7 +243,7 @@ using runtime::run_config; /// coro::task async_main() #define ELIO_ASYNC_MAIN_NOARGS(async_main_func) \ int main() { \ - return elio::run(async_main_func()); \ + return elio::run(async_main_func); \ } /// Macro for async_main without arguments, returning void @@ -216,6 +252,6 @@ using runtime::run_config; /// coro::task async_main() #define ELIO_ASYNC_MAIN_VOID_NOARGS(async_main_func) \ int main() { \ - elio::run(async_main_func()); \ + elio::run(async_main_func); \ return 0; \ } diff --git a/include/elio/runtime/blocking_pool.hpp b/include/elio/runtime/blocking_pool.hpp new file mode 100644 index 0000000..2a5bc6c --- /dev/null +++ b/include/elio/runtime/blocking_pool.hpp 
@@ -0,0 +1,89 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace elio::runtime { + +// A simple thread pool for executing blocking tasks. +// Supports both pooled mode (fixed threads) and non-pooled mode (spawn per task). +class blocking_pool { +public: + // num_threads: pool size. 0 means no pooling, each submit spawns a new thread. + explicit blocking_pool(size_t num_threads) + : num_threads_(num_threads) { + threads_.reserve(num_threads); + for (size_t i = 0; i < num_threads; ++i) { + threads_.emplace_back([this] { worker_loop(); }); + } + } + + ~blocking_pool() { + shutdown(); + } + + blocking_pool(const blocking_pool&) = delete; + blocking_pool& operator=(const blocking_pool&) = delete; + + // Submit a task for execution. Thread-safe. + // If num_threads == 0, spawns a detached thread directly. + // Otherwise enqueues and wakes one worker. + void submit(std::function task) { + if (num_threads_ == 0) { + std::thread(std::move(task)).detach(); + return; + } + { + std::lock_guard lock(mutex_); + queue_.push_back(std::move(task)); + } + cv_.notify_one(); + } + + // Graceful shutdown: signals stop, wakes all workers, joins threads. + // Note: each worker that wakes with a non-empty queue may still run one + // already-dequeued task before its loop exits; any remaining queued tasks + // are discarded.
+ void shutdown() { + if (stopped_.exchange(true)) return; // idempotent + cv_.notify_all(); + for (auto& t : threads_) { + if (t.joinable()) t.join(); + } + } + +private: + void worker_loop() { + while (!stopped_.load(std::memory_order_relaxed)) { + std::function task; + + // Block until task available or stopped + std::unique_lock lock(mutex_); + cv_.wait(lock, [this] { + return stopped_.load(std::memory_order_relaxed) || !queue_.empty(); + }); + if (stopped_.load(std::memory_order_relaxed) && queue_.empty()) return; + if (!queue_.empty()) { + task = std::move(queue_.front()); + queue_.pop_front(); + } + + lock.unlock(); + if (task) { + task(); + } + } + } + + std::vector threads_; + std::deque> queue_; + std::mutex mutex_; + std::condition_variable cv_; + std::atomic stopped_{false}; + size_t num_threads_; +}; + +} // namespace elio::runtime diff --git a/include/elio/runtime/scheduler.hpp b/include/elio/runtime/scheduler.hpp index 9208c7a..dc84304 100644 --- a/include/elio/runtime/scheduler.hpp +++ b/include/elio/runtime/scheduler.hpp @@ -1,17 +1,32 @@ #pragma once #include "worker_thread.hpp" +#include "blocking_pool.hpp" #include #include +#include +#include #include #include #include #include #include #include +#include namespace elio::runtime { +namespace detail { + // Type traits for task + template struct task_value; + template struct task_value> { using type = T; }; + template using task_value_t = typename task_value::type; + + template struct is_task : std::false_type {}; + template struct is_task> : std::true_type {}; + template inline constexpr bool is_task_v = is_task::value; +} // namespace detail + /// Work-stealing scheduler for coroutines class scheduler { friend class worker_thread; // Allow workers to set current_scheduler_ @@ -20,12 +35,14 @@ class scheduler { static constexpr size_t MAX_THREADS = 256; explicit scheduler(size_t num_threads = std::thread::hardware_concurrency(), - wait_strategy strategy = wait_strategy::blocking()) + 
wait_strategy strategy = wait_strategy::blocking(), + size_t blocking_threads = 4) : num_threads_(num_threads == 0 ? 1 : num_threads) , running_(false) , paused_(false) , spawn_index_(0) - , wait_strategy_(strategy) { + , wait_strategy_(strategy) + , blocking_pool_(std::make_unique(blocking_threads)) { size_t n = num_threads_.load(std::memory_order_relaxed); // Pre-reserve to MAX_THREADS to prevent reallocation during runtime @@ -66,7 +83,12 @@ class scheduler { return; } - // First stop all workers (sets running_=false and joins threads) + // First shutdown blocking pool (before stopping workers) + if (blocking_pool_) { + blocking_pool_->shutdown(); + } + + // Then stop all workers (sets running_=false and joins threads) for (auto& worker : workers_) { worker->stop(); } @@ -91,6 +113,12 @@ class scheduler { handle.destroy(); return; } + // Detach from current thread's frame chain before spawning to another thread + // to avoid use-after-free when this thread creates another coroutine. + auto* promise = coro::get_promise_base(handle.address()); + if (promise) { + promise->detach_from_parent(); + } do_spawn(handle); } @@ -102,6 +130,67 @@ class scheduler { spawn(std::forward(t).release()); } + /// High-level API: fire-and-forget, spawn to this scheduler + /// @param f Callable that returns a task + /// @param args Arguments to forward to the callable + template + requires (std::invocable && detail::is_task_v>) + void go(F&& f, Args&&... args) { + coro::detail::heap_alloc_guard guard; + auto t = std::invoke(std::forward(f), std::forward(args)...); + auto handle = coro::detail::task_access::release(t); + handle.promise().detached_ = true; + auto* vstack = new coro::vthread_stack(); + handle.promise().set_vstack_owner(vstack); + // Detach from current thread's frame chain before spawning to another thread + // to avoid use-after-free when this thread creates another coroutine. 
+ handle.promise().detach_from_parent(); + do_spawn(handle); + } + + /// High-level API: fire-and-forget, spawn to specific worker + /// @param worker_id Target worker index + /// @param f Callable that returns a task + /// @param args Arguments to forward to the callable + template + requires (std::invocable && detail::is_task_v>) + void go_to(size_t worker_id, F&& f, Args&&... args) { + coro::detail::heap_alloc_guard guard; + auto t = std::invoke(std::forward(f), std::forward(args)...); + auto handle = coro::detail::task_access::release(t); + handle.promise().detached_ = true; + auto* vstack = new coro::vthread_stack(); + handle.promise().set_vstack_owner(vstack); + // Detach from current thread's frame chain before spawning to another thread + // to avoid use-after-free when this thread creates another coroutine. + handle.promise().detach_from_parent(); + spawn_to(worker_id, handle); + } + + /// High-level API: spawn + join, spawn to this scheduler + /// @param f Callable that returns a task + /// @param args Arguments to forward to the callable + /// @return join_handle that can be awaited to get the result + template + requires (std::invocable && detail::is_task_v>) + auto go_joinable(F&& f, Args&&... args) + -> coro::join_handle>> + { + using T = detail::task_value_t>; + coro::detail::heap_alloc_guard guard; + auto t = std::invoke(std::forward(f), std::forward(args)...); + auto handle = coro::detail::task_access::release(t); + auto state = std::make_shared>(); + handle.promise().join_state_ = state; + auto* vstack = new coro::vthread_stack(); + handle.promise().set_vstack_owner(vstack); + // Detach from current thread's frame chain before spawning to another thread + // to avoid use-after-free when this thread creates another coroutine. 
+ handle.promise().detach_from_parent(); + do_spawn(handle); + return coro::join_handle(std::move(state)); + } + void spawn_to(size_t worker_id, std::coroutine_handle<> handle) { if (!handle) [[unlikely]] return; if (!running_.load(std::memory_order_relaxed)) [[unlikely]] { @@ -109,6 +198,13 @@ class scheduler { return; } + // Detach from current thread's frame chain before spawning to another thread + // to avoid use-after-free when this thread creates another coroutine. + auto* promise = coro::get_promise_base(handle.address()); + if (promise) { + promise->detach_from_parent(); + } + size_t n = num_threads_.load(std::memory_order_acquire); workers_[worker_id % n]->schedule(handle); } @@ -214,6 +310,11 @@ class scheduler { return wait_strategy_; } + /// Get the blocking pool for spawn_blocking operations + [[nodiscard]] blocking_pool* get_blocking_pool() noexcept { + return blocking_pool_.get(); + } + private: void do_spawn(std::coroutine_handle<> handle) { // Release fence ensures all writes to the coroutine frame (including @@ -276,6 +377,9 @@ class scheduler { alignas(64) mutable std::mutex workers_mutex_; wait_strategy wait_strategy_; + // Blocking pool for spawn_blocking operations + std::unique_ptr blocking_pool_; + static inline thread_local scheduler* current_scheduler_ = nullptr; }; @@ -313,14 +417,15 @@ inline void worker_thread::stop() { if (thread_.joinable()) thread_.join(); } -/// Drain and destroy remaining tasks - only call after ALL workers have stopped +/// Final cleanup for any orphaned tasks - only call after ALL workers have stopped. +/// This is a safety net for edge cases where tasks might still exist after drain phase. 
inline void worker_thread::drain_remaining_tasks() noexcept { // First drain inbox to deque void* addr; while ((addr = inbox_->pop()) != nullptr) { queue_->push(addr); } - // Then destroy all tasks in the deque + // Destroy any remaining tasks (should be rare after drain phase in run()) while ((addr = queue_->pop()) != nullptr) { auto handle = std::coroutine_handle<>::from_address(addr); if (handle) { @@ -377,12 +482,25 @@ inline void worker_thread::run() { } } + // Drain phase: after running_ becomes false, continue executing all + // remaining tasks until both local queue and inbox are empty. + // This ensures shutdown() returns only when all submitted tasks have + // fully completed (including coroutine cleanup and lambda destruction). + while (true) { + drain_inbox(); + void* addr = queue_->pop_local(false); // No concurrent stealers, workers are stopping + if (!addr) break; + + auto handle = std::coroutine_handle<>::from_address(addr); + if (handle && !handle.done()) { + needs_sync_ = true; // Conservatively ensure memory visibility for drained tasks + run_task(handle); + } + } + // Clear the references when done scheduler::current_scheduler_ = nullptr; current_worker_ = nullptr; - - // Note: Cleanup of remaining tasks is handled in stop() AFTER join - // to avoid race conditions with work stealing } inline std::coroutine_handle<> worker_thread::get_next_task() noexcept { @@ -432,8 +550,20 @@ inline void worker_thread::run_task(std::coroutine_handle<> handle) noexcept { } if (!handle || handle.done()) [[unlikely]] return; - + + // Context switch: set vstack and current_frame before resume, restore after + auto* promise = coro::get_promise_base(handle.address()); + auto* prev_vstack = coro::vthread_stack::current(); + auto* prev_frame = coro::promise_base::current_frame(); + if (promise) { + coro::vthread_stack::set_current(promise->vstack()); + coro::promise_base::set_current_frame(promise); + } + handle.resume(); + + 
coro::vthread_stack::set_current(prev_vstack); + coro::promise_base::set_current_frame(prev_frame); tasks_executed_.fetch_add(1, std::memory_order_relaxed); update_last_task_time(); diff --git a/include/elio/runtime/serve.hpp b/include/elio/runtime/serve.hpp index e91476c..b170d4e 100644 --- a/include/elio/runtime/serve.hpp +++ b/include/elio/runtime/serve.hpp @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -75,9 +76,9 @@ inline coro::task wait_shutdown_signal( /// task to complete. /// /// @tparam Server Server type (must have stop() method) -/// @tparam ListenTask The awaitable returned by server.listen() +/// @tparam ListenFunc Callable that returns a listen coroutine task /// @param server Reference to the server (used to call stop()) -/// @param listen_task The listen coroutine task +/// @param listen_func Function that returns the listen coroutine task /// @param signals Signals to wait for shutdown (defaults to SIGINT, SIGTERM) /// /// Example: @@ -87,23 +88,31 @@ inline coro::task wait_shutdown_signal( /// r.get("/", handler); /// /// http::server srv(r); -/// co_await serve(srv, srv.listen(net::ipv4_address(8080))); +/// co_await serve(srv, [&]() { return srv.listen(net::ipv4_address(8080)); }); /// /// co_return 0; /// } /// /// ELIO_ASYNC_MAIN(async_main) /// @endcode -template -coro::task serve(Server& server, ListenTask listen_task, +template + requires std::invocable +coro::task serve(Server& server, ListenFunc listen_func, std::initializer_list signals = default_shutdown_signals) { // Set up signal handling signal::signal_set sigs(signals); signal::signal_fd sigfd(sigs); - // Spawn the listen task - auto listen_handle = std::move(listen_task).spawn(); + // Get the scheduler + auto* sched = runtime::scheduler::current(); + if (!sched) { + ELIO_LOG_ERROR("serve() must be called within a scheduler context"); + co_return; + } + + // Spawn the listen task as a joinable coroutine + auto listen_handle = 
sched->go_joinable(std::move(listen_func)); // Wait for shutdown signal auto info = co_await sigfd.wait(); @@ -153,9 +162,9 @@ coro::task serve(Server& server, ListenTask listen_task, /// When signal is received, stops all servers. /// /// @tparam Servers Variadic server types -/// @tparam ListenTasks Variadic listen task types +/// @tparam ListenFuncs Variadic listen function types /// @param servers Tuple of server references -/// @param listen_tasks Tuple of listen tasks +/// @param listen_funcs Tuple of listen functions (each returning a task) /// @param signals Signals to wait for shutdown /// /// Example: @@ -167,26 +176,33 @@ coro::task serve(Server& server, ListenTask listen_task, /// co_await serve_all( /// std::tie(http_srv, ws_srv), /// std::make_tuple( -/// http_srv.listen(addr1), -/// ws_srv.listen(addr2) +/// [&]() { return http_srv.listen(addr1); }, +/// [&]() { return ws_srv.listen(addr2); } /// ) /// ); /// } /// @endcode -template +template coro::task serve_all(std::tuple servers, - std::tuple listen_tasks, + std::tuple listen_funcs, std::initializer_list signals = default_shutdown_signals) { // Set up signal handling signal::signal_set sigs(signals); signal::signal_fd sigfd(sigs); - // Spawn all listen tasks - auto spawn_tasks = [](auto&&... tasks) { - return std::make_tuple(std::move(tasks).spawn()...); + // Get the scheduler + auto* sched = runtime::scheduler::current(); + if (!sched) { + ELIO_LOG_ERROR("serve_all() must be called within a scheduler context"); + co_return; + } + + // Spawn all listen tasks as joinable coroutines + auto spawn_tasks = [sched](auto&&... 
funcs) { + return std::make_tuple(sched->go_joinable(std::move(funcs))...); }; - auto handles = std::apply(spawn_tasks, std::move(listen_tasks)); + auto handles = std::apply(spawn_tasks, std::move(listen_funcs)); // Wait for shutdown signal auto info = co_await sigfd.wait(); diff --git a/include/elio/runtime/spawn.hpp b/include/elio/runtime/spawn.hpp new file mode 100644 index 0000000..a60bcc3 --- /dev/null +++ b/include/elio/runtime/spawn.hpp @@ -0,0 +1,95 @@ +#pragma once + +#include +#include +#include +#include "../coro/task.hpp" +#include "../coro/vthread_stack.hpp" +#include "scheduler.hpp" + +namespace elio { + +namespace detail { + // Type traits for task + template struct task_value; + template struct task_value> { using type = T; }; + template using task_value_t = typename task_value::type; + + template struct is_task : std::false_type {}; + template struct is_task> : std::true_type {}; + template inline constexpr bool is_task_v = is_task::value; +} // namespace detail + +/// Fire-and-forget: spawn a coroutine without awaiting its result. +/// The coroutine runs independently and self-destructs on completion. +/// +/// @tparam F Callable type that returns a task +/// @tparam Args Argument types +/// @param f Callable to invoke (must return a task) +/// @param args Arguments to forward to the callable +/// +/// Example: +/// elio::go(async_work); +/// elio::go(async_work_with_args, 1, 2, 3); +/// elio::go([&]() -> coro::task { co_await some_async_op(); }); +template + requires (std::invocable && detail::is_task_v>) +void go(F&& f, Args&&... 
args) { + coro::detail::heap_alloc_guard guard; + auto t = std::invoke(std::forward(f), std::forward(args)...); + + auto handle = coro::detail::task_access::release(t); + handle.promise().detached_ = true; + auto* vstack = new coro::vthread_stack(); + handle.promise().set_vstack_owner(vstack); + // Detach from current thread's frame chain before spawning to another thread + // to avoid use-after-free when this thread creates another coroutine. + handle.promise().detach_from_parent(); + runtime::schedule_handle(handle); +} + +/// Spawn a coroutine and return a join_handle to await its result. +/// The coroutine runs concurrently and the result can be retrieved via co_await. +/// +/// @tparam F Callable type that returns a task +/// @tparam Args Argument types +/// @param f Callable to invoke (must return a task) +/// @param args Arguments to forward to the callable +/// @return join_handle that can be awaited to get the result +/// +/// Example: +/// auto handle = elio::spawn(compute_async, input); +/// auto result = co_await handle; +template + requires (std::invocable && detail::is_task_v>) +auto spawn(F&& f, Args&&... args) + -> coro::join_handle>> +{ + using T = detail::task_value_t>; + coro::detail::heap_alloc_guard guard; + auto t = std::invoke(std::forward(f), std::forward(args)...); + + auto handle = coro::detail::task_access::release(t); + auto state = std::make_shared>(); + handle.promise().join_state_ = state; + auto* vstack = new coro::vthread_stack(); + handle.promise().set_vstack_owner(vstack); + // Detach from current thread's frame chain before spawning to another thread + // to avoid use-after-free when this thread creates another coroutine. 
+ handle.promise().detach_from_parent(); + runtime::schedule_handle(handle); + return coro::join_handle(std::move(state)); +} + +} // namespace elio + +// Macros — syntactic sugar for inline lambda coroutines +// These capture by reference and wrap the expression in a lambda returning task + +/// Fire-and-forget macro for inline coroutine expressions +/// Usage: ELIO_GO(some_async_operation()) +#define ELIO_GO(...) elio::go([&]() { return __VA_ARGS__; }) + +/// Spawn macro for inline coroutine expressions, returns join_handle +/// Usage: auto h = ELIO_SPAWN(compute_async()); auto result = co_await h; +#define ELIO_SPAWN(...) elio::spawn([&]() { return __VA_ARGS__; }) diff --git a/include/elio/runtime/spawn_blocking.hpp b/include/elio/runtime/spawn_blocking.hpp new file mode 100644 index 0000000..92bde1d --- /dev/null +++ b/include/elio/runtime/spawn_blocking.hpp @@ -0,0 +1,107 @@ +#pragma once + +#include "scheduler.hpp" +#include "blocking_pool.hpp" +#include +#include +#include +#include +#include + +namespace elio { +namespace detail { + +// State for non-void results +template +struct blocking_state { + std::optional result; + std::exception_ptr exception; +}; + +// State for void results (avoid std::optional) +template<> +struct blocking_state { + bool completed = false; + std::exception_ptr exception; +}; + +template +class blocking_awaitable { +public: + explicit blocking_awaitable(F&& f) : func_(std::forward(f)) {} + blocking_awaitable(blocking_awaitable&&) = default; + blocking_awaitable(const blocking_awaitable&) = delete; + blocking_awaitable& operator=(const blocking_awaitable&) = delete; + + bool await_ready() const noexcept { return false; } + + void await_suspend(std::coroutine_handle<> caller) { + auto* state = &state_; + // Capture scheduler pointer to ensure we resume on the right scheduler, + // not directly on the blocking pool thread. 
+ auto* sched = runtime::get_current_scheduler(); + auto work = [state, caller, sched, f = std::move(func_)]() mutable { + try { + if constexpr (std::is_void_v) { + f(); + state->completed = true; + } else { + state->result.emplace(f()); + } + } catch (...) { + state->exception = std::current_exception(); + } + // Resume caller via scheduler to ensure it runs on the right thread. + // If no scheduler, fall back to direct resume (single-threaded case). + if (sched && sched->is_running()) { + sched->spawn(caller); + } else if (caller && !caller.done()) { + caller.resume(); + } + }; + + // Try blocking pool first, fallback to detached thread + if (sched && sched->is_running()) { + if (auto* pool = sched->get_blocking_pool()) { + pool->submit(std::move(work)); + return; + } + } + std::thread(std::move(work)).detach(); + } + + T await_resume() { + if (state_.exception) { + std::rethrow_exception(state_.exception); + } + if constexpr (std::is_void_v) { + return; + } else { + return std::move(*state_.result); + } + } + +private: + F func_; + blocking_state state_; +}; + +} // namespace detail + +/// Spawn a blocking operation on a dedicated thread pool. +/// The calling coroutine suspends until the operation completes. +/// Any exception thrown by f() is propagated to the awaiting coroutine. +/// +/// Example: +/// int fd = co_await elio::spawn_blocking([&] { +/// return ::open("/path/to/file", O_RDONLY); +/// }); +template +auto spawn_blocking(F&& f) { + using R = std::invoke_result_t>; + static_assert(!std::is_reference_v, + "spawn_blocking does not support callables returning references"); + return detail::blocking_awaitable>(std::forward(f)); +} + +} // namespace elio diff --git a/include/elio/sync/primitives.hpp b/include/elio/sync/primitives.hpp index d65f6ac..f12a1f6 100644 --- a/include/elio/sync/primitives.hpp +++ b/include/elio/sync/primitives.hpp @@ -52,7 +52,12 @@ class mutex { /// mutex's intrusive waiter list. 
class lock_awaitable { public: - explicit lock_awaitable(mutex& m) noexcept : mutex_(m) {} + explicit lock_awaitable(mutex& m) noexcept : mutex_(m) { + // Use release stores to ensure writes are visible to other threads + // This also helps TSAN understand the synchronization + next_.store(nullptr, std::memory_order_release); + handle_.store(nullptr, std::memory_order_release); + } bool await_ready() const noexcept { return mutex_.try_lock(); @@ -63,7 +68,7 @@ class mutex { /// true (suspend). Loops until one of these two outcomes is achieved /// via lock-free CAS. bool await_suspend(std::coroutine_handle<> h) noexcept { - handle_ = h; + handle_.store(h.address(), std::memory_order_relaxed); void* old_state = mutex_.state_.load(std::memory_order_acquire); while (true) { if (old_state == nullptr) { @@ -77,9 +82,10 @@ class mutex { // CAS failed, old_state refreshed — retry } else { // Locked — push this awaitable onto the LIFO stack - next_ = (old_state == mutex_.locked_no_waiters()) + next_.store((old_state == mutex_.locked_no_waiters()) ? nullptr - : static_cast(old_state); + : static_cast(old_state), + std::memory_order_relaxed); if (mutex_.state_.compare_exchange_weak( old_state, this, std::memory_order_release, @@ -96,8 +102,8 @@ class mutex { private: friend class mutex; mutex& mutex_; - lock_awaitable* next_{nullptr}; // intrusive LIFO linkage - std::coroutine_handle<> handle_; // handle to resume on unlock + std::atomic next_; // intrusive LIFO linkage + std::atomic handle_; // handle to resume on unlock }; /// Acquire the mutex @@ -130,13 +136,15 @@ class mutex { // Pop head waiter and transfer lock ownership to it (LIFO) auto* head = static_cast(state); - void* next_state = (head->next_ == nullptr) + auto* next = head->next_.load(std::memory_order_acquire); + void* next_state = (next == nullptr) ? 
locked_no_waiters() - : static_cast(head->next_); + : static_cast(next); state_.store(next_state, std::memory_order_release); // Schedule the waiter — it now holds the lock - runtime::schedule_handle(head->handle_); + auto handle_addr = head->handle_.load(std::memory_order_acquire); + runtime::schedule_handle(std::coroutine_handle<>::from_address(handle_addr)); } /// Check if mutex is currently locked diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9922f40..a498df5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,6 +10,7 @@ set(TEST_SOURCES test_main.cpp unit/test_logger.cpp unit/test_virtual_stack.cpp + unit/test_vthread_stack.cpp unit/test_task.cpp unit/test_awaitable_base.cpp unit/test_chase_lev_deque.cpp diff --git a/tests/integration/test_dynamic_threads.cpp b/tests/integration/test_dynamic_threads.cpp index 016b417..fbfb736 100644 --- a/tests/integration/test_dynamic_threads.cpp +++ b/tests/integration/test_dynamic_threads.cpp @@ -29,8 +29,7 @@ TEST_CASE("Dynamic thread pool growth under load", "[dynamic_threads]") { // Spawn initial batch for (int i = 0; i < 50; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); } std::this_thread::sleep_for(scaled_ms(100)); @@ -41,8 +40,7 @@ TEST_CASE("Dynamic thread pool growth under load", "[dynamic_threads]") { // Spawn more tasks for (int i = 50; i < num_tasks; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); } std::this_thread::sleep_for(scaled_ms(1000)); @@ -72,8 +70,7 @@ TEST_CASE("Dynamic thread pool shrink under load", "[dynamic_threads]") { // Spawn initial batch for (int i = 0; i < 50; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); } std::this_thread::sleep_for(scaled_ms(100)); @@ -84,8 +81,7 @@ TEST_CASE("Dynamic thread pool shrink under load", "[dynamic_threads]") { // Spawn more tasks for (int i = 50; i < num_tasks; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + 
sched.go(task_func); } std::this_thread::sleep_for(scaled_ms(1000)); @@ -114,8 +110,7 @@ TEST_CASE("Multiple thread pool adjustments", "[dynamic_threads]") { // Start with 2 threads for (int i = 0; i < 20; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); } std::this_thread::sleep_for(scaled_ms(50)); @@ -125,8 +120,7 @@ TEST_CASE("Multiple thread pool adjustments", "[dynamic_threads]") { REQUIRE(sched.num_threads() == 4); for (int i = 0; i < 20; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); } std::this_thread::sleep_for(scaled_ms(50)); @@ -136,8 +130,7 @@ TEST_CASE("Multiple thread pool adjustments", "[dynamic_threads]") { REQUIRE(sched.num_threads() == 8); for (int i = 0; i < 20; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); } std::this_thread::sleep_for(scaled_ms(50)); @@ -147,8 +140,7 @@ TEST_CASE("Multiple thread pool adjustments", "[dynamic_threads]") { REQUIRE(sched.num_threads() == 4); for (int i = 0; i < 20; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); } std::this_thread::sleep_for(scaled_ms(50)); @@ -158,8 +150,7 @@ TEST_CASE("Multiple thread pool adjustments", "[dynamic_threads]") { REQUIRE(sched.num_threads() == 2); for (int i = 0; i < 20; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); } // Active wait for completion with timeout @@ -191,8 +182,7 @@ TEST_CASE("Thread pool growth from 1 to many", "[dynamic_threads]") { // With 1 thread, tasks execute slowly for (int i = 0; i < 50; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); } std::this_thread::sleep_for(scaled_ms(100)); @@ -229,8 +219,7 @@ TEST_CASE("Thread pool maintains correctness during resize", "[dynamic_threads]" // Spawn tasks while resizing std::thread spawner([&]() { for (int i = 0; i < num_tasks; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); // 
Resize periodically if (i % 10 == 0) { @@ -269,8 +258,7 @@ TEST_CASE("Thread pool resize to 0 treated as 1", "[dynamic_threads]") { co_return; }; - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); std::this_thread::sleep_for(scaled_ms(100)); @@ -299,8 +287,7 @@ TEST_CASE("Rapid thread pool adjustments", "[dynamic_threads]") { // Spawn some tasks for (int i = 0; i < 10; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); } std::this_thread::sleep_for(scaled_ms(20)); diff --git a/tests/integration/test_exception_propagation.cpp b/tests/integration/test_exception_propagation.cpp index e414a0d..b6a4b38 100644 --- a/tests/integration/test_exception_propagation.cpp +++ b/tests/integration/test_exception_propagation.cpp @@ -33,8 +33,7 @@ TEST_CASE("Exception propagation through single level", "[exception]") { co_return; }; - auto t = catcher(); - sched.spawn(t.release()); + sched.go(catcher); std::this_thread::sleep_for(scaled_ms(200)); @@ -79,8 +78,7 @@ TEST_CASE("Exception propagation through multiple levels", "[exception]") { co_return; }; - auto t = level1(); - sched.spawn(t.release()); + sched.go(level1); std::this_thread::sleep_for(scaled_ms(300)); @@ -111,8 +109,7 @@ TEST_CASE("Exception propagation with void tasks", "[exception]") { co_return; }; - auto t = catcher(); - sched.spawn(t.release()); + sched.go(catcher); std::this_thread::sleep_for(scaled_ms(200)); @@ -143,8 +140,7 @@ TEST_CASE("Multiple exceptions in different coroutines", "[exception]") { const int num_tasks = 10; for (int i = 0; i < num_tasks; ++i) { - auto t = catcher(i); - sched.spawn(t.release()); + sched.go([&, i]() { return catcher(i); }); } // Active wait for completion with timeout @@ -193,8 +189,7 @@ TEST_CASE("Exception in middle of chain", "[exception]") { co_return; }; - auto t = level1(); - sched.spawn(t.release()); + sched.go(level1); std::this_thread::sleep_for(scaled_ms(300)); @@ -232,8 +227,7 @@ TEST_CASE("Exception with custom 
exception type", "[exception]") { co_return; }; - auto t = catcher(); - sched.spawn(t.release()); + sched.go(catcher); std::this_thread::sleep_for(scaled_ms(200)); @@ -251,8 +245,7 @@ TEST_CASE("Uncaught exception in coroutine", "[exception]") { co_return; }; - auto t = thrower(); - sched.spawn(t.release()); + sched.go(thrower); // Should not crash the scheduler std::this_thread::sleep_for(scaled_ms(200)); @@ -291,8 +284,7 @@ TEST_CASE("Exception propagation preserves exception message", "[exception]") { co_return; }; - auto t = level1(); - sched.spawn(t.release()); + sched.go(level1); std::this_thread::sleep_for(scaled_ms(300)); diff --git a/tests/integration/test_parallel_tasks.cpp b/tests/integration/test_parallel_tasks.cpp index 1de83e5..5e060bf 100644 --- a/tests/integration/test_parallel_tasks.cpp +++ b/tests/integration/test_parallel_tasks.cpp @@ -25,8 +25,7 @@ TEST_CASE("Parallel task execution stress test", "[parallel]") { }; for (int i = 0; i < num_tasks; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); } // Wait for completion with scaled timeout @@ -68,13 +67,11 @@ TEST_CASE("Parallel tasks with varying workloads", "[parallel]") { // Mix of light and heavy tasks for (int i = 0; i < 100; ++i) { - auto t = light_task(); - sched.spawn(t.release()); + sched.go(light_task); } for (int i = 0; i < 20; ++i) { - auto t = heavy_task(); - sched.spawn(t.release()); + sched.go(heavy_task); } std::this_thread::sleep_for(scaled_ms(1000)); @@ -107,8 +104,7 @@ TEST_CASE("Parallel tasks with dependencies", "[parallel]") { const int num_chains = 50; for (int i = 0; i < num_chains; ++i) { - auto t = stage2_task(); - sched.spawn(t.release()); + sched.go(stage2_task); } // Active wait for completion with timeout @@ -147,8 +143,7 @@ TEST_CASE("Work stealing under heavy load", "[parallel]") { // Spawn all tasks quickly for (int i = 0; i < num_tasks; ++i) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); } 
std::this_thread::sleep_for(scaled_ms(1500)); @@ -179,8 +174,7 @@ TEST_CASE("Concurrent spawn and execution", "[parallel]") { for (int i = 0; i < spawner_threads; ++i) { spawners.emplace_back([&]() { for (int j = 0; j < tasks_per_thread; ++j) { - auto t = task_func(); - sched.spawn(t.release()); + sched.go(task_func); std::this_thread::yield(); } }); @@ -213,8 +207,7 @@ TEST_CASE("Parallel tasks with shared atomic counter", "[parallel]") { }; for (int i = 0; i < num_tasks; ++i) { - auto t = increment_task(); - sched.spawn(t.release()); + sched.go(increment_task); } std::this_thread::sleep_for(scaled_ms(800)); @@ -250,8 +243,7 @@ TEST_CASE("Nested parallel tasks", "[parallel]") { const int num_outer = 20; for (int i = 0; i < num_outer; ++i) { - auto t = outer_task(); - sched.spawn(t.release()); + sched.go(outer_task); } std::this_thread::sleep_for(scaled_ms(800)); diff --git a/tests/integration/test_scheduler_integration.cpp b/tests/integration/test_scheduler_integration.cpp index 183e572..e846a49 100644 --- a/tests/integration/test_scheduler_integration.cpp +++ b/tests/integration/test_scheduler_integration.cpp @@ -33,8 +33,7 @@ TEST_CASE("Chained coroutines integration", "[integration]") { co_return; }; - auto t = outer(); - sched.spawn(t.release()); + sched.go(outer); // Wait for completion std::this_thread::sleep_for(scaled_ms(200)); @@ -67,8 +66,7 @@ TEST_CASE("Deep coroutine chain", "[integration]") { co_return; }; - auto t = level1(); - sched.spawn(t.release()); + sched.go(level1); std::this_thread::sleep_for(scaled_ms(300)); @@ -103,8 +101,7 @@ TEST_CASE("Parallel independent coroutines", "[integration]") { // Spawn all tasks for (int i = 0; i < num_tasks; ++i) { - auto t = task_func(i); - sched.spawn(t.release()); + sched.go([&, i]() { return task_func(i); }); } // Wait for all to complete @@ -143,8 +140,7 @@ TEST_CASE("Mixed chain and parallel coroutines", "[integration]") { co_return; }; - auto t = aggregator(); - sched.spawn(t.release()); + 
sched.go(aggregator); std::this_thread::sleep_for(scaled_ms(300)); @@ -179,8 +175,7 @@ TEST_CASE("Virtual stack tracking in scheduler", "[integration]") { co_return; }; - auto t = outer(); - sched.spawn(t.release()); + sched.go(outer); std::this_thread::sleep_for(scaled_ms(200)); @@ -208,8 +203,7 @@ TEST_CASE("Scheduler load distribution", "[integration]") { // Spawn all tasks at once for (int i = 0; i < num_tasks; ++i) { - auto t = heavy_task(); - sched.spawn(t.release()); + sched.go(heavy_task); } // Wait for completion diff --git a/tests/unit/test_affinity.cpp b/tests/unit/test_affinity.cpp index 55d89da..4b6c754 100644 --- a/tests/unit/test_affinity.cpp +++ b/tests/unit/test_affinity.cpp @@ -9,6 +9,28 @@ using namespace elio::runtime; using namespace elio::coro; using namespace elio::test; +// Helper to access handle from task +template +auto get_handle(task& t) { + return elio::coro::detail::task_access::handle(t); +} + +// Helper to spawn a task to scheduler +template +void spawn_task(scheduler& sched, task& t) { + elio::coro::detail::heap_alloc_guard guard; + auto handle = elio::coro::detail::task_access::release(t); + sched.spawn(handle); +} + +// Helper to spawn a task to specific worker +template +void spawn_task_to(scheduler& sched, size_t worker_id, task& t) { + elio::coro::detail::heap_alloc_guard guard; + auto handle = elio::coro::detail::task_access::release(t); + sched.spawn_to(worker_id, handle); +} + TEST_CASE("Affinity constants", "[affinity]") { REQUIRE(NO_AFFINITY == std::numeric_limits::max()); } @@ -19,7 +41,7 @@ TEST_CASE("Promise base affinity default", "[affinity]") { }; auto t = coro(); - auto& promise = t.handle().promise(); + auto& promise = get_handle(t).promise(); // Default should be NO_AFFINITY REQUIRE(promise.affinity() == NO_AFFINITY); @@ -32,7 +54,7 @@ TEST_CASE("Promise base affinity set/get/clear", "[affinity]") { }; auto t = coro(); - auto& promise = t.handle().promise(); + auto& promise = get_handle(t).promise(); // Set 
affinity promise.set_affinity(2); @@ -64,7 +86,7 @@ TEST_CASE("current_worker_id inside scheduler", "[affinity]") { }; auto t = coro(); - sched.spawn(t.release()); + spawn_task(sched, t); // Wait for execution auto start = std::chrono::steady_clock::now(); @@ -97,7 +119,7 @@ TEST_CASE("set_affinity awaitable binds to worker", "[affinity]") { }; auto t = coro(); - sched.spawn(t.release()); + spawn_task(sched, t); // Wait for execution auto start = std::chrono::steady_clock::now(); @@ -133,7 +155,7 @@ TEST_CASE("set_affinity without migration", "[affinity]") { }; auto t = coro(); - sched.spawn(t.release()); + spawn_task(sched, t); // Wait for execution auto start = std::chrono::steady_clock::now(); @@ -177,7 +199,7 @@ TEST_CASE("clear_affinity allows migration", "[affinity]") { }; auto t = coro(); - sched.spawn(t.release()); + spawn_task(sched, t); // Wait for execution auto start = std::chrono::steady_clock::now(); @@ -223,7 +245,7 @@ TEST_CASE("bind_to_current_worker pins to current", "[affinity]") { }; auto t = coro(); - sched.spawn(t.release()); + spawn_task(sched, t); // Wait for execution auto start = std::chrono::steady_clock::now(); @@ -275,7 +297,7 @@ TEST_CASE("Affinity prevents work stealing", "[affinity]") { // Spawn many tasks for (int i = 0; i < num_iterations; ++i) { auto t = coro(); - sched.spawn(t.release()); + spawn_task(sched, t); } // Wait for all to complete @@ -307,13 +329,13 @@ TEST_CASE("Affinity with spawn_to respects binding", "[affinity]") { auto t = coro(); // Explicitly spawn to worker 2 - sched.spawn_to(2, t.release()); + spawn_task_to(sched, 2, t); // Wait for execution - auto start = std::chrono::steady_clock::now(); + auto start2 = std::chrono::steady_clock::now(); while (!completed.load()) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); - if (std::chrono::steady_clock::now() - start > scaled_sec(5)) break; + if (std::chrono::steady_clock::now() - start2 > scaled_sec(5)) break; } REQUIRE(completed.load()); @@ -328,7 
+350,7 @@ TEST_CASE("get_promise_base from handle address", "[affinity]") { }; auto t = coro(); - void* addr = t.handle().address(); + void* addr = get_handle(t).address(); // Should be able to extract promise_base auto* promise = get_promise_base(addr); @@ -348,7 +370,7 @@ TEST_CASE("check_affinity_allows with NO_AFFINITY", "[affinity]") { }; auto t = coro(); - void* addr = t.handle().address(); + void* addr = get_handle(t).address(); // With NO_AFFINITY, any worker should be allowed REQUIRE(check_affinity_allows(addr, 0)); @@ -374,14 +396,14 @@ TEST_CASE("Multiple tasks with different affinities", "[affinity]") { // Spawn tasks with different affinities for (size_t i = 0; i < 4; ++i) { auto t = make_coro(i); - sched.spawn(t.release()); + spawn_task(sched, t); } // Wait for all to complete - auto start = std::chrono::steady_clock::now(); + auto start3 = std::chrono::steady_clock::now(); while (completed.load() < 4) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); - if (std::chrono::steady_clock::now() - start > scaled_sec(5)) break; + if (std::chrono::steady_clock::now() - start3 > scaled_sec(5)) break; } REQUIRE(completed.load() == 4); diff --git a/tests/unit/test_awaitable_base.cpp b/tests/unit/test_awaitable_base.cpp index 7452ae2..852b26f 100644 --- a/tests/unit/test_awaitable_base.cpp +++ b/tests/unit/test_awaitable_base.cpp @@ -5,6 +5,18 @@ using namespace elio::coro; +// Helper to access handle from task +template +auto get_handle(task& t) { + return elio::coro::detail::task_access::handle(t); +} + +// Helper to access promise value from task +template +auto& get_promise(task& t) { + return get_handle(t).promise(); +} + // Test awaitable that returns an int class test_awaitable : public awaitable_base { public: @@ -68,7 +80,7 @@ TEST_CASE("awaitable_base forwards await_suspend", "[awaitable_base]") { }; auto t = coro(); - t.handle().resume(); + get_handle(t).resume(); REQUIRE(suspended == true); } @@ -82,9 +94,9 @@ TEST_CASE("awaitable_base 
forwards await_resume with return value", "[awaitable_ }; auto t = coro(); - t.handle().resume(); + get_handle(t).resume(); - REQUIRE(t.handle().promise().value_.value() == 123); + REQUIRE(get_promise(t).value_.value() == 123); } TEST_CASE("awaitable_base works with void return", "[awaitable_base]") { @@ -96,9 +108,9 @@ TEST_CASE("awaitable_base works with void return", "[awaitable_base]") { }; auto t = coro(); - t.handle().resume(); + get_handle(t).resume(); - REQUIRE(t.handle().done()); + REQUIRE(get_handle(t).done()); } TEST_CASE("awaitable_base in nested coroutines", "[awaitable_base]") { @@ -114,10 +126,10 @@ TEST_CASE("awaitable_base in nested coroutines", "[awaitable_base]") { }; auto t = outer(); - t.handle().resume(); + get_handle(t).resume(); // Should be (50 * 2) + 10 = 110 - REQUIRE(t.handle().promise().value_.value() == 110); + REQUIRE(get_promise(t).value_.value() == 110); } // Test awaitable with symmetric transfer @@ -150,7 +162,7 @@ TEST_CASE("awaitable_base supports symmetric transfer", "[awaitable_base]") { }; auto t = coro(); - t.handle().resume(); + get_handle(t).resume(); - REQUIRE(t.handle().promise().value_.value() == 999); + REQUIRE(get_promise(t).value_.value() == 999); } diff --git a/tests/unit/test_frame_allocator.cpp b/tests/unit/test_frame_allocator.cpp deleted file mode 100644 index aaf60c4..0000000 --- a/tests/unit/test_frame_allocator.cpp +++ /dev/null @@ -1,93 +0,0 @@ -#include -#include -#include -#include -#include - -using namespace elio::coro; - -TEST_CASE("Frame allocator basic allocation/deallocation", "[frame_allocator]") { - // Test basic allocation - void* ptr = frame_allocator::allocate(128); - REQUIRE(ptr != nullptr); - - // Test deallocation (same thread) - frame_allocator::deallocate(ptr, 128); -} - -TEST_CASE("Frame allocator cross-thread deallocation", "[frame_allocator]") { - // This test verifies the lookup-then-push race is fixed by holding - // the registry mutex during the entire operation - - std::atomic 
allocated_ptr{nullptr}; - std::atomic ready{false}; - - // Thread 1: Allocate a frame - std::thread allocator_thread([&]() { - void* ptr = frame_allocator::allocate(128); - allocated_ptr.store(ptr); - - // Signal that we have a frame ready - ready.store(true); - - // Wait for the other thread to deallocate - while (ready.load()) { - std::this_thread::yield(); - } - }); - - // Wait for allocation - while (!ready.load()) { - std::this_thread::yield(); - } - - // Thread 2: Deallocate from different thread (simulating work-stealing) - std::thread deallocator_thread([&]() { - // Wait for the frame to be ready - while (!ready.load()) { - std::this_thread::yield(); - } - - void* ptr = allocated_ptr.load(); - if (ptr) { - // This should trigger cross-thread deallocation - // The race condition fix holds the mutex during lookup-then-push - frame_allocator::deallocate(ptr, 128); - } - - // Signal completion - ready.store(false); - }); - - allocator_thread.join(); - deallocator_thread.join(); -} - -TEST_CASE("Frame allocator multiple frames", "[frame_allocator]") { - constexpr size_t num_frames = 100; - std::vector frames; - - // Allocate multiple frames - for (size_t i = 0; i < num_frames; ++i) { - void* ptr = frame_allocator::allocate(128); - REQUIRE(ptr != nullptr); - frames.push_back(ptr); - } - - // Deallocate all frames - for (void* ptr : frames) { - frame_allocator::deallocate(ptr, 128); - } -} - -TEST_CASE("Frame allocator size limits", "[frame_allocator]") { - // Test allocation within size limit - void* small = frame_allocator::allocate(256); - REQUIRE(small != nullptr); - frame_allocator::deallocate(small, 256); - - // Test allocation above size limit falls back to malloc - void* large = frame_allocator::allocate(512); - REQUIRE(large != nullptr); - frame_allocator::deallocate(large, 512); -} diff --git a/tests/unit/test_io.cpp b/tests/unit/test_io.cpp index 0bcd8d3..d2e30fc 100644 --- a/tests/unit/test_io.cpp +++ b/tests/unit/test_io.cpp @@ -106,8 +106,7 @@ 
TEST_CASE("Pipe read/write with epoll", "[io][epoll][pipe]") { completed = true; }; - auto t = read_coro(); - sched.spawn(t.release()); + sched.go(read_coro); // Wait for completion for (int i = 0; i < 100 && !completed; ++i) { @@ -181,8 +180,7 @@ TEST_CASE("Socket pair with epoll", "[io][epoll][socket]") { completed = true; }; - auto t = recv_coro(); - sched.spawn(t.release()); + sched.go(recv_coro); for (int i = 0; i < 100 && !completed; ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); @@ -239,8 +237,7 @@ TEST_CASE("Cancel operation", "[io][epoll][cancel]") { completed = true; }; - auto t = recv_coro(); - sched.spawn(t.release()); + sched.go(recv_coro); // Wait for coroutine to start for (int i = 0; i < 100 && !started; ++i) { @@ -288,11 +285,8 @@ TEST_CASE("Multiple concurrent operations", "[io][epoll][concurrent]") { completed++; }; - auto t1 = recv_coro1(); - auto t2 = recv_coro2(); - - sched.spawn(t1.release()); - sched.spawn(t2.release()); + sched.go(recv_coro1); + sched.go(recv_coro2); // Wait until both complete for (int i = 0; i < 100 && completed < 2; ++i) { @@ -341,8 +335,7 @@ TEST_CASE("epoll_backend registers fd before data available", "[io][epoll][regis completed = true; }; - auto t = recv_coro(); - sched.spawn(t.release()); + sched.go(recv_coro); // Wait for coroutine to start and register the operation for (int i = 0; i < 100 && !started; ++i) { @@ -399,11 +392,8 @@ TEST_CASE("epoll_backend handles multiple pending ops on same fd", "[io][epoll][ completed++; }; - auto t1 = recv_coro1(); - auto t2 = recv_coro2(); - - sched.spawn(t1.release()); - sched.spawn(t2.release()); + sched.go(recv_coro1); + sched.go(recv_coro2); // Give operations time to be registered std::this_thread::sleep_for(std::chrono::milliseconds(50)); @@ -452,8 +442,7 @@ TEST_CASE("epoll_backend write operation registration", "[io][epoll][write]") { completed = true; }; - auto t = send_coro(); - sched.spawn(t.release()); + sched.go(send_coro); // Wait for 
completion for (int i = 0; i < 100 && !completed; ++i) { @@ -577,8 +566,7 @@ TEST_CASE("UDS listener bind and accept", "[uds][listener]") { co_return; }; - auto t = accept_coro(); - sched.spawn(t.release()); + sched.go(accept_coro); // Wait for completion for (int i = 0; i < 200 && !accepted; ++i) { @@ -626,11 +614,8 @@ TEST_CASE("UDS connect", "[uds][connect]") { co_return; }; - auto accept_task = accept_coro(); - auto connect_task = connect_coro(); - - sched.spawn(accept_task.release()); - sched.spawn(connect_task.release()); + sched.go(accept_coro); + sched.go(connect_coro); // Wait until both complete for (int i = 0; i < 200 && (!server_accepted || !client_connected); ++i) { @@ -675,10 +660,8 @@ TEST_CASE("UDS stream read/write", "[uds][stream]") { co_return; }; - auto accept_task = accept_coro(); - auto connect_task = connect_coro(); - sched.spawn(accept_task.release()); - sched.spawn(connect_task.release()); + sched.go(accept_coro); + sched.go(connect_coro); for (int i = 0; i < 200 && setup_complete < 2; ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); @@ -706,10 +689,8 @@ TEST_CASE("UDS stream read/write", "[uds][stream]") { read_done = true; }; - auto write_task = write_coro(); - auto read_task = read_coro(); - sched.spawn(write_task.release()); - sched.spawn(read_task.release()); + sched.go(write_coro); + sched.go(read_coro); for (int i = 0; i < 200 && (!write_done || !read_done); ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); @@ -742,10 +723,8 @@ TEST_CASE("UDS stream read/write", "[uds][stream]") { read_done = true; }; - auto write_task = write_coro(); - auto read_task = read_coro(); - sched.spawn(write_task.release()); - sched.spawn(read_task.release()); + sched.go(write_coro); + sched.go(read_coro); for (int i = 0; i < 200 && (!write_done || !read_done); ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); @@ -818,16 +797,13 @@ TEST_CASE("UDS multiple concurrent connections", "[uds][concurrent]") { 
co_return; }; - auto a0 = accept0(); auto a1 = accept1(); auto a2 = accept2(); - auto c0 = connect0(); auto c1 = connect1(); auto c2 = connect2(); - // Start all coroutines - sched.spawn(a0.release()); - sched.spawn(a1.release()); - sched.spawn(a2.release()); - sched.spawn(c0.release()); - sched.spawn(c1.release()); - sched.spawn(c2.release()); + sched.go(accept0); + sched.go(accept1); + sched.go(accept2); + sched.go(connect0); + sched.go(connect1); + sched.go(connect2); // Wait until all connections are made for (int i = 0; i < 500 && (accepts_done < NUM_CLIENTS || connects_done < NUM_CLIENTS); ++i) { @@ -887,10 +863,8 @@ TEST_CASE("UDS filesystem socket", "[uds][filesystem]") { co_return; }; - auto accept_task = accept_coro(); - auto connect_task = connect_coro(); - sched.spawn(accept_task.release()); - sched.spawn(connect_task.release()); + sched.go(accept_coro); + sched.go(connect_coro); for (int i = 0; i < 200 && (!connected || !accepted); ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); @@ -941,8 +915,7 @@ TEST_CASE("UDS echo test", "[uds][echo]") { server_done = true; }; - auto server_task = server_coro(); - sched.spawn(server_task.release()); + sched.go(server_coro); // Client in a thread (to avoid coroutine complexity) std::thread client_thread([&]() { @@ -1268,10 +1241,8 @@ TEST_CASE("TCP IPv6 listener and connect", "[tcp][ipv6][integration]") { co_return; }; - auto accept_task = accept_coro(); - auto connect_task = connect_coro(); - sched.spawn(accept_task.release()); - sched.spawn(connect_task.release()); + sched.go(accept_coro); + sched.go(connect_coro); for (int i = 0; i < 200 && (!accepted || !connected); ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); @@ -1302,8 +1273,7 @@ TEST_CASE("TCP connect regression avoids double connect", "[tcp][connect][regres constexpr int kAttempts = 64; for (int i = 0; i < kAttempts; ++i) { - auto t = tcp_connect_regression_attempt(port, connected, failed, first_error); - 
sched.spawn(t.release()); + sched.go([&]() { return tcp_connect_regression_attempt(port, connected, failed, first_error); }); } for (int i = 0; i < 500 && (connected + failed) < kAttempts; ++i) { @@ -1325,8 +1295,7 @@ TEST_CASE("explicit hostname resolution", "[tcp][address][dns]") { std::optional resolved; std::atomic done{false}; - auto task = resolve_hostname_attempt("localhost", 80, resolved, done); - sched.spawn(task.release()); + sched.go([&]() { return resolve_hostname_attempt("localhost", 80, resolved, done); }); for (int i = 0; i < 200 && !done.load(std::memory_order_relaxed); ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); @@ -1360,11 +1329,9 @@ TEST_CASE("tcp_connect hostname resolution uses cache", "[tcp][connect][dns][cac auto stats_before = default_resolve_cache().stats(); - auto accept_task = accept_n_connections(*listener, 2, accepted); - sched.spawn(accept_task.release()); + sched.go([&]() { return accept_n_connections(*listener, 2, accepted); }); - auto first_task = tcp_connect_hostname_attempt("localhost", port, connected, failed, first_error); - sched.spawn(first_task.release()); + sched.go([&]() { return tcp_connect_hostname_attempt("localhost", port, connected, failed, first_error); }); for (int i = 0; i < 300 && connected.load(std::memory_order_relaxed) < 1; ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); @@ -1372,8 +1339,7 @@ TEST_CASE("tcp_connect hostname resolution uses cache", "[tcp][connect][dns][cac auto stats_after_first = default_resolve_cache().stats(); - auto second_task = tcp_connect_hostname_attempt("localhost", port, connected, failed, first_error); - sched.spawn(second_task.release()); + sched.go([&]() { return tcp_connect_hostname_attempt("localhost", port, connected, failed, first_error); }); for (int i = 0; i < 300 && (accepted.load(std::memory_order_relaxed) < 2 || connected.load(std::memory_order_relaxed) < 2 @@ -1424,17 +1390,15 @@ TEST_CASE("resolve_options can disable cache", 
"[tcp][dns][cache][config]") { std::atomic done_first{false}; std::atomic done_second{false}; - auto first = resolve_all_attempt_with_options( - "localhost", 80, options, resolved_first, done_first); - sched.spawn(first.release()); + sched.go([&]() { return resolve_all_attempt_with_options( + "localhost", 80, options, resolved_first, done_first); }); for (int i = 0; i < 200 && !done_first.load(std::memory_order_relaxed); ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); } - auto second = resolve_all_attempt_with_options( - "localhost", 80, options, resolved_second, done_second); - sched.spawn(second.release()); + sched.go([&]() { return resolve_all_attempt_with_options( + "localhost", 80, options, resolved_second, done_second); }); for (int i = 0; i < 200 && !done_second.load(std::memory_order_relaxed); ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); @@ -1469,17 +1433,15 @@ TEST_CASE("resolve_options can use custom cache instance", "[tcp][dns][cache][co std::atomic done_first{false}; std::atomic done_second{false}; - auto first = resolve_all_attempt_with_options( - "localhost", 80, options, resolved_first, done_first); - sched.spawn(first.release()); + sched.go([&]() { return resolve_all_attempt_with_options( + "localhost", 80, options, resolved_first, done_first); }); for (int i = 0; i < 200 && !done_first.load(std::memory_order_relaxed); ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); } - auto second = resolve_all_attempt_with_options( - "localhost", 80, options, resolved_second, done_second); - sched.spawn(second.release()); + sched.go([&]() { return resolve_all_attempt_with_options( + "localhost", 80, options, resolved_second, done_second); }); for (int i = 0; i < 200 && !done_second.load(std::memory_order_relaxed); ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); @@ -1517,17 +1479,15 @@ TEST_CASE("resolve_options ttl controls cache expiry", "[tcp][dns][cache][config std::atomic 
done_first{false}; std::atomic done_second{false}; - auto first = resolve_all_attempt_with_options( - "localhost", 80, options, resolved_first, done_first); - sched.spawn(first.release()); + sched.go([&]() { return resolve_all_attempt_with_options( + "localhost", 80, options, resolved_first, done_first); }); for (int i = 0; i < 200 && !done_first.load(std::memory_order_relaxed); ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); } - auto second = resolve_all_attempt_with_options( - "localhost", 80, options, resolved_second, done_second); - sched.spawn(second.release()); + sched.go([&]() { return resolve_all_attempt_with_options( + "localhost", 80, options, resolved_second, done_second); }); for (int i = 0; i < 200 && !done_second.load(std::memory_order_relaxed); ++i) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); diff --git a/tests/unit/test_scheduler.cpp b/tests/unit/test_scheduler.cpp index 91bd846..08f77a5 100644 --- a/tests/unit/test_scheduler.cpp +++ b/tests/unit/test_scheduler.cpp @@ -9,6 +9,16 @@ using namespace elio::runtime; using namespace elio::coro; using namespace elio::test; +// Helper to spawn a task to scheduler +template +void spawn_task(scheduler& sched, task& t) { + elio::coro::detail::heap_alloc_guard guard; + auto handle = elio::coro::detail::task_access::release(t); + auto* vstack = new elio::coro::vthread_stack(); + handle.promise().set_vstack_owner(vstack); + sched.spawn(handle); +} + TEST_CASE("Scheduler construction", "[scheduler]") { scheduler sched(4); REQUIRE(sched.num_threads() == 4); @@ -38,7 +48,7 @@ TEST_CASE("Scheduler spawn and execute simple coroutine", "[scheduler]") { }; auto t = coro(); - sched.spawn(t.release()); // Transfer ownership to scheduler + spawn_task(sched, t); // Transfer ownership to scheduler // Wait for execution std::this_thread::sleep_for(scaled_ms(100)); @@ -78,7 +88,7 @@ TEST_CASE("Scheduler spawn multiple coroutines", "[scheduler]") { // Spawn many tasks - scheduler takes 
ownership via release() for (int i = 0; i < num_tasks; ++i) { auto t = coro(); - sched.spawn(t.release()); + spawn_task(sched, t); } // Active wait for completion with timeout @@ -117,7 +127,7 @@ TEST_CASE("Scheduler work stealing occurs", "[scheduler]") { // Spawn many tasks quickly - scheduler takes ownership for (int i = 0; i < num_tasks; ++i) { auto t = coro(); - sched.spawn(t.release()); + spawn_task(sched, t); } // Wait for all to complete @@ -151,7 +161,7 @@ TEST_CASE("Scheduler dynamic thread pool growth", "[scheduler]") { for (int i = 0; i < 50; ++i) { auto t = coro(); - sched.spawn(t.release()); + spawn_task(sched, t); } std::this_thread::sleep_for(scaled_ms(200)); @@ -179,7 +189,7 @@ TEST_CASE("Scheduler dynamic thread pool shrink", "[scheduler]") { for (int i = 0; i < 50; ++i) { auto t = coro(); - sched.spawn(t.release()); + spawn_task(sched, t); } std::this_thread::sleep_for(scaled_ms(200)); @@ -202,7 +212,7 @@ TEST_CASE("Scheduler statistics", "[scheduler]") { for (int i = 0; i < num_tasks; ++i) { auto t = coro(); - sched.spawn(t.release()); + spawn_task(sched, t); } std::this_thread::sleep_for(scaled_ms(200)); @@ -245,8 +255,8 @@ TEST_CASE("Scheduler handles spawn before start", "[scheduler]") { auto t = coro(); // Should not crash, but task won't execute (scheduler not running) - // We still need to release() since spawn stores the handle - sched.spawn(t.release()); + // We still need to spawn since it stores the handle + spawn_task(sched, t); // Now start - but the task was already queued sched.start(); diff --git a/tests/unit/test_signalfd.cpp b/tests/unit/test_signalfd.cpp index 070e857..68ca201 100644 --- a/tests/unit/test_signalfd.cpp +++ b/tests/unit/test_signalfd.cpp @@ -14,6 +14,16 @@ using namespace elio::runtime; using namespace elio::io; using namespace std::chrono_literals; +// Helper to spawn a task to scheduler +template +void spawn_task(scheduler& sched, task& t) { + elio::coro::detail::heap_alloc_guard guard; + auto handle = 
elio::coro::detail::task_access::release(t); + auto* vstack = new elio::coro::vthread_stack(); + handle.promise().set_vstack_owner(vstack); + sched.spawn(handle); +} + TEST_CASE("signal_set basic operations", "[signal][signal_set]") { SECTION("default constructor creates empty set") { signal_set sigs; @@ -213,7 +223,7 @@ TEST_CASE("signal_fd async wait", "[signal][signal_fd]") { { auto t = wait_task(); - sched.spawn(t.release()); + spawn_task(sched, t); } // Give the coroutine time to start and enter wait @@ -265,7 +275,7 @@ TEST_CASE("signal_fd multiple signals", "[signal][signal_fd]") { { auto t = wait_task(); - sched.spawn(t.release()); + spawn_task(sched, t); } std::this_thread::sleep_for(50ms); @@ -384,7 +394,7 @@ TEST_CASE("wait_signal convenience function", "[signal][wait_signal]") { { auto t = wait_task(); - sched.spawn(t.release()); + spawn_task(sched, t); } std::this_thread::sleep_for(50ms); diff --git a/tests/unit/test_sync.cpp b/tests/unit/test_sync.cpp index 58a5db6..5058caf 100644 --- a/tests/unit/test_sync.cpp +++ b/tests/unit/test_sync.cpp @@ -12,6 +12,16 @@ using namespace elio::sync; using namespace elio::coro; using namespace elio::runtime; +// Helper to spawn a task to scheduler +template +void spawn_task(scheduler& sched, task& t) { + elio::coro::detail::heap_alloc_guard guard; + auto handle = elio::coro::detail::task_access::release(t); + auto* vstack = new elio::coro::vthread_stack(); + handle.promise().set_vstack_owner(vstack); + sched.spawn(handle); +} + TEST_CASE("mutex basic operations", "[sync][mutex]") { mutex m; @@ -35,7 +45,7 @@ TEST_CASE("mutex basic operations", "[sync][mutex]") { TEST_CASE("mutex with coroutines", "[sync][mutex][coro]") { mutex m; - int counter = 0; + std::atomic counter{0}; std::atomic completed{0}; scheduler sched(2); @@ -43,19 +53,18 @@ TEST_CASE("mutex with coroutines", "[sync][mutex][coro]") { auto increment_task = [&]() -> task { co_await m.lock(); - int temp = counter; - std::this_thread::yield(); // Give 
other coroutines a chance - counter = temp + 1; + // Use fetch_add for atomic increment + counter.fetch_add(1, std::memory_order_relaxed); m.unlock(); - completed++; + completed.fetch_add(1, std::memory_order_relaxed); }; - // Create and spawn tasks - use release() to transfer ownership to scheduler + // Create and spawn tasks - use spawn_task helper to transfer ownership to scheduler // We track completion via the atomic counter constexpr int NUM_TASKS = 10; for (int i = 0; i < NUM_TASKS; ++i) { auto t = increment_task(); - sched.spawn(t.release()); // Transfer ownership - scheduler will manage lifetime + spawn_task(sched, t); // Transfer ownership - scheduler will manage lifetime } // Wait for completion @@ -200,12 +209,12 @@ TEST_CASE("channel with coroutines", "[sync][channel][coro]") { scheduler sched(2); sched.start(); - // Use release() to transfer ownership to scheduler + // Use spawn_task helper to transfer ownership to scheduler { auto p = producer(); auto c = consumer(); - sched.spawn(p.release()); - sched.spawn(c.release()); + spawn_task(sched, p); + spawn_task(sched, c); } // Wait for completion @@ -307,11 +316,11 @@ TEST_CASE("shared_mutex with coroutines", "[sync][shared_mutex][coro]") { // Spawn readers and writers for (int i = 0; i < NUM_READERS; ++i) { auto t = reader_task(); - sched.spawn(t.release()); + spawn_task(sched, t); } for (int i = 0; i < NUM_WRITERS; ++i) { auto t = writer_task(); - sched.spawn(t.release()); + spawn_task(sched, t); } // Wait for completion @@ -399,7 +408,7 @@ TEST_CASE("spinlock with coroutines", "[sync][spinlock][coro]") { constexpr int NUM_TASKS = 10; for (int i = 0; i < NUM_TASKS; ++i) { auto t = increment_task(); - sched.spawn(t.release()); + spawn_task(sched, t); } for (int i = 0; i < 200 && completed < NUM_TASKS; ++i) { @@ -456,7 +465,7 @@ TEST_CASE("condition_variable has_waiters", "[sync][condvar]") { TEST_CASE("condition_variable with mutex notify_one", "[sync][condvar][coro]") { mutex mtx; condition_variable 
cv; - bool ready = false; + std::atomic ready{false}; std::atomic completed{0}; scheduler sched(2); @@ -464,31 +473,31 @@ TEST_CASE("condition_variable with mutex notify_one", "[sync][condvar][coro]") { auto waiter = [&]() -> task { co_await mtx.lock(); - while (!ready) { + while (!ready.load(std::memory_order_acquire)) { co_await co_await cv.wait(mtx); } mtx.unlock(); - completed++; + completed.fetch_add(1, std::memory_order_relaxed); }; auto notifier = [&]() -> task { co_await mtx.lock(); - ready = true; + ready.store(true, std::memory_order_release); mtx.unlock(); cv.notify_one(); - completed++; + completed.fetch_add(1, std::memory_order_relaxed); }; { auto w = waiter(); - sched.spawn(w.release()); + spawn_task(sched, w); } std::this_thread::sleep_for(std::chrono::milliseconds(50)); { auto n = notifier(); - sched.spawn(n.release()); + spawn_task(sched, n); } for (int i = 0; i < 200 && completed < 2; ++i) { @@ -498,13 +507,13 @@ TEST_CASE("condition_variable with mutex notify_one", "[sync][condvar][coro]") { sched.shutdown(); REQUIRE(completed == 2); - REQUIRE(ready); + REQUIRE(ready.load()); } TEST_CASE("condition_variable with mutex notify_all", "[sync][condvar][coro]") { mutex mtx; condition_variable cv; - bool ready = false; + std::atomic ready{false}; std::atomic completed{0}; scheduler sched(4); @@ -514,30 +523,30 @@ TEST_CASE("condition_variable with mutex notify_all", "[sync][condvar][coro]") { auto waiter = [&]() -> task { co_await mtx.lock(); - while (!ready) { + while (!ready.load(std::memory_order_acquire)) { co_await co_await cv.wait(mtx); } mtx.unlock(); - completed++; + completed.fetch_add(1, std::memory_order_relaxed); }; for (int i = 0; i < NUM_WAITERS; ++i) { auto w = waiter(); - sched.spawn(w.release()); + spawn_task(sched, w); } std::this_thread::sleep_for(std::chrono::milliseconds(100)); auto notifier = [&]() -> task { co_await mtx.lock(); - ready = true; + ready.store(true, std::memory_order_release); mtx.unlock(); cv.notify_all(); 
co_return; }; { auto n = notifier(); - sched.spawn(n.release()); + spawn_task(sched, n); } for (int i = 0; i < 300 && completed < NUM_WAITERS; ++i) { @@ -552,7 +561,7 @@ TEST_CASE("condition_variable with mutex notify_all", "[sync][condvar][coro]") { TEST_CASE("condition_variable with spinlock", "[sync][condvar][coro]") { spinlock sl; condition_variable cv; - bool ready = false; + std::atomic ready{false}; std::atomic completed{0}; scheduler sched(2); @@ -560,32 +569,32 @@ TEST_CASE("condition_variable with spinlock", "[sync][condvar][coro]") { auto waiter = [&]() -> task { sl.lock(); - while (!ready) { + while (!ready.load(std::memory_order_acquire)) { co_await cv.wait(sl); } sl.unlock(); - completed++; + completed.fetch_add(1, std::memory_order_relaxed); }; auto notifier = [&]() -> task { sl.lock(); - ready = true; + ready.store(true, std::memory_order_release); sl.unlock(); cv.notify_one(); - completed++; + completed.fetch_add(1, std::memory_order_relaxed); co_return; }; { auto w = waiter(); - sched.spawn(w.release()); + spawn_task(sched, w); } std::this_thread::sleep_for(std::chrono::milliseconds(50)); { auto n = notifier(); - sched.spawn(n.release()); + spawn_task(sched, n); } for (int i = 0; i < 200 && completed < 2; ++i) { @@ -599,7 +608,7 @@ TEST_CASE("condition_variable with spinlock", "[sync][condvar][coro]") { TEST_CASE("condition_variable unlocked", "[sync][condvar][coro]") { condition_variable cv; - bool ready = false; + std::atomic ready{false}; std::atomic completed{0}; // Single worker: all coroutines run on the same thread @@ -607,29 +616,29 @@ TEST_CASE("condition_variable unlocked", "[sync][condvar][coro]") { sched.start(); auto waiter = [&]() -> task { - while (!ready) { + while (!ready.load(std::memory_order_acquire)) { co_await cv.wait_unlocked(); } - completed++; + completed.fetch_add(1, std::memory_order_relaxed); }; auto notifier = [&]() -> task { - ready = true; + ready.store(true, std::memory_order_release); cv.notify_one(); - 
completed++; + completed.fetch_add(1, std::memory_order_relaxed); co_return; }; { auto w = waiter(); - sched.spawn(w.release()); + spawn_task(sched, w); } std::this_thread::sleep_for(std::chrono::milliseconds(50)); { auto n = notifier(); - sched.spawn(n.release()); + spawn_task(sched, n); } for (int i = 0; i < 200 && completed < 2; ++i) { @@ -644,7 +653,7 @@ TEST_CASE("condition_variable unlocked", "[sync][condvar][coro]") { TEST_CASE("condition_variable notify_one wakes exactly one", "[sync][condvar][coro]") { mutex mtx; condition_variable cv; - int phase = 0; + std::atomic phase{0}; std::atomic woken{0}; std::atomic completed{0}; @@ -655,17 +664,17 @@ TEST_CASE("condition_variable notify_one wakes exactly one", "[sync][condvar][co auto waiter = [&]() -> task { co_await mtx.lock(); - while (phase == 0) { + while (phase.load(std::memory_order_acquire) == 0) { co_await co_await cv.wait(mtx); } - woken++; + woken.fetch_add(1, std::memory_order_relaxed); mtx.unlock(); - completed++; + completed.fetch_add(1, std::memory_order_relaxed); }; for (int i = 0; i < NUM_WAITERS; ++i) { auto w = waiter(); - sched.spawn(w.release()); + spawn_task(sched, w); } std::this_thread::sleep_for(std::chrono::milliseconds(100)); @@ -673,14 +682,14 @@ TEST_CASE("condition_variable notify_one wakes exactly one", "[sync][condvar][co // Set condition and notify exactly one auto notifier = [&]() -> task { co_await mtx.lock(); - phase = 1; + phase.store(1, std::memory_order_release); mtx.unlock(); cv.notify_one(); co_return; }; { auto n = notifier(); - sched.spawn(n.release()); + spawn_task(sched, n); } // Wait for exactly one to wake @@ -750,12 +759,12 @@ TEST_CASE("condition_variable producer-consumer", "[sync][condvar][coro]") { { auto c = consumer(); - sched.spawn(c.release()); + spawn_task(sched, c); } std::this_thread::sleep_for(std::chrono::milliseconds(20)); { auto p = producer(); - sched.spawn(p.release()); + spawn_task(sched, p); } for (int i = 0; i < 300 && completed < 2; ++i) { diff 
--git a/tests/unit/test_task.cpp b/tests/unit/test_task.cpp index 82587bd..bb2c908 100644 --- a/tests/unit/test_task.cpp +++ b/tests/unit/test_task.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include "../test_main.cpp" // For scaled timeouts @@ -10,6 +11,12 @@ using namespace elio::coro; using namespace elio::runtime; using namespace elio::test; +// Helper: access handle from immovable task (for testing only) +template +auto get_handle(task& t) { + return elio::coro::detail::task_access::handle(t); +} + // Helper: Simple coroutine that returns a value task simple_return_value() { co_return 42; @@ -39,50 +46,53 @@ task nested_outer() { TEST_CASE("task construction and destruction", "[task]") { { auto t = simple_return_value(); - REQUIRE(t.handle() != nullptr); + REQUIRE(get_handle(t) != nullptr); } // Task should destroy handle in destructor } -TEST_CASE("task move semantics", "[task]") { - auto t1 = simple_return_value(); - auto h1 = t1.handle(); - REQUIRE(h1 != nullptr); - - auto t2 = std::move(t1); - REQUIRE(t1.handle() == nullptr); // Moved-from - REQUIRE(t2.handle() == h1); // Moved-to +TEST_CASE("task is non-movable", "[task]") { + // Verify task is non-movable and non-copyable + STATIC_REQUIRE_FALSE(std::is_move_constructible_v>); + STATIC_REQUIRE_FALSE(std::is_move_assignable_v>); + STATIC_REQUIRE_FALSE(std::is_copy_constructible_v>); + STATIC_REQUIRE_FALSE(std::is_copy_assignable_v>); + + STATIC_REQUIRE_FALSE(std::is_move_constructible_v>); + STATIC_REQUIRE_FALSE(std::is_move_assignable_v>); + STATIC_REQUIRE_FALSE(std::is_copy_constructible_v>); + STATIC_REQUIRE_FALSE(std::is_copy_assignable_v>); } TEST_CASE("task co_return value", "[task]") { auto t = simple_return_value(); // Start the coroutine - t.handle().resume(); + get_handle(t).resume(); // The promise should have the value - REQUIRE(t.handle().promise().value_.has_value()); - REQUIRE(t.handle().promise().value_.value() == 42); + 
REQUIRE(get_handle(t).promise().value_.has_value()); + REQUIRE(get_handle(t).promise().value_.value() == 42); } TEST_CASE("task co_return void", "[task]") { auto t = simple_void(); // Start the coroutine - t.handle().resume(); + get_handle(t).resume(); // Should complete without error - REQUIRE(t.handle().done()); + REQUIRE(get_handle(t).done()); } TEST_CASE("task stores exception", "[task]") { auto t = throwing_coroutine(); // Start the coroutine - t.handle().resume(); + get_handle(t).resume(); // The promise should have an exception - REQUIRE(t.handle().promise().exception() != nullptr); + REQUIRE(get_handle(t).promise().exception() != nullptr); } TEST_CASE("task co_await basic", "[task]") { @@ -94,18 +104,18 @@ TEST_CASE("task co_await basic", "[task]") { }; auto t = outer(); - t.handle().resume(); + get_handle(t).resume(); // The outer task should have result 43 - REQUIRE(t.handle().promise().value_.value() == 43); + REQUIRE(get_handle(t).promise().value_.value() == 43); } TEST_CASE("task nested co_await", "[task]") { auto t = nested_outer(); - t.handle().resume(); + get_handle(t).resume(); // The outer coroutine should return 20 (10 * 2) - REQUIRE(t.handle().promise().value_.value() == 20); + REQUIRE(get_handle(t).promise().value_.value() == 20); } TEST_CASE("task exception propagation via co_await", "[task]") { @@ -120,32 +130,54 @@ TEST_CASE("task exception propagation via co_await", "[task]") { }; auto t = outer(); - t.handle().resume(); + get_handle(t).resume(); // Should complete without unhandled exception - REQUIRE(t.handle().done()); + REQUIRE(get_handle(t).done()); } -TEST_CASE("task virtual stack integration", "[task]") { - auto inner = []() -> task { - // Inside inner coroutine, virtual stack should be at least 1 deep - size_t depth = get_stack_depth(); - REQUIRE(depth >= 1); - co_return 100; - }; +TEST_CASE("task virtual stack integration", "[task][.integration]") { + // This test requires scheduler context, run with elio::run() + scheduler 
sched(2); + sched.start(); + + std::atomic passed{false}; - auto outer = [&]() -> task { - size_t outer_depth = get_stack_depth(); - int result = co_await inner(); - size_t inner_depth = get_stack_depth(); + auto test_coro = []() -> task { + auto inner = []() -> task { + // Inside inner coroutine, virtual stack should be at least 1 deep + size_t depth = get_stack_depth(); + REQUIRE(depth >= 1); + co_return 100; + }; + + auto outer = [&]() -> task { + size_t outer_depth = get_stack_depth(); + int result = co_await inner(); + size_t after_depth = get_stack_depth(); + + // After co_await, we should be back to outer depth + REQUIRE(after_depth == outer_depth); + co_return result; + }; - // After co_await, we should be back to outer depth - REQUIRE(inner_depth == outer_depth); - co_return result; + int result = co_await outer(); + REQUIRE(result == 100); + co_return; }; - auto t = outer(); - t.handle().resume(); + auto driver = [&]() -> task { + co_await test_coro(); + passed.store(true); + }; + + elio::go(driver); + + std::this_thread::sleep_for(scaled_ms(200)); + + sched.shutdown(); + + REQUIRE(passed.load()); } TEST_CASE("task multiple levels", "[task]") { @@ -160,9 +192,9 @@ TEST_CASE("task multiple levels", "[task]") { }; auto t = level1(); - t.handle().resume(); + get_handle(t).resume(); - REQUIRE(t.handle().promise().value_.value() == 3); + REQUIRE(get_handle(t).promise().value_.value() == 3); } TEST_CASE("task exception propagation", "[task]") { @@ -181,15 +213,15 @@ TEST_CASE("task exception propagation", "[task]") { }; auto t = catcher(); - t.handle().resume(); - REQUIRE(t.handle().done()); + get_handle(t).resume(); + REQUIRE(get_handle(t).done()); } // ============================================================================ -// Tests for new task spawning API: go(), spawn(), join_handle +// Tests for new task spawning API: elio::go(), elio::spawn(), join_handle // ============================================================================ 
-TEST_CASE("task::go() spawns fire-and-forget task", "[task][spawn]") { +TEST_CASE("elio::go() spawns fire-and-forget task", "[task][spawn]") { scheduler sched(2); sched.start(); @@ -200,8 +232,8 @@ TEST_CASE("task::go() spawns fire-and-forget task", "[task][spawn]") { co_return; }; - // Use go() to spawn fire-and-forget - coro().go(); + // Use elio::go() to spawn fire-and-forget + elio::go(coro); // Wait for execution std::this_thread::sleep_for(scaled_ms(100)); @@ -211,7 +243,7 @@ TEST_CASE("task::go() spawns fire-and-forget task", "[task][spawn]") { sched.shutdown(); } -TEST_CASE("task::go() spawns fire-and-forget task with value", "[task][spawn]") { +TEST_CASE("elio::go() spawns fire-and-forget task with value", "[task][spawn]") { scheduler sched(2); sched.start(); @@ -222,7 +254,7 @@ TEST_CASE("task::go() spawns fire-and-forget task with value", "[task][spaw co_return 42; // Value is discarded in fire-and-forget }; - coro().go(); + elio::go(coro); std::this_thread::sleep_for(scaled_ms(100)); @@ -231,7 +263,7 @@ TEST_CASE("task::go() spawns fire-and-forget task with value", "[task][spaw sched.shutdown(); } -TEST_CASE("task::spawn() returns joinable handle", "[task][spawn][join_handle]") { +TEST_CASE("elio::spawn() returns joinable handle", "[task][spawn][join_handle]") { scheduler sched(2); sched.start(); @@ -242,13 +274,13 @@ TEST_CASE("task::spawn() returns joinable handle", "[task][spawn][join_handle]") }; auto driver = [&]() -> task { - auto handle = compute().spawn(); + auto handle = elio::spawn(compute); int result = co_await handle; REQUIRE(result == 100); completed.store(true); }; - driver().go(); + elio::go(driver); std::this_thread::sleep_for(scaled_ms(200)); @@ -257,7 +289,7 @@ TEST_CASE("task::spawn() returns joinable handle", "[task][spawn][join_handle]") sched.shutdown(); } -TEST_CASE("task::spawn() returns joinable handle", "[task][spawn][join_handle]") { +TEST_CASE("elio::spawn() with void task returns joinable handle", 
"[task][spawn][join_handle]") { scheduler sched(2); sched.start(); @@ -270,13 +302,13 @@ TEST_CASE("task::spawn() returns joinable handle", "[task][spawn][join_han }; auto driver = [&]() -> task { - auto handle = work().spawn(); + auto handle = elio::spawn(work); co_await handle; REQUIRE(counter.load() == 1); completed.store(true); }; - driver().go(); + elio::go(driver); std::this_thread::sleep_for(scaled_ms(200)); @@ -298,7 +330,7 @@ TEST_CASE("join_handle propagates exceptions", "[task][spawn][join_handle]") { auto catcher = [&]() -> task { try { - auto handle = thrower().spawn(); + auto handle = elio::spawn(thrower); co_await handle; FAIL("Should have thrown"); } catch (const std::runtime_error& e) { @@ -307,7 +339,7 @@ TEST_CASE("join_handle propagates exceptions", "[task][spawn][join_handle]") { } }; - catcher().go(); + elio::go(catcher); std::this_thread::sleep_for(scaled_ms(200)); @@ -316,7 +348,7 @@ TEST_CASE("join_handle propagates exceptions", "[task][spawn][join_handle]") { sched.shutdown(); } -TEST_CASE("multiple spawn() tasks run concurrently", "[task][spawn][join_handle]") { +TEST_CASE("multiple elio::spawn() tasks run concurrently", "[task][spawn][join_handle]") { scheduler sched(4); sched.start(); @@ -336,9 +368,9 @@ TEST_CASE("multiple spawn() tasks run concurrently", "[task][spawn][join_handle] }; auto driver = [&]() -> task { - auto h1 = work().spawn(); - auto h2 = work().spawn(); - auto h3 = work().spawn(); + auto h1 = elio::spawn(work); + auto h2 = elio::spawn(work); + auto h3 = elio::spawn(work); co_await h1; co_await h2; @@ -347,7 +379,7 @@ TEST_CASE("multiple spawn() tasks run concurrently", "[task][spawn][join_handle] completed.store(true); }; - driver().go(); + elio::go(driver); std::this_thread::sleep_for(scaled_ms(500)); @@ -370,7 +402,7 @@ TEST_CASE("join_handle::is_ready() reflects completion state", "[task][spawn][jo }; auto driver = [&]() -> task { - auto handle = slow_task().spawn(); + auto handle = elio::spawn(slow_task); // 
Initially not ready bool was_not_ready = !handle.is_ready(); @@ -384,7 +416,7 @@ TEST_CASE("join_handle::is_ready() reflects completion state", "[task][spawn][jo test_passed.store(was_not_ready && is_now_ready && result == 42); }; - driver().go(); + elio::go(driver); std::this_thread::sleep_for(scaled_ms(300)); @@ -393,7 +425,7 @@ TEST_CASE("join_handle::is_ready() reflects completion state", "[task][spawn][jo sched.shutdown(); } -TEST_CASE("scheduler::spawn() accepts task directly", "[scheduler][spawn]") { +TEST_CASE("elio::go() works with scheduler context", "[scheduler][spawn]") { scheduler sched(2); sched.start(); @@ -404,8 +436,8 @@ TEST_CASE("scheduler::spawn() accepts task directly", "[scheduler][spawn]") { co_return; }; - // New API: spawn task directly without calling release() - sched.spawn(coro()); + // Use elio::go() to spawn fire-and-forget task + elio::go(coro); std::this_thread::sleep_for(scaled_ms(100)); @@ -414,7 +446,7 @@ TEST_CASE("scheduler::spawn() accepts task directly", "[scheduler][spawn]") { sched.shutdown(); } -TEST_CASE("scheduler::spawn() accepts task directly", "[scheduler][spawn]") { +TEST_CASE("elio::go() works with task", "[scheduler][spawn]") { scheduler sched(2); sched.start(); @@ -425,7 +457,7 @@ TEST_CASE("scheduler::spawn() accepts task directly", "[scheduler][spawn]") co_return 99; }; - sched.spawn(coro()); + elio::go(coro); std::this_thread::sleep_for(scaled_ms(100)); diff --git a/tests/unit/test_timer.cpp b/tests/unit/test_timer.cpp index dee502f..485a027 100644 --- a/tests/unit/test_timer.cpp +++ b/tests/unit/test_timer.cpp @@ -11,6 +11,22 @@ using namespace elio::coro; using namespace elio::runtime; using namespace std::chrono_literals; +// Helper to access handle from task +template +auto get_handle(task& t) { + return elio::coro::detail::task_access::handle(t); +} + +// Helper to spawn a task to scheduler +template +void spawn_task(scheduler& sched, task& t) { + elio::coro::detail::heap_alloc_guard guard; + auto handle = 
elio::coro::detail::task_access::release(t); + auto* vstack = new elio::coro::vthread_stack(); + handle.promise().set_vstack_owner(vstack); + sched.spawn(handle); +} + TEST_CASE("sleep_for basic", "[time][sleep]") { std::atomic completed{false}; @@ -31,7 +47,7 @@ TEST_CASE("sleep_for basic", "[time][sleep]") { { auto t = sleep_task(); - sched.spawn(t.release()); // Transfer ownership to scheduler + spawn_task(sched, t); // Transfer ownership to scheduler } // Wait for completion @@ -53,7 +69,7 @@ TEST_CASE("sleep_for zero duration", "[time][sleep]") { }; auto t = sleep_task(); - t.handle().resume(); + get_handle(t).resume(); REQUIRE(completed); } @@ -76,8 +92,8 @@ TEST_CASE("yield execution", "[time][yield]") { { auto t1 = yield_task(); auto t2 = yield_task(); - sched.spawn(t1.release()); - sched.spawn(t2.release()); + spawn_task(sched, t1); + spawn_task(sched, t2); } // Wait for completion @@ -114,7 +130,7 @@ TEST_CASE("multiple sleeps sequential", "[time][sleep]") { { auto t = multi_sleep(); - sched.spawn(t.release()); + spawn_task(sched, t); } for (int i = 0; i < 100 && !completed; ++i) { @@ -145,7 +161,7 @@ TEST_CASE("sleep_until", "[time][sleep]") { { auto t = sleep_until_task(); - sched.spawn(t.release()); + spawn_task(sched, t); } for (int i = 0; i < 100 && !completed; ++i) { @@ -167,7 +183,7 @@ TEST_CASE("sleep_until past time", "[time][sleep]") { }; auto t = past_sleep(); - t.handle().resume(); + get_handle(t).resume(); REQUIRE(completed); } @@ -189,7 +205,7 @@ TEST_CASE("cancellable sleep - normal completion", "[time][sleep][cancel]") { { auto t = sleep_task(); - sched.spawn(t.release()); + spawn_task(sched, t); } // Wait for completion without cancelling @@ -226,7 +242,7 @@ TEST_CASE("cancellable sleep - cancelled early", "[time][sleep][cancel]") { { auto t = sleep_task(); - sched.spawn(t.release()); + spawn_task(sched, t); } // Wait a bit then cancel @@ -268,7 +284,7 @@ TEST_CASE("cancellable sleep - already cancelled token", "[time][sleep][cancel]" { 
auto t = sleep_task(); - sched.spawn(t.release()); + spawn_task(sched, t); } // Wait for completion diff --git a/tests/unit/test_vthread_stack.cpp b/tests/unit/test_vthread_stack.cpp new file mode 100644 index 0000000..af9f255 --- /dev/null +++ b/tests/unit/test_vthread_stack.cpp @@ -0,0 +1,543 @@ +#include +#include +#include +#include +#include +#include +#include +#include "../test_main.cpp" // For scaled timeouts + +using namespace elio::coro; +using namespace elio::runtime; +using namespace elio::test; + +// ============================================================================ +// vthread_stack basic allocation/deallocation (LIFO correctness) +// ============================================================================ + +TEST_CASE("vthread_stack basic allocation and deallocation", "[vthread_stack]") { + vthread_stack stack; + + // Allocate several blocks + void* p1 = stack.push(64); + void* p2 = stack.push(128); + void* p3 = stack.push(256); + + REQUIRE(p1 != nullptr); + REQUIRE(p2 != nullptr); + REQUIRE(p3 != nullptr); + + // All pointers should be different + REQUIRE(p1 != p2); + REQUIRE(p2 != p3); + REQUIRE(p1 != p3); + + // Pop in reverse order (LIFO) + stack.pop(p3, 256); + stack.pop(p2, 128); + stack.pop(p1, 64); +} + +TEST_CASE("vthread_stack LIFO order verification", "[vthread_stack]") { + vthread_stack stack; + + // Push multiple allocations and store their addresses + std::vector> allocs; + for (size_t i = 1; i <= 10; ++i) { + size_t size = i * 16; + void* p = stack.push(size); + allocs.push_back({p, size}); + } + + // Pop in reverse order - should not fail assertions + for (auto it = allocs.rbegin(); it != allocs.rend(); ++it) { + stack.pop(it->first, it->second); + } +} + +// ============================================================================ +// Segment growth test +// ============================================================================ + +TEST_CASE("vthread_stack segment growth", "[vthread_stack]") { + vthread_stack 
stack; + + // Default segment size is 16KB, allocate more to trigger new segment + constexpr size_t large_size = 8192; // 8KB each + + void* p1 = stack.push(large_size); + void* p2 = stack.push(large_size); + void* p3 = stack.push(large_size); // This should trigger a new segment + + REQUIRE(p1 != nullptr); + REQUIRE(p2 != nullptr); + REQUIRE(p3 != nullptr); + + // All should be valid allocations + // Pop in reverse order + stack.pop(p3, large_size); + stack.pop(p2, large_size); + stack.pop(p1, large_size); +} + +TEST_CASE("vthread_stack oversized allocation", "[vthread_stack]") { + vthread_stack stack; + + // Allocate larger than default segment size + constexpr size_t huge_size = 32768; // 32KB > 16KB default + + void* p = stack.push(huge_size); + REQUIRE(p != nullptr); + + stack.pop(p, huge_size); +} + +// ============================================================================ +// vthread_stack static API (thread-local current) +// ============================================================================ + +TEST_CASE("vthread_stack thread-local current", "[vthread_stack]") { + // Initially no current stack + REQUIRE(vthread_stack::current() == nullptr); + + vthread_stack stack; + vthread_stack::set_current(&stack); + REQUIRE(vthread_stack::current() == &stack); + + vthread_stack::set_current(nullptr); + REQUIRE(vthread_stack::current() == nullptr); +} + +TEST_CASE("vthread_stack thread-local isolation", "[vthread_stack]") { + vthread_stack main_stack; + vthread_stack::set_current(&main_stack); + + std::atomic worker_isolated{false}; + + std::thread worker([&]() { + // Worker thread should have no current stack + worker_isolated = (vthread_stack::current() == nullptr); + + vthread_stack worker_stack; + vthread_stack::set_current(&worker_stack); + + // Worker's current should be different from main's + worker_isolated = worker_isolated && (vthread_stack::current() != &main_stack); + + vthread_stack::set_current(nullptr); + }); + + worker.join(); + + 
REQUIRE(worker_isolated.load()); + REQUIRE(vthread_stack::current() == &main_stack); + + vthread_stack::set_current(nullptr); +} + +// ============================================================================ +// task on-site co_await evaluation +// ============================================================================ + +namespace { +task compute(int x) { + co_return x * 2; +} + +task test_basic_await_impl() { + int val = co_await compute(21); + REQUIRE(val == 42); +} +} + +TEST_CASE("task co_await basic", "[vthread_stack]") { + elio::run(test_basic_await_impl); +} + +// ============================================================================ +// task symmetric test +// ============================================================================ + +namespace { +task void_work() { + co_return; +} + +task test_void_await_impl() { + co_await void_work(); + // If we reach here, void await worked +} +} + +TEST_CASE("task co_await", "[vthread_stack]") { + elio::run(test_void_await_impl); +} + +// ============================================================================ +// Nested call chain: LIFO correctness for multi-layer co_await +// ============================================================================ + +namespace { +task level3() { co_return 1; } +task level2() { co_return co_await level3() + 1; } +task level1() { co_return co_await level2() + 1; } + +task test_nested_impl() { + int result = co_await level1(); + REQUIRE(result == 3); // 1 + 1 + 1 +} +} + +TEST_CASE("nested co_await chain LIFO", "[vthread_stack]") { + elio::run(test_nested_impl); +} + +// ============================================================================ +// elio::go(func) — no-argument function +// ============================================================================ + +TEST_CASE("elio::go() with no-arg function", "[vthread_stack][spawn]") { + scheduler sched(2); + sched.start(); + + std::atomic done{false}; + + auto work = [&done]() -> task { + 
done.store(true); + co_return; + }; + + elio::go(work); + + // Wait for completion + std::this_thread::sleep_for(scaled_ms(100)); + + REQUIRE(done.load()); + + sched.shutdown(); +} + +// ============================================================================ +// elio::go(func, args...) — with arguments +// ============================================================================ + +namespace { +task work_with_args(std::atomic* counter, int increment) { + counter->fetch_add(increment); + co_return; +} +} + +TEST_CASE("elio::go() with arguments", "[vthread_stack][spawn]") { + scheduler sched(2); + sched.start(); + + std::atomic counter{0}; + + elio::go(work_with_args, &counter, 10); + + std::this_thread::sleep_for(scaled_ms(100)); + + REQUIRE(counter.load() == 10); + + sched.shutdown(); +} + +// ============================================================================ +// elio::spawn(func) — returns join_handle +// ============================================================================ + +TEST_CASE("elio::spawn() returns join_handle", "[vthread_stack][spawn]") { + scheduler sched(2); + sched.start(); + + std::atomic completed{false}; + + auto driver = [&]() -> task { + auto h = elio::spawn(compute, 21); + int result = co_await h; + REQUIRE(result == 42); + completed.store(true); + }; + + elio::go(driver); + + std::this_thread::sleep_for(scaled_ms(200)); + + REQUIRE(completed.load()); + + sched.shutdown(); +} + +// ============================================================================ +// elio::spawn(func, args...) 
— with arguments +// ============================================================================ + +namespace { +task add_values(int a, int b) { + co_return a + b; +} +} + +TEST_CASE("elio::spawn() with arguments", "[vthread_stack][spawn]") { + scheduler sched(2); + sched.start(); + + std::atomic completed{false}; + + auto driver = [&]() -> task { + auto h = elio::spawn(add_values, 10, 20); + int result = co_await h; + REQUIRE(result == 30); + completed.store(true); + }; + + elio::go(driver); + + std::this_thread::sleep_for(scaled_ms(200)); + + REQUIRE(completed.load()); + + sched.shutdown(); +} + +// ============================================================================ +// ELIO_GO(expr) / ELIO_SPAWN(expr) macro forms +// ============================================================================ + +TEST_CASE("ELIO_GO macro", "[vthread_stack][spawn]") { + scheduler sched(2); + sched.start(); + + std::atomic done{false}; + + auto work = [&done]() -> task { + done.store(true); + co_return; + }; + + ELIO_GO(work()); + + std::this_thread::sleep_for(scaled_ms(100)); + + REQUIRE(done.load()); + + sched.shutdown(); +} + +TEST_CASE("ELIO_SPAWN macro", "[vthread_stack][spawn]") { + scheduler sched(2); + sched.start(); + + std::atomic completed{false}; + + auto driver = [&]() -> task { + auto h = ELIO_SPAWN(compute(21)); + int result = co_await h; + REQUIRE(result == 42); + completed.store(true); + }; + + elio::go(driver); + + std::this_thread::sleep_for(scaled_ms(200)); + + REQUIRE(completed.load()); + + sched.shutdown(); +} + +// ============================================================================ +// elio::run(func) — entry execution +// ============================================================================ + +TEST_CASE("elio::run() executes task to completion", "[vthread_stack]") { + auto main_task = []() -> task { + co_return 42; + }; + + int result = elio::run(main_task); + REQUIRE(result == 42); +} + +TEST_CASE("elio::run() with void task", 
"[vthread_stack]") { + std::atomic executed{false}; + + auto main_task = [&executed]() -> task { + executed.store(true); + co_return; + }; + + elio::run(main_task); + REQUIRE(executed.load()); +} + +// ============================================================================ +// Mixed scenario: elio::go + internal co_await chain +// ============================================================================ + +TEST_CASE("elio::go with internal co_await chain", "[vthread_stack][spawn]") { + scheduler sched(2); + sched.start(); + + std::atomic final_result{0}; + + auto complex_task = [&]() -> task { + // Multi-level co_await chain inside a go'd task + int r1 = co_await level1(); // Returns 3 + int r2 = co_await compute(r1); // 3 * 2 = 6 + int r3 = co_await add_values(r2, 4); // 6 + 4 = 10 + final_result.store(r3); + }; + + elio::go(complex_task); + + std::this_thread::sleep_for(scaled_ms(200)); + + REQUIRE(final_result.load() == 10); + + sched.shutdown(); +} + +// ============================================================================ +// Independent vstack isolation verification +// ============================================================================ + +TEST_CASE("independent vstack isolation between spawned coroutines", "[vthread_stack][spawn]") { + scheduler sched(4); + sched.start(); + + // Each spawned coroutine should have its own vstack + std::atomic unique_vstacks{0}; + std::atomic completed{0}; + std::vector observed_vstacks; + std::mutex vstacks_mutex; + + auto worker = [&]([[maybe_unused]] int id) -> task { + auto* my_vstack = vthread_stack::current(); + + { + std::lock_guard lock(vstacks_mutex); + // Check if this vstack was seen before + bool is_unique = true; + for (auto* vs : observed_vstacks) { + if (vs == my_vstack) { + is_unique = false; + break; + } + } + if (is_unique && my_vstack != nullptr) { + observed_vstacks.push_back(my_vstack); + unique_vstacks.fetch_add(1); + } + } + + completed.fetch_add(1); + co_return; + }; + + // Spawn 
multiple tasks + constexpr int num_tasks = 10; + for (int i = 0; i < num_tasks; ++i) { + elio::go([&worker, i]() { return worker(i); }); + } + + // Wait for all to complete + while (completed.load() < num_tasks) { + std::this_thread::sleep_for(scaled_ms(10)); + } + + // Each task should have had a unique vstack + REQUIRE(unique_vstacks.load() == num_tasks); + + sched.shutdown(); +} + +// ============================================================================ +// Exception propagation +// ============================================================================ + +namespace { +task throwing_task() { + throw std::runtime_error("test exception"); + co_return 0; +} +} + +TEST_CASE("exception propagation through co_await", "[vthread_stack]") { + auto test_task = []() -> task { + bool caught = false; + try { + co_await throwing_task(); + } catch (const std::runtime_error& e) { + REQUIRE(std::string(e.what()) == "test exception"); + caught = true; + } + REQUIRE(caught); + }; + + elio::run(test_task); +} + +TEST_CASE("exception propagation through spawn", "[vthread_stack][spawn]") { + scheduler sched(2); + sched.start(); + + std::atomic caught_exception{false}; + + auto catcher = [&]() -> task { + try { + auto h = elio::spawn(throwing_task); + co_await h; + } catch (const std::runtime_error& e) { + REQUIRE(std::string(e.what()) == "test exception"); + caught_exception.store(true); + } + }; + + elio::go(catcher); + + std::this_thread::sleep_for(scaled_ms(200)); + + REQUIRE(caught_exception.load()); + + sched.shutdown(); +} + +// ============================================================================ +// task immovability compile-time verification +// ============================================================================ + +TEST_CASE("task is non-movable and non-copyable", "[vthread_stack]") { + STATIC_REQUIRE_FALSE(std::is_move_constructible_v>); + STATIC_REQUIRE_FALSE(std::is_move_assignable_v>); + STATIC_REQUIRE_FALSE(std::is_copy_constructible_v>); + 
STATIC_REQUIRE_FALSE(std::is_copy_assignable_v>); + + STATIC_REQUIRE_FALSE(std::is_move_constructible_v>); + STATIC_REQUIRE_FALSE(std::is_move_assignable_v>); + STATIC_REQUIRE_FALSE(std::is_copy_constructible_v>); + STATIC_REQUIRE_FALSE(std::is_copy_assignable_v>); +} + +// ============================================================================ +// Deep nested co_await test +// ============================================================================ + +namespace { +task deep_recursion(int depth) { + if (depth <= 0) { + co_return 1; + } + co_return co_await deep_recursion(depth - 1) + 1; +} +} + +TEST_CASE("deep nested co_await chain", "[vthread_stack]") { + auto test = []() -> task { + int result = co_await deep_recursion(10); + REQUIRE(result == 11); // 10 levels + 1 base + }; + + elio::run(test); +}