From 505ac73c6b3e87febde5af39eb43d44affdc5b15 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 2 Mar 2026 10:41:28 -0800 Subject: [PATCH 1/2] Print runner stderr in Windows E2E test script The Voxtral CUDA Windows E2E test is failing with exit code -1073740791 and no useful diagnostics. The runner's stderr was being captured to a file but silently deleted without printing. Surface it in CI output so we can diagnose the crash. This PR was authored with the assistance of Claude. --- .ci/scripts/test_model_e2e_windows.ps1 | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.ci/scripts/test_model_e2e_windows.ps1 b/.ci/scripts/test_model_e2e_windows.ps1 index 92022d1ac4c..fecff3b84c2 100644 --- a/.ci/scripts/test_model_e2e_windows.ps1 +++ b/.ci/scripts/test_model_e2e_windows.ps1 @@ -186,14 +186,17 @@ try { -RedirectStandardError $stderrFile $stdout = if (Test-Path -Path $stdoutFile -PathType Leaf) { Get-Content -Path $stdoutFile -Raw } else { "" } + $stderr = if (Test-Path -Path $stderrFile -PathType Leaf) { Get-Content -Path $stderrFile -Raw } else { "" } $exitCode = $proc.ExitCode } finally { Remove-Item -Path $stdoutFile -ErrorAction SilentlyContinue Remove-Item -Path $stderrFile -ErrorAction SilentlyContinue } - Write-Host "Runner output:" + Write-Host "Runner stdout:" Write-Host $stdout + Write-Host "Runner stderr:" + Write-Host $stderr if ($exitCode -ne 0) { Write-Warning "Runner exited with code $exitCode (may be benign)" From d734f75fbec6279ff4fc667e175b686196fc6d94 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 2 Mar 2026 13:30:35 -0800 Subject: [PATCH 2/2] Fix Voxtral CUDA runner crash on Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The IRunner refactoring in #17741 split generate() into separate prefill() and decode_from_token() calls. 
On Windows, calling any sub-method from generate() triggers STATUS_STACK_BUFFER_OVERRUN (0xC0000409) — this appears to be a Windows-specific issue with the function call pattern (confirmed by SSH debugging: inlining the prefill loop works, but calling it as a method crashes even with 8 MB stack). Fix by restoring the monolithic generate(vector, ...) implementation that keeps all prefill and decode logic inline, matching the pre-#17741 pattern that works on Windows. The separate prefill() and decode_from_token() methods are retained for external callers and the prefill-then-generate workflow. Also: - Pass std::function params to decode_from_token by const ref - Increase voxtral_runner stack to 8 MB on Windows as a safety net - Print runner stderr in the Windows E2E test for diagnostics This PR was authored with the assistance of Claude. --- examples/models/voxtral/CMakeLists.txt | 6 + extension/llm/runner/multimodal_runner.cpp | 131 +++++++++++++++++---- extension/llm/runner/multimodal_runner.h | 4 +- 3 files changed, 116 insertions(+), 25 deletions(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 80baaa28ff4..036e6454efe 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -114,6 +114,12 @@ target_include_directories(voxtral_runner PUBLIC ${_common_include_directories}) target_link_libraries(voxtral_runner PUBLIC ${link_libraries}) target_compile_options(voxtral_runner PUBLIC ${_common_compile_options}) +# AOTI-generated CUDA code can use significant stack depth; the Windows default +# of 1 MB is not enough for large multimodal models. 
+if(WIN32) + target_link_options(voxtral_runner PRIVATE "/STACK:8388608") +endif() + # On Windows, copy required DLLs to the executable directory if(MSVC AND EXECUTORCH_BUILD_CUDA) add_custom_command( diff --git a/extension/llm/runner/multimodal_runner.cpp b/extension/llm/runner/multimodal_runner.cpp index 06dbbbd11da..9afaab0b97e 100644 --- a/extension/llm/runner/multimodal_runner.cpp +++ b/extension/llm/runner/multimodal_runner.cpp @@ -89,9 +86,6 @@ Result<uint64_t> MultimodalRunner::prefill( const std::vector<MultimodalInput>& inputs, int32_t num_bos, int32_t num_eos) { - if (!is_loaded()) { - ET_CHECK_OK_OR_RETURN_ERROR(load()); - } uint64_t last_token = 0; for (size_t i = 0; i < inputs.size(); ++i) { const auto& input = inputs[i]; @@ -129,8 +126,8 @@ Result<uint64_t> MultimodalRunner::prefill( Error MultimodalRunner::decode_from_token( uint64_t cur_token, const GenerationConfig& config, - std::function<void(const std::string&)> wrapped_callback, - std::function<void(const Stats&)> stats_callback) { + const std::function<void(const std::string&)>& wrapped_callback, + const std::function<void(const Stats&)>& stats_callback) { stats_->first_token_ms = time_in_ms(); stats_->prompt_eval_end_ms = time_in_ms(); stats_->num_prompt_tokens = pos_; @@ -233,6 +230,11 @@ Error MultimodalRunner::generate( const GenerationConfig& config, std::function<void(const std::string&)> token_callback, std::function<void(const Stats&)> stats_callback) { + if (inputs.empty()) { + ET_LOG(Error, "MultimodalInput vector cannot be empty"); + return Error::InvalidArgument; + } + if (!is_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(load()); } @@ -261,29 +263,112 @@ Error MultimodalRunner::generate( // Reset internal state and start inference stats_->inference_start_ms = time_in_ms(); - uint64_t cur_token = 0; - if (!inputs.empty()) { - // Echo the last text input if enabled - if (config.echo && inputs.back().is_text()) { - wrapped_callback(inputs.back().get_text()); + uint64_t prefill_next_token = 0; + // Process multimodal inputs in order + for (size_t i = 0; i < inputs.size(); ++i) { + const MultimodalInput& input = inputs[i]; + if (config.echo && i == inputs.size()
- 1 && input.is_text()) { + wrapped_callback(input.get_text()); } + int32_t bos = 0; + int32_t eos = 0; + if (i == 0 && pos_ == 0) { + if (input.is_text() || input.is_tokens()) { + bos = config.num_bos; + eos = config.num_eos; + } + } + auto prefill_result = multimodal_prefiller_->prefill(input, pos_, bos, eos); + if (!prefill_result.ok()) { + return prefill_result.error(); + } + prefill_next_token = prefill_result.get(); + } - // Prefill all inputs and get the first decode token - auto prefill_result = prefill(inputs, config.num_bos, config.num_eos); - ET_CHECK_OK_OR_RETURN_ERROR(prefill_result.error()); - cur_token = prefill_result.get(); - prefill_next_token_.reset(); + stats_->first_token_ms = time_in_ms(); + stats_->prompt_eval_end_ms = time_in_ms(); + stats_->num_prompt_tokens = pos_; + + auto decode_result = + tokenizer_->decode(prefill_next_token, prefill_next_token); + if (!decode_result.ok()) { + ET_LOG( + Error, + "Tokenizers error code %d", + static_cast<int32_t>(decode_result.error())); + return Error::InvalidArgument; + } + wrapped_callback(std::move(*decode_result)); + + RUNNER_ET_LOG( + config.warming, + "RSS after multimodal input processing: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + // Resolve max_new_tokens based on config + int64_t max_context_len = metadata_.at(kMaxContextLen); + int32_t max_new_tokens = config.resolve_max_new_tokens(max_context_len, pos_); + + ET_LOG( + Info, + "Max new tokens resolved: %d, pos_ %" PRId64 ", max_context_len %" PRId64, + max_new_tokens, + pos_, + max_context_len); + + ET_CHECK_OR_RETURN_ERROR( + max_new_tokens > 0, + InvalidArgument, + "Max new tokens %d is less than or equal to 0", + max_new_tokens); + + // Set ignore_eos based on config + text_token_generator_->set_ignore_eos(config.ignore_eos); + + // Generate tokens using the text token generator + std::vector<uint64_t> prompt_tokens = {prefill_next_token}; + auto generate_result = text_token_generator_->generate( + /*tokens=*/prompt_tokens, + 
/*start_pos=*/pos_, + /*max_new_tokens=*/max_new_tokens - + 1, // Subtract 1 because prefill already generated 1 token + /*temperature=*/config.temperature, + /*token_callback=*/wrapped_callback); + if (!generate_result.ok()) { + return generate_result.error(); + } + int64_t num_generated_tokens = generate_result.get(); + + pos_ += num_generated_tokens; + // Update stats + stats_->num_generated_tokens = num_generated_tokens; + // Finalize stats and call callback + stats_->inference_end_ms = time_in_ms(); + +#ifdef CUDA_AVAILABLE + cuda_memory_tracker_->log_sample("after_generate"); + stats_->gpu_free_after_generate_bytes = + cuda_memory_tracker_->last_free_bytes(); + // update peak in case it changed after generation + stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb(); +#endif + + if (!config.warming) { + printf("\n"); + } + + if (config.warming) { + ET_LOG(Info, "Warmup run finished!"); } else { - // Empty inputs: consume token from a prior prefill() call - ET_CHECK_OR_RETURN_ERROR( - prefill_next_token_.has_value(), - InvalidState, - "Empty inputs requires a prior prefill() call"); - cur_token = prefill_next_token_.value(); - prefill_next_token_.reset(); + // Do not print report during warmup + print_report(*stats_); + } + + if (stats_callback) { + stats_callback(*stats_); } - return decode_from_token(cur_token, config, wrapped_callback, stats_callback); + return Error::Ok; } } // namespace executorch::extension::llm diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h index 797bcd9c728..7c87f7bff5a 100644 --- a/extension/llm/runner/multimodal_runner.h +++ b/extension/llm/runner/multimodal_runner.h @@ -202,8 +202,8 @@ class ET_EXPERIMENTAL MultimodalRunner : public IRunner { ::executorch::runtime::Error decode_from_token( uint64_t cur_token, const GenerationConfig& config, - std::function<void(const std::string&)> wrapped_callback, - std::function<void(const Stats&)> stats_callback); + const std::function<void(const std::string&)>& wrapped_callback, + const std::function<void(const Stats&)>&
stats_callback); }; } // namespace llm