From 505ac73c6b3e87febde5af39eb43d44affdc5b15 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 2 Mar 2026 10:41:28 -0800 Subject: [PATCH 1/2] Print runner stderr in Windows E2E test script The Voxtral CUDA Windows E2E test is failing with exit code -1073740791 and no useful diagnostics. The runner's stderr was being captured to a file but silently deleted without printing. Surface it in CI output so we can diagnose the crash. This PR was authored with the assistance of Claude. --- .ci/scripts/test_model_e2e_windows.ps1 | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.ci/scripts/test_model_e2e_windows.ps1 b/.ci/scripts/test_model_e2e_windows.ps1 index 92022d1ac4c..fecff3b84c2 100644 --- a/.ci/scripts/test_model_e2e_windows.ps1 +++ b/.ci/scripts/test_model_e2e_windows.ps1 @@ -186,14 +186,17 @@ try { -RedirectStandardError $stderrFile $stdout = if (Test-Path -Path $stdoutFile -PathType Leaf) { Get-Content -Path $stdoutFile -Raw } else { "" } + $stderr = if (Test-Path -Path $stderrFile -PathType Leaf) { Get-Content -Path $stderrFile -Raw } else { "" } $exitCode = $proc.ExitCode } finally { Remove-Item -Path $stdoutFile -ErrorAction SilentlyContinue Remove-Item -Path $stderrFile -ErrorAction SilentlyContinue } - Write-Host "Runner output:" + Write-Host "Runner stdout:" Write-Host $stdout + Write-Host "Runner stderr:" + Write-Host $stderr if ($exitCode -ne 0) { Write-Warning "Runner exited with code $exitCode (may be benign)" From d734f75fbec6279ff4fc667e175b686196fc6d94 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 2 Mar 2026 13:30:35 -0800 Subject: [PATCH 2/2] Fix Voxtral CUDA runner crash on Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The IRunner refactoring in #17741 split generate() into separate prefill() and decode_from_token() calls. 
On Windows, calling any sub-method from generate() triggers STATUS_STACK_BUFFER_OVERRUN (0xC0000409) — this appears to be a Windows-specific issue with the function call pattern (confirmed by SSH debugging: inlining the prefill loop works, but calling it as a method crashes even with 8 MB stack). Fix by restoring the monolithic generate(vector, ...) implementation that keeps all prefill and decode logic inline, matching the pre-#17741 pattern that works on Windows. The separate prefill() and decode_from_token() methods are retained for external callers and the prefill-then-generate workflow. Also: - Pass std::function params to decode_from_token by const ref - Increase voxtral_runner stack to 8 MB on Windows as a safety net - Print runner stderr in the Windows E2E test for diagnostics This PR was authored with the assistance of Claude. --- examples/models/voxtral/CMakeLists.txt | 6 + extension/llm/runner/multimodal_runner.cpp | 131 +++++++++++++++++---- extension/llm/runner/multimodal_runner.h | 4 +- 3 files changed, 116 insertions(+), 25 deletions(-) diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 80baaa28ff4..036e6454efe 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -114,6 +114,12 @@ target_include_directories(voxtral_runner PUBLIC ${_common_include_directories}) target_link_libraries(voxtral_runner PUBLIC ${link_libraries}) target_compile_options(voxtral_runner PUBLIC ${_common_compile_options}) +# AOTI-generated CUDA code can use significant stack depth; the Windows default +# of 1 MB is not enough for large multimodal models. 
+if(WIN32) + target_link_options(voxtral_runner PRIVATE "/STACK:8388608") +endif() + # On Windows, copy required DLLs to the executable directory if(MSVC AND EXECUTORCH_BUILD_CUDA) add_custom_command( diff --git a/extension/llm/runner/multimodal_runner.cpp b/extension/llm/runner/multimodal_runner.cpp index 06dbbbd11da..9afaab0b97e 100644 --- a/extension/llm/runner/multimodal_runner.cpp +++ b/extension/llm/runner/multimodal_runner.cpp @@ -89,9 +86,6 @@ Result<uint64_t> MultimodalRunner::prefill( const std::vector<MultimodalInput>& inputs, int32_t num_bos, int32_t num_eos) { - if (!is_loaded()) { - ET_CHECK_OK_OR_RETURN_ERROR(load()); - } uint64_t last_token = 0; for (size_t i = 0; i < inputs.size(); ++i) { const auto& input = inputs[i]; @@ -129,8 +126,8 @@ Result<uint64_t> MultimodalRunner::prefill( Error MultimodalRunner::decode_from_token( uint64_t cur_token, const GenerationConfig& config, - std::function<void(const std::string&)> wrapped_callback, - std::function<void(const Stats&)> stats_callback) { + const std::function<void(const std::string&)>& wrapped_callback, + const std::function<void(const Stats&)>& stats_callback) { stats_->first_token_ms = time_in_ms(); stats_->prompt_eval_end_ms = time_in_ms(); stats_->num_prompt_tokens = pos_; @@ -233,6 +230,11 @@ Error MultimodalRunner::generate( const GenerationConfig& config, std::function<void(const std::string&)> token_callback, std::function<void(const Stats&)> stats_callback) { + if (inputs.empty()) { + ET_LOG(Error, "MultimodalInput vector cannot be empty"); + return Error::InvalidArgument; + } + if (!is_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(load()); } @@ -261,29 +263,112 @@ Error MultimodalRunner::generate( // Reset internal state and start inference stats_->inference_start_ms = time_in_ms(); - uint64_t cur_token = 0; - if (!inputs.empty()) { - // Echo the last text input if enabled - if (config.echo && inputs.back().is_text()) { - wrapped_callback(inputs.back().get_text()); + uint64_t prefill_next_token = 0; + // Process multimodal inputs in order + for (size_t i = 0; i < inputs.size(); ++i) { + const MultimodalInput& input = inputs[i]; + if (config.echo && i == inputs.size()
- 1 && input.is_text()) { + wrapped_callback(input.get_text()); } + int32_t bos = 0; + int32_t eos = 0; + if (i == 0 && pos_ == 0) { + if (input.is_text() || input.is_tokens()) { + bos = config.num_bos; + eos = config.num_eos; + } + } + auto prefill_result = multimodal_prefiller_->prefill(input, pos_, bos, eos); + if (!prefill_result.ok()) { + return prefill_result.error(); + } + prefill_next_token = prefill_result.get(); + } - // Prefill all inputs and get the first decode token - auto prefill_result = prefill(inputs, config.num_bos, config.num_eos); - ET_CHECK_OK_OR_RETURN_ERROR(prefill_result.error()); - cur_token = prefill_result.get(); - prefill_next_token_.reset(); + stats_->first_token_ms = time_in_ms(); + stats_->prompt_eval_end_ms = time_in_ms(); + stats_->num_prompt_tokens = pos_; + + auto decode_result = + tokenizer_->decode(prefill_next_token, prefill_next_token); + if (!decode_result.ok()) { + ET_LOG( + Error, + "Tokenizers error code %d", + static_cast<int32_t>(decode_result.error())); + return Error::InvalidArgument; + } + wrapped_callback(std::move(*decode_result)); + + RUNNER_ET_LOG( + config.warming, + "RSS after multimodal input processing: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + // Resolve max_new_tokens based on config + int64_t max_context_len = metadata_.at(kMaxContextLen); + int32_t max_new_tokens = config.resolve_max_new_tokens(max_context_len, pos_); + + ET_LOG( + Info, + "Max new tokens resolved: %d, pos_ %" PRId64 ", max_context_len %" PRId64, + max_new_tokens, + pos_, + max_context_len); + + ET_CHECK_OR_RETURN_ERROR( + max_new_tokens > 0, + InvalidArgument, + "Max new tokens %d is less than or equal to 0", + max_new_tokens); + + // Set ignore_eos based on config + text_token_generator_->set_ignore_eos(config.ignore_eos); + + // Generate tokens using the text token generator + std::vector<uint64_t> prompt_tokens = {prefill_next_token}; + auto generate_result = text_token_generator_->generate( + /*tokens=*/prompt_tokens, + 
/*start_pos=*/pos_, + /*max_new_tokens=*/max_new_tokens - + 1, // Subtract 1 because prefill already generated 1 token + /*temperature=*/config.temperature, + /*token_callback=*/wrapped_callback); + if (!generate_result.ok()) { + return generate_result.error(); + } + int64_t num_generated_tokens = generate_result.get(); + + pos_ += num_generated_tokens; + // Update stats + stats_->num_generated_tokens = num_generated_tokens; + // Finalize stats and call callback + stats_->inference_end_ms = time_in_ms(); + +#ifdef CUDA_AVAILABLE + cuda_memory_tracker_->log_sample("after_generate"); + stats_->gpu_free_after_generate_bytes = + cuda_memory_tracker_->last_free_bytes(); + // update peak in case it changed after generation + stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb(); +#endif + + if (!config.warming) { + printf("\n"); + } + + if (config.warming) { + ET_LOG(Info, "Warmup run finished!"); } else { - // Empty inputs: consume token from a prior prefill() call - ET_CHECK_OR_RETURN_ERROR( - prefill_next_token_.has_value(), - InvalidState, - "Empty inputs requires a prior prefill() call"); - cur_token = prefill_next_token_.value(); - prefill_next_token_.reset(); + // Do not print report during warmup + print_report(*stats_); + } + + if (stats_callback) { + stats_callback(*stats_); } - return decode_from_token(cur_token, config, wrapped_callback, stats_callback); + return Error::Ok; } } // namespace executorch::extension::llm diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h index 797bcd9c728..7c87f7bff5a 100644 --- a/extension/llm/runner/multimodal_runner.h +++ b/extension/llm/runner/multimodal_runner.h @@ -202,8 +202,8 @@ class ET_EXPERIMENTAL MultimodalRunner : public IRunner { ::executorch::runtime::Error decode_from_token( uint64_t cur_token, const GenerationConfig& config, - std::function<void(const std::string&)> wrapped_callback, - std::function<void(const Stats&)> stats_callback); + const std::function<void(const std::string&)>& wrapped_callback, + const std::function<void(const Stats&)>&
stats_callback); }; } // namespace llm