diff --git a/.ci/scripts/test_model_e2e_windows.ps1 b/.ci/scripts/test_model_e2e_windows.ps1 index 92022d1ac4c..fecff3b84c2 100644 --- a/.ci/scripts/test_model_e2e_windows.ps1 +++ b/.ci/scripts/test_model_e2e_windows.ps1 @@ -186,14 +186,17 @@ try { -RedirectStandardError $stderrFile $stdout = if (Test-Path -Path $stdoutFile -PathType Leaf) { Get-Content -Path $stdoutFile -Raw } else { "" } + $stderr = if (Test-Path -Path $stderrFile -PathType Leaf) { Get-Content -Path $stderrFile -Raw } else { "" } $exitCode = $proc.ExitCode } finally { Remove-Item -Path $stdoutFile -ErrorAction SilentlyContinue Remove-Item -Path $stderrFile -ErrorAction SilentlyContinue } - Write-Host "Runner output:" + Write-Host "Runner stdout:" Write-Host $stdout + Write-Host "Runner stderr:" + Write-Host $stderr if ($exitCode -ne 0) { Write-Warning "Runner exited with code $exitCode (may be benign)" diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 80baaa28ff4..036e6454efe 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -114,6 +114,12 @@ target_include_directories(voxtral_runner PUBLIC ${_common_include_directories}) target_link_libraries(voxtral_runner PUBLIC ${link_libraries}) target_compile_options(voxtral_runner PUBLIC ${_common_compile_options}) +# AOTI-generated CUDA code can use significant stack depth; the Windows default +# of 1 MB is not enough for large multimodal models. +if(WIN32) + target_link_options(voxtral_runner PRIVATE "/STACK:8388608") +endif() + # On Windows, copy required DLLs to the executable directory if(MSVC AND EXECUTORCH_BUILD_CUDA) add_custom_command( diff --git a/extension/llm/runner/multimodal_runner.cpp b/extension/llm/runner/multimodal_runner.cpp index 06dbbbd11da..9afaab0b97e 100644 --- a/extension/llm/runner/multimodal_runner.cpp +++ b/extension/llm/runner/multimodal_runner.cpp @@ -89,9 +89,6 @@ Result MultimodalRunner::prefill( const std::vector& inputs, int32_t num_bos, int32_t num_eos) { - if (!is_loaded()) { - ET_CHECK_OK_OR_RETURN_ERROR(load()); - } uint64_t last_token = 0; for (size_t i = 0; i < inputs.size(); ++i) { const auto& input = inputs[i]; @@ -129,8 +126,8 @@ Result MultimodalRunner::prefill( Error MultimodalRunner::decode_from_token( uint64_t cur_token, const GenerationConfig& config, - std::function wrapped_callback, - std::function stats_callback) { + const std::function& wrapped_callback, + const std::function& stats_callback) { stats_->first_token_ms = time_in_ms(); stats_->prompt_eval_end_ms = time_in_ms(); stats_->num_prompt_tokens = pos_; @@ -233,6 +230,11 @@ Error MultimodalRunner::generate( const GenerationConfig& config, std::function token_callback, std::function stats_callback) { + if (inputs.empty()) { + ET_LOG(Error, "MultimodalInput vector cannot be empty"); + return Error::InvalidArgument; + } + if (!is_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(load()); } @@ -261,29 +263,112 @@ Error MultimodalRunner::generate( // Reset internal state and start inference stats_->inference_start_ms = time_in_ms(); - uint64_t cur_token = 0; - if (!inputs.empty()) { - // Echo the last text input if enabled - if (config.echo && inputs.back().is_text()) { - wrapped_callback(inputs.back().get_text()); + uint64_t prefill_next_token = 0; + // Process multimodal inputs in order + for (size_t i = 0; i < inputs.size(); ++i) { + const MultimodalInput& input = inputs[i]; + if (config.echo && i == inputs.size() - 1 && input.is_text()) { + wrapped_callback(input.get_text()); } + int32_t bos = 0; + int32_t eos = 0; + if (i == 0 && pos_ == 0) { + if (input.is_text() || input.is_tokens()) { + bos = config.num_bos; + eos = config.num_eos; + } + } + auto prefill_result = multimodal_prefiller_->prefill(input, pos_, bos, eos); + if (!prefill_result.ok()) { + return prefill_result.error(); + } + prefill_next_token = prefill_result.get(); + } - // Prefill all inputs and get the first decode token - auto prefill_result = prefill(inputs, config.num_bos, config.num_eos); - ET_CHECK_OK_OR_RETURN_ERROR(prefill_result.error()); - cur_token = prefill_result.get(); - prefill_next_token_.reset(); + stats_->first_token_ms = time_in_ms(); + stats_->prompt_eval_end_ms = time_in_ms(); + stats_->num_prompt_tokens = pos_; + + auto decode_result = + tokenizer_->decode(prefill_next_token, prefill_next_token); + if (!decode_result.ok()) { + ET_LOG( + Error, + "Tokenizers error code %d", + static_cast(decode_result.error())); + return Error::InvalidArgument; + } + wrapped_callback(std::move(*decode_result)); + + RUNNER_ET_LOG( + config.warming, + "RSS after multimodal input processing: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + // Resolve max_new_tokens based on config + int64_t max_context_len = metadata_.at(kMaxContextLen); + int32_t max_new_tokens = config.resolve_max_new_tokens(max_context_len, pos_); + + ET_LOG( + Info, + "Max new tokens resolved: %d, pos_ %" PRId64 ", max_context_len %" PRId64, + max_new_tokens, + pos_, + max_context_len); + + ET_CHECK_OR_RETURN_ERROR( + max_new_tokens > 0, + InvalidArgument, + "Max new tokens %d is less than or equal to 0", + max_new_tokens); + + // Set ignore_eos based on config + text_token_generator_->set_ignore_eos(config.ignore_eos); + + // Generate tokens using the text token generator + std::vector prompt_tokens = {prefill_next_token}; + auto generate_result = text_token_generator_->generate( + /*tokens=*/prompt_tokens, + /*start_pos=*/pos_, + /*max_new_tokens=*/max_new_tokens - + 1, // Subtract 1 because prefill already generated 1 token + /*temperature=*/config.temperature, + /*token_callback=*/wrapped_callback); + if (!generate_result.ok()) { + return generate_result.error(); + } + int64_t num_generated_tokens = generate_result.get(); + + pos_ += num_generated_tokens; + // Update stats + stats_->num_generated_tokens = num_generated_tokens; + // Finalize stats and call callback + stats_->inference_end_ms = time_in_ms(); + +#ifdef CUDA_AVAILABLE + cuda_memory_tracker_->log_sample("after_generate"); + stats_->gpu_free_after_generate_bytes = + cuda_memory_tracker_->last_free_bytes(); + // update peak in case it changed after generation + stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb(); +#endif + + if (!config.warming) { + printf("\n"); + } + + if (config.warming) { + ET_LOG(Info, "Warmup run finished!"); } else { - // Empty inputs: consume token from a prior prefill() call - ET_CHECK_OR_RETURN_ERROR( - prefill_next_token_.has_value(), - InvalidState, - "Empty inputs requires a prior prefill() call"); - cur_token = prefill_next_token_.value(); - prefill_next_token_.reset(); + // Do not print report during warmup + print_report(*stats_); + } + + if (stats_callback) { + stats_callback(*stats_); } - return decode_from_token(cur_token, config, wrapped_callback, stats_callback); + return Error::Ok; } } // namespace executorch::extension::llm diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h index 797bcd9c728..7c87f7bff5a 100644 --- a/extension/llm/runner/multimodal_runner.h +++ b/extension/llm/runner/multimodal_runner.h @@ -202,8 +202,8 @@ class ET_EXPERIMENTAL MultimodalRunner : public IRunner { ::executorch::runtime::Error decode_from_token( uint64_t cur_token, const GenerationConfig& config, - std::function wrapped_callback, - std::function stats_callback); + const std::function& wrapped_callback, + const std::function& stats_callback); }; } // namespace llm