pytorch · larryliu0820 · Mar 3, 2026 · Mar 2, 2026 · Mar 2, 2026 · Mar 3, 2026
diff --git a/.ci/scripts/test_model_e2e_windows.ps1 b/.ci/scripts/test_model_e2e_windows.ps1
@@ -186,14 +186,17 @@ try {
             -RedirectStandardError $stderrFile
 
         $stdout = if (Test-Path -Path $stdoutFile -PathType Leaf) { Get-Content -Path $stdoutFile -Raw } else { "" }
+        $stderr = if (Test-Path -Path $stderrFile -PathType Leaf) { Get-Content -Path $stderrFile -Raw } else { "" }
         $exitCode = $proc.ExitCode
     }
     finally {
         Remove-Item -Path $stdoutFile -ErrorAction SilentlyContinue
         Remove-Item -Path $stderrFile -ErrorAction SilentlyContinue
     }
-    Write-Host "Runner output:"
+    Write-Host "Runner stdout:"
     Write-Host $stdout
+    Write-Host "Runner stderr:"
+    Write-Host $stderr
 
     if ($exitCode -ne 0) {
         Write-Warning "Runner exited with code $exitCode (may be benign)"

@@ -114,6 +114,12 @@ target_include_directories(voxtral_runner PUBLIC ${_common_include_directories})
 target_link_libraries(voxtral_runner PUBLIC ${link_libraries})
 target_compile_options(voxtral_runner PUBLIC ${_common_compile_options})
 
+# AOTI-generated CUDA code for large multimodal models can exceed the 1 MB
+# Windows default stack; reserve 8 MiB. /STACK is MSVC-style linker syntax.
+if(MSVC)
+  target_link_options(voxtral_runner PRIVATE "/STACK:8388608")
+endif()
+
 # On Windows, copy required DLLs to the executable directory
 if(MSVC AND EXECUTORCH_BUILD_CUDA)
   add_custom_command(

@@ -89,9 +89,6 @@ Result<uint64_t> MultimodalRunner::prefill(
     const std::vector<MultimodalInput>& inputs,
     int32_t num_bos,
     int32_t num_eos) {
-  if (!is_loaded()) {
-    ET_CHECK_OK_OR_RETURN_ERROR(load());
-  }
   uint64_t last_token = 0;
   for (size_t i = 0; i < inputs.size(); ++i) {
     const auto& input = inputs[i];
@@ -129,8 +126,8 @@ Result<uint64_t> MultimodalRunner::prefill(
 Error MultimodalRunner::decode_from_token(
     uint64_t cur_token,
     const GenerationConfig& config,
-    std::function<void(const std::string&)> wrapped_callback,
-    std::function<void(const Stats&)> stats_callback) {
+    const std::function<void(const std::string&)>& wrapped_callback,
+    const std::function<void(const Stats&)>& stats_callback) {
   stats_->first_token_ms = time_in_ms();
   stats_->prompt_eval_end_ms = time_in_ms();
   stats_->num_prompt_tokens = pos_;
@@ -233,6 +230,11 @@ Error MultimodalRunner::generate(
     const GenerationConfig& config,
     std::function<void(const std::string&)> token_callback,
     std::function<void(const Stats&)> stats_callback) {
+  if (inputs.empty()) {
+    ET_LOG(Error, "MultimodalInput vector cannot be empty");
+    return Error::InvalidArgument;
+  }
+
   if (!is_loaded()) {
     ET_CHECK_OK_OR_RETURN_ERROR(load());
   }
@@ -261,29 +263,112 @@ Error MultimodalRunner::generate(
   // Reset internal state and start inference
   stats_->inference_start_ms = time_in_ms();
 
-  uint64_t cur_token = 0;
-  if (!inputs.empty()) {
-    // Echo the last text input if enabled
-    if (config.echo && inputs.back().is_text()) {
-      wrapped_callback(inputs.back().get_text());
+  uint64_t prefill_next_token = 0;
+  // Process multimodal inputs in order
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    const MultimodalInput& input = inputs[i];
+    if (config.echo && i == inputs.size() - 1 && input.is_text()) {
+      wrapped_callback(input.get_text());
     }
+    int32_t bos = 0;
+    int32_t eos = 0;
+    if (i == 0 && pos_ == 0) {
+      if (input.is_text() || input.is_tokens()) {
+        bos = config.num_bos;
+        eos = config.num_eos;
+      }
+    }
+    auto prefill_result = multimodal_prefiller_->prefill(input, pos_, bos, eos);
+    if (!prefill_result.ok()) {
+      return prefill_result.error();
+    }
+    prefill_next_token = prefill_result.get();
+  }
 
-    // Prefill all inputs and get the first decode token
-    auto prefill_result = prefill(inputs, config.num_bos, config.num_eos);
-    ET_CHECK_OK_OR_RETURN_ERROR(prefill_result.error());
-    cur_token = prefill_result.get();
-    prefill_next_token_.reset();
+  stats_->first_token_ms = time_in_ms();
+  stats_->prompt_eval_end_ms = time_in_ms();
+  stats_->num_prompt_tokens = pos_;
+
+  auto decode_result =
+      tokenizer_->decode(prefill_next_token, prefill_next_token);
+  if (!decode_result.ok()) {
+    ET_LOG(
+        Error,
+        "Tokenizers error code %d",
+        static_cast<uint32_t>(decode_result.error()));
+    return Error::InvalidArgument;
+  }
+  wrapped_callback(std::move(*decode_result));
+
+  RUNNER_ET_LOG(
+      config.warming,
+      "RSS after multimodal input processing: %f MiB (0 if unsupported)",
+      get_rss_bytes() / 1024.0 / 1024.0);
+
+  // Resolve max_new_tokens based on config
+  int64_t max_context_len = metadata_.at(kMaxContextLen);
+  int32_t max_new_tokens = config.resolve_max_new_tokens(max_context_len, pos_);
+
+  ET_LOG(
+      Info,
+      "Max new tokens resolved: %d, pos_ %" PRId64 ", max_context_len %" PRId64,
+      max_new_tokens,
+      pos_,
+      max_context_len);
+
+  ET_CHECK_OR_RETURN_ERROR(
+      max_new_tokens > 0,
+      InvalidArgument,
+      "Max new tokens %d is less than or equal to 0",
+      max_new_tokens);
+
+  // Set ignore_eos based on config
+  text_token_generator_->set_ignore_eos(config.ignore_eos);
+
+  // Generate tokens using the text token generator
+  std::vector<uint64_t> prompt_tokens = {prefill_next_token};
+  auto generate_result = text_token_generator_->generate(
+      /*tokens=*/prompt_tokens,
+      /*start_pos=*/pos_,
+      /*max_new_tokens=*/max_new_tokens -
+          1, // Subtract 1 because prefill already generated 1 token
+      /*temperature=*/config.temperature,
+      /*token_callback=*/wrapped_callback);
+  if (!generate_result.ok()) {
+    return generate_result.error();
+  }
+  int64_t num_generated_tokens = generate_result.get();
+
+  pos_ += num_generated_tokens;
+  // Update stats
+  stats_->num_generated_tokens = num_generated_tokens;
+  // Finalize stats and call callback
+  stats_->inference_end_ms = time_in_ms();
+
+#ifdef CUDA_AVAILABLE
+  cuda_memory_tracker_->log_sample("after_generate");
+  stats_->gpu_free_after_generate_bytes =
+      cuda_memory_tracker_->last_free_bytes();
+  // update peak in case it changed after generation
+  stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb();
+#endif
+
+  if (!config.warming) {
+    printf("\n");
+  }
+
+  if (config.warming) {
+    ET_LOG(Info, "Warmup run finished!");
   } else {
-    // Empty inputs: consume token from a prior prefill() call
-    ET_CHECK_OR_RETURN_ERROR(
-        prefill_next_token_.has_value(),
-        InvalidState,
-        "Empty inputs requires a prior prefill() call");
-    cur_token = prefill_next_token_.value();
-    prefill_next_token_.reset();
+    // Do not print report during warmup
+    print_report(*stats_);
+  }
+
+  if (stats_callback) {
+    stats_callback(*stats_);
   }
 
-  return decode_from_token(cur_token, config, wrapped_callback, stats_callback);
+  return Error::Ok;
 }
 
 } // namespace executorch::extension::llm
@@ -202,8 +202,8 @@ class ET_EXPERIMENTAL MultimodalRunner : public IRunner {
   ::executorch::runtime::Error decode_from_token(
       uint64_t cur_token,
       const GenerationConfig& config,
-      std::function<void(const std::string&)> wrapped_callback,
-      std::function<void(const Stats&)> stats_callback);
+      const std::function<void(const std::string&)>& wrapped_callback,
+      const std::function<void(const Stats&)>& stats_callback);
 };
 
 } // namespace llm