Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .ci/scripts/test_model_e2e_windows.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -186,14 +186,17 @@ try {
-RedirectStandardError $stderrFile

$stdout = if (Test-Path -Path $stdoutFile -PathType Leaf) { Get-Content -Path $stdoutFile -Raw } else { "" }
$stderr = if (Test-Path -Path $stderrFile -PathType Leaf) { Get-Content -Path $stderrFile -Raw } else { "" }
$exitCode = $proc.ExitCode
}
finally {
Remove-Item -Path $stdoutFile -ErrorAction SilentlyContinue
Remove-Item -Path $stderrFile -ErrorAction SilentlyContinue
}
Write-Host "Runner output:"
Write-Host "Runner stdout:"
Write-Host $stdout
Write-Host "Runner stderr:"
Write-Host $stderr

if ($exitCode -ne 0) {
Write-Warning "Runner exited with code $exitCode (may be benign)"
Expand Down
6 changes: 6 additions & 0 deletions examples/models/voxtral/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,12 @@ target_include_directories(voxtral_runner PUBLIC ${_common_include_directories})
target_link_libraries(voxtral_runner PUBLIC ${link_libraries})
target_compile_options(voxtral_runner PUBLIC ${_common_compile_options})

# AOTI-generated CUDA code can use significant stack depth; the 1 MB stack
# that the MSVC linker reserves by default is not enough for large multimodal
# models, so reserve 8 MiB (8388608 bytes) for the runner instead.
#
# "/STACK:" is MSVC-linker syntax. WIN32 is also true for MinGW builds, whose
# GNU-style linker does not understand it, so select the flag by toolchain
# rather than by platform.
if(MSVC)
  target_link_options(voxtral_runner PRIVATE "/STACK:8388608")
elseif(MINGW)
  # "LINKER:" lets CMake add the compiler-driver wrapper (e.g. -Wl,) itself.
  target_link_options(voxtral_runner PRIVATE "LINKER:--stack,8388608")
endif()

# On Windows, copy required DLLs to the executable directory
if(MSVC AND EXECUTORCH_BUILD_CUDA)
add_custom_command(
Expand Down
131 changes: 108 additions & 23 deletions extension/llm/runner/multimodal_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,6 @@ Result<uint64_t> MultimodalRunner::prefill(
const std::vector<MultimodalInput>& inputs,
int32_t num_bos,
int32_t num_eos) {
if (!is_loaded()) {
ET_CHECK_OK_OR_RETURN_ERROR(load());
}
uint64_t last_token = 0;
for (size_t i = 0; i < inputs.size(); ++i) {
const auto& input = inputs[i];
Expand Down Expand Up @@ -129,8 +126,8 @@ Result<uint64_t> MultimodalRunner::prefill(
Error MultimodalRunner::decode_from_token(
uint64_t cur_token,
const GenerationConfig& config,
std::function<void(const std::string&)> wrapped_callback,
std::function<void(const Stats&)> stats_callback) {
const std::function<void(const std::string&)>& wrapped_callback,
const std::function<void(const Stats&)>& stats_callback) {
stats_->first_token_ms = time_in_ms();
stats_->prompt_eval_end_ms = time_in_ms();
stats_->num_prompt_tokens = pos_;
Expand Down Expand Up @@ -233,6 +230,11 @@ Error MultimodalRunner::generate(
const GenerationConfig& config,
std::function<void(const std::string&)> token_callback,
std::function<void(const Stats&)> stats_callback) {
if (inputs.empty()) {
ET_LOG(Error, "MultimodalInput vector cannot be empty");
return Error::InvalidArgument;
}

if (!is_loaded()) {
ET_CHECK_OK_OR_RETURN_ERROR(load());
}
Expand Down Expand Up @@ -261,29 +263,112 @@ Error MultimodalRunner::generate(
// Reset internal state and start inference
stats_->inference_start_ms = time_in_ms();

uint64_t cur_token = 0;
if (!inputs.empty()) {
// Echo the last text input if enabled
if (config.echo && inputs.back().is_text()) {
wrapped_callback(inputs.back().get_text());
uint64_t prefill_next_token = 0;
// Process multimodal inputs in order
for (size_t i = 0; i < inputs.size(); ++i) {
const MultimodalInput& input = inputs[i];
if (config.echo && i == inputs.size() - 1 && input.is_text()) {
wrapped_callback(input.get_text());
}
int32_t bos = 0;
int32_t eos = 0;
if (i == 0 && pos_ == 0) {
if (input.is_text() || input.is_tokens()) {
bos = config.num_bos;
eos = config.num_eos;
}
}
auto prefill_result = multimodal_prefiller_->prefill(input, pos_, bos, eos);
if (!prefill_result.ok()) {
return prefill_result.error();
}
prefill_next_token = prefill_result.get();
}

// Prefill all inputs and get the first decode token
auto prefill_result = prefill(inputs, config.num_bos, config.num_eos);
ET_CHECK_OK_OR_RETURN_ERROR(prefill_result.error());
cur_token = prefill_result.get();
prefill_next_token_.reset();
stats_->first_token_ms = time_in_ms();
stats_->prompt_eval_end_ms = time_in_ms();
stats_->num_prompt_tokens = pos_;

auto decode_result =
tokenizer_->decode(prefill_next_token, prefill_next_token);
if (!decode_result.ok()) {
ET_LOG(
Error,
"Tokenizers error code %d",
static_cast<uint32_t>(decode_result.error()));
return Error::InvalidArgument;
}
wrapped_callback(std::move(*decode_result));

RUNNER_ET_LOG(
config.warming,
"RSS after multimodal input processing: %f MiB (0 if unsupported)",
get_rss_bytes() / 1024.0 / 1024.0);

// Resolve max_new_tokens based on config
int64_t max_context_len = metadata_.at(kMaxContextLen);
int32_t max_new_tokens = config.resolve_max_new_tokens(max_context_len, pos_);

ET_LOG(
Info,
"Max new tokens resolved: %d, pos_ %" PRId64 ", max_context_len %" PRId64,
max_new_tokens,
pos_,
max_context_len);

ET_CHECK_OR_RETURN_ERROR(
max_new_tokens > 0,
InvalidArgument,
"Max new tokens %d is less than or equal to 0",
max_new_tokens);

// Set ignore_eos based on config
text_token_generator_->set_ignore_eos(config.ignore_eos);

// Generate tokens using the text token generator
std::vector<uint64_t> prompt_tokens = {prefill_next_token};
auto generate_result = text_token_generator_->generate(
/*tokens=*/prompt_tokens,
/*start_pos=*/pos_,
/*max_new_tokens=*/max_new_tokens -
1, // Subtract 1 because prefill already generated 1 token
/*temperature=*/config.temperature,
/*token_callback=*/wrapped_callback);
if (!generate_result.ok()) {
return generate_result.error();
}
int64_t num_generated_tokens = generate_result.get();

pos_ += num_generated_tokens;
// Update stats
stats_->num_generated_tokens = num_generated_tokens;
// Finalize stats and call callback
stats_->inference_end_ms = time_in_ms();

#ifdef CUDA_AVAILABLE
cuda_memory_tracker_->log_sample("after_generate");
stats_->gpu_free_after_generate_bytes =
cuda_memory_tracker_->last_free_bytes();
// update peak in case it changed after generation
stats_->gpu_peak_usage_mb = cuda_memory_tracker_->peak_usage_mb();
#endif

if (!config.warming) {
printf("\n");
}

if (config.warming) {
ET_LOG(Info, "Warmup run finished!");
} else {
// Empty inputs: consume token from a prior prefill() call
ET_CHECK_OR_RETURN_ERROR(
prefill_next_token_.has_value(),
InvalidState,
"Empty inputs requires a prior prefill() call");
cur_token = prefill_next_token_.value();
prefill_next_token_.reset();
// Do not print report during warmup
print_report(*stats_);
}

if (stats_callback) {
stats_callback(*stats_);
}

return decode_from_token(cur_token, config, wrapped_callback, stats_callback);
return Error::Ok;
}

} // namespace executorch::extension::llm
4 changes: 2 additions & 2 deletions extension/llm/runner/multimodal_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,8 @@ class ET_EXPERIMENTAL MultimodalRunner : public IRunner {
::executorch::runtime::Error decode_from_token(
uint64_t cur_token,
const GenerationConfig& config,
std::function<void(const std::string&)> wrapped_callback,
std::function<void(const Stats&)> stats_callback);
const std::function<void(const std::string&)>& wrapped_callback,
const std::function<void(const Stats&)>& stats_callback);
};

} // namespace llm
Expand Down
Loading