52 changes: 33 additions & 19 deletions vllm/v1/worker/gpu_model_runner.py
@@ -136,6 +136,7 @@
 from vllm.v1.worker.ubatch_utils import (
     UBatchSlice,
     UBatchSlices,
+    check_cudagraph_threshold,
     check_ubatch_thresholds,
 )
 from vllm.v1.worker.utils import is_residual_scattered_for_sp
@@ -3980,7 +3981,7 @@ def _capture_cudagraphs(
         # cudagraph, a uniform decode batch, and the number of tokens
         # is above the threshold. Otherwise we just capture a non-ubatched
         # version of the graph
-        allow_microbatching = (
+        microbatching_enabled = (
             self.parallel_config.enable_dbo
             and cudagraph_runtime_mode == CUDAGraphMode.FULL
             and uniform_decode
@@ -3991,32 +3992,45 @@
             )
         )
 
-        for _ in range(self.compilation_config.cudagraph_num_of_warmups):
-            # Use CUDAGraphRuntimeStyle.NONE (default) for warmup.
-            # But be careful, warm up with `NONE` is orthogonal to
-            # if we want to warm up attention or not. This is
-            # different from the case where `FULL` implies capture
-            # attention while `PIECEWISE` implies no attention.
-            force_attention = cudagraph_runtime_mode == CUDAGraphMode.FULL
+        # When num_tokens is near the dbo_decode_token_threshold, different ranks
+        # may make different microbatching decisions (some above threshold, some
+        # below). Since all ranks must agree for DBO to work, they'll all fall back
+        # to non-DBO execution. To avoid running without cudagraphs in these mixed
+        # cases, we preemptively compile cudagraphs for both microbatching modes.
+        microbatching_modes = [microbatching_enabled]
+        compile_both_modes = check_cudagraph_threshold(
+            self.parallel_config, num_tokens, uniform_decode
+        )
+        if microbatching_enabled and compile_both_modes:
+            microbatching_modes = [False, True]  # Compile both modes
+
+        for allow_microbatching in microbatching_modes:
+            for _ in range(self.compilation_config.cudagraph_num_of_warmups):
+                # Use CUDAGraphRuntimeStyle.NONE (default) for warmup.
+                # But be careful, warm up with `NONE` is orthogonal to
+                # if we want to warm up attention or not. This is
+                # different from the case where `FULL` implies capture
+                # attention while `PIECEWISE` implies no attention.
+                force_attention = cudagraph_runtime_mode == CUDAGraphMode.FULL
+                self._dummy_run(
+                    num_tokens,
+                    cudagraph_runtime_mode=CUDAGraphMode.NONE,
+                    force_attention=force_attention,
+                    uniform_decode=uniform_decode,
+                    allow_microbatching=allow_microbatching,
+                    skip_eplb=True,
+                    remove_lora=False,
+                    activate_lora=activate_lora,
+                )
             self._dummy_run(
                 num_tokens,
-                cudagraph_runtime_mode=CUDAGraphMode.NONE,
-                force_attention=force_attention,
+                cudagraph_runtime_mode=cudagraph_runtime_mode,
                 uniform_decode=uniform_decode,
                 allow_microbatching=allow_microbatching,
                 skip_eplb=True,
                 remove_lora=False,
                 activate_lora=activate_lora,
             )
-        self._dummy_run(
-            num_tokens,
-            cudagraph_runtime_mode=cudagraph_runtime_mode,
-            uniform_decode=uniform_decode,
-            allow_microbatching=allow_microbatching,
-            skip_eplb=True,
-            remove_lora=False,
-            activate_lora=activate_lora,
-        )
         self.maybe_remove_all_loras(self.lora_config)
 
     def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
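Read end to end, the runner-side change boils down to a small decision: which `allow_microbatching` values to capture graphs for at a given shape. The sketch below restates that decision standalone and is only illustrative — `modes_to_capture` is a hypothetical helper, the threshold value of 32 is made up, and the real enablement check also consults `cudagraph_runtime_mode` and `check_ubatch_thresholds`:

# Hypothetical, simplified restatement of the capture decision above.
# DBO_DECODE_TOKEN_THRESHOLD stands in for
# parallel_config.dbo_decode_token_threshold; 32 is an arbitrary example value.
DBO_DECODE_TOKEN_THRESHOLD = 32


def modes_to_capture(enable_dbo: bool, uniform_decode: bool, num_tokens: int) -> list[bool]:
    # DBO microbatching applies to uniform decode batches at or above the threshold.
    microbatching_enabled = (
        enable_dbo and uniform_decode and num_tokens >= DBO_DECODE_TOKEN_THRESHOLD
    )
    # Shapes inside the 1.5x window may see ranks disagree at runtime, so
    # capture a non-DBO graph as well (mirrors check_cudagraph_threshold).
    near_threshold = uniform_decode and num_tokens <= DBO_DECODE_TOKEN_THRESHOLD * 1.5
    if microbatching_enabled and near_threshold:
        return [False, True]
    return [microbatching_enabled]


assert modes_to_capture(True, True, 16) == [False]        # below threshold: non-DBO only
assert modes_to_capture(True, True, 40) == [False, True]  # 32 <= 40 <= 48: both modes
assert modes_to_capture(True, True, 64) == [True]         # well above: DBO graph only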
9 changes: 8 additions & 1 deletion vllm/v1/worker/gpu_ubatch_wrapper.py
@@ -23,6 +23,7 @@
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.utils.import_utils import has_deep_gemm
+from vllm.v1.worker.ubatch_utils import check_cudagraph_threshold
 from vllm.v1.worker.ubatching import UBatchContext, make_ubatch_contexts
 
 logger = init_logger(__name__)
@@ -388,7 +389,13 @@ def __call__(self, *args, **kwargs):
         # for this shape during a normal run.
         if cudagraph_runtime_mode is CUDAGraphMode.FULL:
             assert batch_descriptor is not None
-            if batch_descriptor.num_tokens in self.cudagraphs:
+            num_tokens = batch_descriptor.num_tokens
+            uniform_decode = batch_descriptor.uniform_decode
+            # Check if the model runner made a non-dbo cudagraph for this shape
+            cudagraph_exists = check_cudagraph_threshold(
+                self.vllm_config.parallel_config, num_tokens, uniform_decode
+            )
+            if num_tokens in self.cudagraphs and not cudagraph_exists:
                 cudagraph_runtime_mode = CUDAGraphMode.NONE
 
         if cudagraph_runtime_mode in (CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE):
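The wrapper-side change applies the same predicate from the consumer side: when DBO falls through for a shape whose full graph is owned by the ubatch wrapper, it now drops to eager only if no non-DBO graph was captured for that shape. A minimal sketch of the new condition — `resolve_runtime_mode`, its parameters, and the inlined threshold check are illustrative stand-ins for the `__call__` logic, not vLLM API:

from enum import Enum, auto


class CUDAGraphMode(Enum):
    # Simplified stand-in for vLLM's CUDAGraphMode.
    NONE = auto()
    FULL = auto()


def resolve_runtime_mode(
    mode: CUDAGraphMode,
    num_tokens: int,
    uniform_decode: bool,
    dbo_graph_shapes: set[int],
    dbo_decode_token_threshold: int,
) -> CUDAGraphMode:
    # Hypothetical mirror of the fallback above: the wrapper owns the DBO full
    # graphs, so a non-ubatched pass cannot replay them for this shape...
    if mode is CUDAGraphMode.FULL and num_tokens in dbo_graph_shapes:
        # ...but if the model runner also captured a non-DBO graph for this
        # shape (check_cudagraph_threshold), FULL mode can be kept.
        non_dbo_graph_exists = (
            uniform_decode and num_tokens <= dbo_decode_token_threshold * 1.5
        )
        if not non_dbo_graph_exists:
            mode = CUDAGraphMode.NONE
    return mode


# With a threshold of 32: shape 40 keeps FULL, shape 64 falls back to eager.
assert resolve_runtime_mode(CUDAGraphMode.FULL, 40, True, {40, 64}, 32) is CUDAGraphMode.FULL
assert resolve_runtime_mode(CUDAGraphMode.FULL, 64, True, {40, 64}, 32) is CUDAGraphMode.NONE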
6 changes: 6 additions & 0 deletions vllm/v1/worker/ubatch_utils.py
@@ -42,6 +42,12 @@ def check_ubatch_thresholds(
         return num_tokens >= config.dbo_prefill_token_threshold
 
 
+def check_cudagraph_threshold(
+    config: ParallelConfig, num_tokens: int, uniform_decode: bool
+):
+    return uniform_decode and num_tokens <= config.dbo_decode_token_threshold * 1.5
+
+
 def create_ubatch_slices(
     num_scheduled_tokens: np.ndarray, split_point: int
 ) -> UBatchSlices:
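Note the helper defines a window, not a point: any uniform-decode shape up to 1.5x the decode threshold qualifies, including shapes below the threshold itself. A worked example with the config inlined — the threshold value of 32 is illustrative, not a vLLM default:

# Standalone mirror of check_cudagraph_threshold with the config inlined.
# threshold=32 is an example value, not a vLLM default.
def in_both_modes_window(num_tokens: int, uniform_decode: bool, threshold: int = 32) -> bool:
    return uniform_decode and num_tokens <= threshold * 1.5


assert in_both_modes_window(32, True)        # at the threshold
assert in_both_modes_window(48, True)        # 48 == 32 * 1.5, still inside
assert not in_both_modes_window(49, True)    # just past the window
assert not in_both_modes_window(48, False)   # non-uniform batches never qualify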