[BugFix] Handle unscheduled requests properly when async scheduling #27756
vllm/v1/core/sched/output.py
@@ -2,8 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
+from functools import cached_property
 from typing import TYPE_CHECKING
 
+from typing_extensions import deprecated
+
 from vllm._bc_linter import bc_linter_include
 
 if TYPE_CHECKING:
@@ -94,18 +97,18 @@
 @bc_linter_include
 @dataclass
 class CachedRequestData:
 
     req_ids: list[str]
-    # If resumed_from_preemption is False, new_block_ids will be appended to
-    # the request's block IDs. If True, new_block_ids will be used as the
+    # For request ids not in resumed_req_ids, new_block_ids will be appended to
+    # the request's block IDs. For those in the set, new_block_ids will be used as the
     # request's block IDs instead of appending to the existing block IDs.
-    resumed_from_preemption: list[bool]
+    resumed_req_ids: set[str]
 
     # NOTE(woosuk): new_token_ids is only used for pipeline parallelism.
     # When PP is not used, new_token_ids will be empty.
     new_token_ids: list[list[int]]
-    # If resumed_from_preemption is True, propogate the token ids to the
-    # connector, otherwise will be empty.
-    resumed_req_token_ids: list[list[int] | None]
+    # For requests not scheduled in the last step, propagate the token ids to the
+    # connector. Won't contain requests that were scheduled in the prior step.
+    all_token_ids: dict[str, list[int]]
 
     new_block_ids: list[tuple[list[int], ...] | None]
     num_computed_tokens: list[int]
     num_output_tokens: list[int]
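As an aside for readers of the new fields: a minimal consumer-side sketch (hypothetical helper, not part of this diff) of how code that used to index the parallel resumed_from_preemption / resumed_req_token_ids lists can read the replacement set/dict fields instead:

```python
# Hypothetical helper (not in the PR): gather token ids for requests that were
# resumed from preemption, using the new set/dict fields of CachedRequestData.
def collect_resumed_tokens(
    req_ids: list[str],
    resumed_req_ids: set[str],
    all_token_ids: dict[str, list[int]],
) -> dict[str, list[int]]:
    resumed: dict[str, list[int]] = {}
    for req_id in req_ids:
        # Old style: resumed_from_preemption[idx] and resumed_req_token_ids[idx].
        # New style: membership test on the set plus a dict lookup.
        if req_id in resumed_req_ids:
            resumed[req_id] = all_token_ids[req_id]
    return resumed


# Example: only "req-b" was resumed, so only it carries full token ids.
print(collect_resumed_tokens(["req-a", "req-b"], {"req-b"}, {"req-b": [1, 2, 3]}))
```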
@@ -114,13 +117,26 @@
     def num_reqs(self) -> int:
         return len(self.req_ids)
 
+    @cached_property
+    @deprecated("use resumed_req_ids field")
+    def resumed_from_preemption(self) -> list[bool]:
+        return [req_id in self.resumed_req_ids for req_id in self.req_ids]
+
+    @cached_property
+    @deprecated("use all_token_ids field")
+    def resumed_req_token_ids(self) -> list[list[int] | None]:
+        return [
+            self.all_token_ids[req_id] if req_id in self.resumed_req_ids else None
+            for req_id in self.req_ids
+        ]
Comment on lines +120 to +131
Member (Author): These are for backwards compatibility.
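For illustration, a toy sketch (names invented, not code from this PR) of how such shims behave at runtime: legacy attribute access still returns the old list shape, while the typing_extensions deprecated wrapper flags the access; this assumes the runtime DeprecationWarning behavior of typing_extensions.deprecated.

```python
# Toy sketch of the backwards-compatibility pattern: rebuild the old list-shaped
# view lazily from the new set field, and warn callers that still use it.
import warnings
from dataclasses import dataclass
from functools import cached_property

from typing_extensions import deprecated


@dataclass
class _Example:
    req_ids: list[str]
    resumed_req_ids: set[str]

    @cached_property
    @deprecated("use resumed_req_ids field")
    def resumed_from_preemption(self) -> list[bool]:
        # One bool per request, in req_ids order, as the old field provided.
        return [req_id in self.resumed_req_ids for req_id in self.req_ids]


ex = _Example(req_ids=["a", "b"], resumed_req_ids={"b"})
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert ex.resumed_from_preemption == [False, True]  # old API still works
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```

The value is computed once per instance thanks to cached_property, so the deprecation warning fires on first access only.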
     @classmethod
     def make_empty(cls) -> "CachedRequestData":
         return cls(
             req_ids=[],
-            resumed_from_preemption=[],
+            resumed_req_ids=set(),
             new_token_ids=[],
-            resumed_req_token_ids=[],
+            all_token_ids={},
             new_block_ids=[],
             num_computed_tokens=[],
             num_output_tokens=[],
vllm/v1/core/sched/scheduler.py
@@ -71,6 +71,7 @@ def __init__(
         self.finished_req_ids_dict: dict[int, set[str]] | None = (
             defaultdict(set) if include_finished_set else None
         )
+        self.prev_step_scheduled_req_ids: set[str] = set()
 
         # Scheduling constraints.
         self.max_num_running_reqs = self.scheduler_config.max_num_seqs
@@ -444,14 +445,9 @@ def schedule(self) -> SchedulerOutput:
                 # `request.num_prompt_tokens` to consider the resumed
                 # requests, which have output tokens.
                 num_new_tokens = request.num_tokens - num_computed_tokens
-                if (
-                    0
-                    < self.scheduler_config.long_prefill_token_threshold
-                    < num_new_tokens
-                ):
-                    num_new_tokens = (
-                        self.scheduler_config.long_prefill_token_threshold
-                    )
+                threshold = self.scheduler_config.long_prefill_token_threshold
+                if 0 < threshold < num_new_tokens:
+                    num_new_tokens = threshold
Comment on lines +448 to +450
Member (Author): Unrelated simplification, hurt me to look at that formatting :)
 
                 # chunked prefill has to be enabled explicitly to allow
                 # pooling requests to be chunked
@@ -620,6 +616,11 @@ def schedule(self) -> SchedulerOutput:
         structured_output_request_ids, grammar_bitmask = self.get_grammar_bitmask(
             num_scheduled_tokens.keys(), scheduled_spec_decode_tokens
         )
 
+        # Record the request ids that were scheduled in this step.
+        self.prev_step_scheduled_req_ids.clear()
+        self.prev_step_scheduled_req_ids.update(num_scheduled_tokens.keys())
+
         scheduler_output = SchedulerOutput(
             scheduled_new_reqs=new_reqs_data,
             scheduled_cached_reqs=cached_reqs_data,
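A condensed sketch (class and method names invented for illustration, not the actual Scheduler) of the bookkeeping this hunk introduces: the scheduler remembers which request ids it scheduled in the step that just ran, so the next step can detect cached requests that were skipped, e.g. under async scheduling, and mark them as needing their full token ids:

```python
class _SchedulerSketch:
    def __init__(self) -> None:
        # Request ids scheduled in the previous schedule() call.
        self.prev_step_scheduled_req_ids: set[str] = set()

    def record_step(self, num_scheduled_tokens: dict[str, int]) -> None:
        # Mirror of the added lines: reset and refill the set on every step.
        self.prev_step_scheduled_req_ids.clear()
        self.prev_step_scheduled_req_ids.update(num_scheduled_tokens.keys())

    def needs_full_token_ids(self, req_id: str) -> bool:
        # A cached request that was not scheduled in the prior step must resend
        # all of its token ids (e.g. to a KV connector) instead of only new ones.
        return req_id not in self.prev_step_scheduled_req_ids


sched = _SchedulerSketch()
sched.record_step({"req-a": 8, "req-b": 8})
sched.record_step({"req-a": 1})          # req-b was skipped in this step
assert sched.needs_full_token_ids("req-b")
assert not sched.needs_full_token_ids("req-a")
```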
@@ -691,14 +692,12 @@ def _make_cached_request_data(
         req_ids: list[str] = []
         new_token_ids: list[list[int]] = []
         new_block_ids: list[tuple[list[int], ...] | None] = []
-        resumed_req_token_ids: list[list[int] | None] = []
+        all_token_ids: dict[str, list[int]] = {}
         num_computed_tokens: list[int] = []
         num_output_tokens: list[int] = []
+        resumed_req_ids = set()
 
-        # Because resumed_reqs is usually empty, it is more efficient to do
-        # in-place appending so that we don't need to allocate a new list.
-        resumed_from_preemption = [False] * len(running_reqs)
-        resumed_from_preemption += [True] * len(resumed_reqs)
+        num_running_reqs = len(running_reqs)
         for idx, req in enumerate(itertools.chain(running_reqs, resumed_reqs)):
             req_id = req.request_id
             req_ids.append(req_id)
@@ -715,12 +714,14 @@
                     req.num_computed_tokens : req.num_computed_tokens + num_tokens
                 ]
                 new_token_ids.append(token_ids)
-            resumed_token_ids = None
-            if resumed_from_preemption[idx]:
-                resumed_token_ids = req.all_token_ids[
+            scheduled_in_prev_step = req_id in self.prev_step_scheduled_req_ids
+            if idx >= num_running_reqs:
+                assert not scheduled_in_prev_step
+                resumed_req_ids.add(req_id)
+            if not scheduled_in_prev_step:
+                all_token_ids[req_id] = req.all_token_ids[
                     : req.num_computed_tokens + num_tokens
                 ]
-            resumed_req_token_ids.append(resumed_token_ids)
             new_block_ids.append(
                 req_to_new_blocks[req_id].get_block_ids(allow_none=True)
             )
@@ -731,9 +732,9 @@
         return CachedRequestData(
             req_ids=req_ids,
-            resumed_from_preemption=resumed_from_preemption,
+            resumed_req_ids=resumed_req_ids,
             new_token_ids=new_token_ids,
-            resumed_req_token_ids=resumed_req_token_ids,
+            all_token_ids=all_token_ids,
             new_block_ids=new_block_ids,
             num_computed_tokens=num_computed_tokens,
             num_output_tokens=num_output_tokens,
Member (Author): Changing this to a set since these will be rare and we currently are creating a `[None] * batch_size` list every time.
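To make that trade-off concrete, a rough sketch (hypothetical batch size, not taken from the PR) of what each representation allocates on a steady-state step with no resumed requests:

```python
# Hypothetical steady-state step: a large running batch, nothing resumed.
running_req_ids = [f"req-{i}" for i in range(1024)]
resumed_reqs: list[str] = []  # preemption resumes are rare

# Old representation: two per-request containers rebuilt on every step.
resumed_from_preemption = [False] * len(running_req_ids) + [True] * len(resumed_reqs)
resumed_req_token_ids: list[list[int] | None] = [None] * len(running_req_ids)

# New representation: empty containers on the common path.
resumed_req_ids: set[str] = set(resumed_reqs)
all_token_ids: dict[str, list[int]] = {}

print(len(resumed_from_preemption), len(resumed_req_token_ids))  # 1024 1024
print(len(resumed_req_ids), len(all_token_ids))                  # 0 0
```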