Cleanup basic and entrypoint test organisation #27747
base: main
Changes from all commits: ac48cca, 04e0269, fe584b3, 183ff2e, 30369e8
@@ -2,19 +2,18 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import torch

from vllm import LLM

from ...utils import create_new_process_for_each_test

pytestmark = pytest.mark.multi_gpu_test(num_gpus=2)


@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("backend", ["mp", "ray"])
@create_new_process_for_each_test()
def test_collective_rpc(tp_size, backend, monkeypatch):
    if torch.cuda.device_count() < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
    if tp_size == 1 and backend == "ray":
        pytest.skip("Skip duplicate test case")
Comment on lines 4 to 18:
The test previously skipped when …
    if tp_size == 1:
@@ -0,0 +1,199 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
from vllm.transformers_utils.tokenizer import AnyTokenizer


@pytest.fixture
def qwen_tokenizer() -> AnyTokenizer:
    from vllm.transformers_utils.tokenizer import get_tokenizer

    return get_tokenizer("Qwen/Qwen3-32B")


@pytest.fixture
def hermes_parser(qwen_tokenizer: AnyTokenizer) -> Hermes2ProToolParser:
    return Hermes2ProToolParser(qwen_tokenizer)


@pytest.fixture
def any_chat_request() -> ChatCompletionRequest:
    return ChatCompletionRequest(
        seed=42,
        model="Qwen/Qwen3-32B",
        messages=[],
    )


def test_hermes_parser_streaming_just_forward_text(
    qwen_tokenizer: AnyTokenizer,
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    text = """This is some prior text that has nothing to do with tool calling."""
    tokens = qwen_tokenizer.encode(text)
    previous_text = ""
    delta_messages = []
    for token in tokens:
        delta_text = qwen_tokenizer.decode([token])
        current_text = previous_text + delta_text
        delta = hermes_parser.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=delta_text,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=any_chat_request,
        )
        previous_text = current_text
        delta_messages.append(delta)

    for delta in delta_messages:
        assert delta is not None
        assert not delta.tool_calls

    print(delta_messages)
    assert "".join([delta.content for delta in delta_messages]) == text


def test_hermes_parser_streaming_failure_case_bug_19056(
    qwen_tokenizer: AnyTokenizer,
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}
</tool_call>"""
    tokens = qwen_tokenizer.encode(text)
    previous_text = ""
    delta_messages = []
    for token in tokens:
        text = qwen_tokenizer.decode([token])
        current_text = previous_text + text
        delta = hermes_parser.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=text,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=any_chat_request,
        )
        previous_text = current_text
        if delta is not None:
            delta_messages.append(delta)

    assert delta_messages[0].tool_calls[0].function.name == "final_answer"
    tool_call_args = "".join(
        delta.tool_calls[0].function.arguments or "" for delta in delta_messages
    )
    assert tool_call_args == '{"trigger": true}'


def test_hermes_parser_streaming(
    qwen_tokenizer: AnyTokenizer,
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    text = '<tool_call>\
{"name": "get_current_temperature",\
"arguments": {"location":\
"San Francisco, California, United States", "unit": "celsius"}}\
</tool_call>'

    tokens = qwen_tokenizer.encode(text)
    previous_text = ""
    delta_messages = []
    for token in tokens:
        text = qwen_tokenizer.decode([token])
        current_text = previous_text + text
        delta = hermes_parser.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=text,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=any_chat_request,
        )
        previous_text = current_text
        if delta is not None:
            delta_messages.append(delta)
    print(delta_messages)
    assert delta_messages[0].tool_calls[0].function.name == "get_current_temperature"
    tool_call_args = "".join(
        delta.tool_calls[0].function.arguments or "" for delta in delta_messages
    )
    assert tool_call_args == (
        '{"location":"San Francisco, California, United States", "unit": "celsius"}'
    )


def test_hermes_parser_non_streaming_no_tool_call(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    text = """This is not a tool call."""
    tool_call = hermes_parser.extract_tool_calls(
        model_output=text,
        request=any_chat_request,
    )

    assert tool_call is not None
    assert not tool_call.tools_called


def test_hermes_parser_non_streaming_tool_call_between_tags(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}
</tool_call>"""
    tool_call = hermes_parser.extract_tool_calls(
        model_output=text,
        request=any_chat_request,
    )

    assert tool_call is not None
    assert tool_call.tools_called
    assert tool_call.tool_calls[0].function.name == "final_answer"
    assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}'


def test_hermes_parser_non_streaming_tool_call_until_eos(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}"""
    tool_call = hermes_parser.extract_tool_calls(
        model_output=text,
        request=any_chat_request,
    )

    assert tool_call is not None
    assert tool_call.tools_called
    assert tool_call.tool_calls[0].function.name == "final_answer"
    assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}'


def test_hermes_parser_non_streaming_tool_call_invalid_json(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    # Missing closing brace to trigger exception
    text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}"""
    tool_call = hermes_parser.extract_tool_calls(
        model_output=text,
        request=any_chat_request,
    )

    assert tool_call is not None
    assert not tool_call.tools_called
The `test_collective_rpc` test is marked as a multi-GPU test, but the `pytestmark` is defined after the imports. This could lead to tests being run without the multi-GPU setup if pytest runs before the mark is applied. It's better to define `pytestmark` at the top of the file to ensure it's always applied. This is a critical issue, as it can lead to tests passing incorrectly when they should be failing due to insufficient GPU resources.
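As background for this exchange, a small self-contained sketch of how pytest handles a module-level `pytestmark` (the file name, the `slow` marker, and the test names here are hypothetical and not part of this PR):

```python
# test_pytestmark_placement.py -- hypothetical example, not from this PR
import pytest


def test_first():
    assert True


# pytest reads the module-level `pytestmark` attribute during collection,
# after the module has been fully imported, so both tests in this file carry
# the mark even though the assignment appears between them.
pytestmark = pytest.mark.slow


def test_second():
    assert True
```

Running `pytest -m slow` against this file selects both tests (registering `slow` in the project's pytest config avoids the unknown-mark warning), so placing the assignment at the top of the file, as the comment suggests, is mainly a readability convention.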
We don't do this anywhere else that we use `pytestmark`.