36 changes: 20 additions & 16 deletions .buildkite/test-pipeline.yaml
@@ -89,25 +89,20 @@ steps:
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- tests/basic_correctness/test_cumem.py
- tests/basic_correctness/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- pytest -v -s basic_correctness

- label: Entrypoints Unit Tests # 5min
timeout_in_minutes: 10
working_dir: "/vllm-workspace/tests"
fast_check: true
source_file_dependencies:
- vllm/entrypoints
- tests/entrypoints/
- tests/entrypoints/unit
commands:
- pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- pytest -v -s entrypoints/unit

- label: Entrypoints Integration Test (LLM) # 30min
timeout_in_minutes: 40
@@ -121,25 +116,34 @@ steps:
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Entrypoints Integration Test (API Server) # 100min
timeout_in_minutes: 130
- label: Entrypoints Integration Test (API Server) %N # 50min each
timeout_in_minutes: 65
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/entrypoints/openai
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py
# PYTHONPATH is needed to import custom Worker extension
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py \
--shard-id=$$BUILDKITE_PARALLEL_JOB \
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
- pytest -v -s entrypoints/openai \
--ignore=entrypoints/openai/test_chat_with_tool_reasoning.py \
--ignore=entrypoints/openai/test_oot_registration.py \
--ignore=entrypoints/openai/test_tensorizer_entrypoint.py \
--ignore=entrypoints/openai/correctness/ \
--ignore=entrypoints/openai/test_collective_rpc.py \
--shard-id=$$BUILDKITE_PARALLEL_JOB \
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2

- label: Entrypoints Integration Test (Pooling)
timeout_in_minutes: 50
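Note on the new sharded API Server step above: Buildkite's parallelism: 2 runs the step on two agents and sets BUILDKITE_PARALLEL_JOB (the 0-based job index) and BUILDKITE_PARALLEL_JOB_COUNT on each of them; the --shard-id/--num-shards flags (presumably provided by the pytest-shard plugin) use those values to split the collected tests, the $$ escaping defers variable expansion to the agent rather than the pipeline upload, and %N in the label is replaced with the parallel job number. A minimal, hypothetical sketch of the pattern (step name and test path are illustrative, not taken from this pipeline):

- label: Example Sharded Suite %N   # %N is replaced with the parallel job number
  parallelism: 2                    # fan the step out to two agents
  commands:
    # $$ keeps the variables from being expanded at pipeline upload time;
    # each agent sees its own BUILDKITE_PARALLEL_JOB / _JOB_COUNT values.
    - pytest -v -s some_tests/ \
        --shard-id=$$BUILDKITE_PARALLEL_JOB \
        --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT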
5 changes: 2 additions & 3 deletions tests/entrypoints/llm/test_collective_rpc.py
@@ -2,19 +2,18 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import torch

from vllm import LLM

from ...utils import create_new_process_for_each_test

pytestmark = pytest.mark.multi_gpu_test(num_gpus=2)


@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("backend", ["mp", "ray"])
@create_new_process_for_each_test()
def test_collective_rpc(tp_size, backend, monkeypatch):
if torch.cuda.device_count() < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
Comment on lines +11 to -17 (Contributor):

critical

The test_collective_rpc test is marked as a multi-GPU test, but the pytestmark is defined after the imports. This could lead to tests being run without the multi-gpu setup if pytest runs before the mark is applied. It's better to define pytestmark at the top of the file to ensure it's always applied. This is a critical issue as it can lead to tests passing incorrectly when they should be failing due to insufficient GPU resources.

pytestmark = pytest.mark.multi_gpu_test(num_gpus=2)


from ...utils import create_new_process_for_each_test


@pytest.mark.parametrize("tp_size", [1, 2])

Reply (Member Author):

We don't do this anywhere else that we use pytestmark

if tp_size == 1 and backend == "ray":
pytest.skip("Skip duplicate test case")
Comment on lines 4 to 18:

P1: Removing device-count guard lets multi-GPU test run on single GPU

The test previously skipped when torch.cuda.device_count() < tp_size, but the new module-level pytestmark = pytest.mark.multi_gpu_test(num_gpus=2) does not perform that check—the multi_gpu_test skip logic lives in the decorator in tests/utils.py, not in a pytest marker. As a result, on environments with only one GPU this test will now run and attempt to create an LLM with tensor_parallel_size=2, causing a failure instead of a skip. To restore the behaviour, either keep the explicit device-count guard or apply the multi_gpu_test decorator (or its marks) so the skip happens before the LLM is constructed.
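A minimal sketch of the first option the comment suggests (not the PR's code): restore the explicit device-count guard next to the module-level mark, which also means keeping the torch import; applying the multi_gpu_test decorator instead would achieve the same skip.

import pytest
import torch

from ...utils import create_new_process_for_each_test

pytestmark = pytest.mark.multi_gpu_test(num_gpus=2)


@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("backend", ["mp", "ray"])
@create_new_process_for_each_test()
def test_collective_rpc(tp_size, backend, monkeypatch):
    # Skip (rather than fail) on machines with fewer GPUs than the
    # requested tensor parallel size.
    if torch.cuda.device_count() < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
    if tp_size == 1 and backend == "ray":
        pytest.skip("Skip duplicate test case")
    ...  # rest of the test body unchanged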


if tp_size == 1:
@@ -31,8 +31,8 @@
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer

from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import VLLM_PATH
from ...models.registry import HF_EXAMPLE_MODELS
from ...utils import VLLM_PATH

EXAMPLES_DIR = VLLM_PATH / "examples"

Empty file.
199 changes: 199 additions & 0 deletions tests/entrypoints/unit/tool_parsers/test_hermes_tool_parser.py
@@ -0,0 +1,199 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
from vllm.transformers_utils.tokenizer import AnyTokenizer


@pytest.fixture
def qwen_tokenizer() -> AnyTokenizer:
from vllm.transformers_utils.tokenizer import get_tokenizer

return get_tokenizer("Qwen/Qwen3-32B")


@pytest.fixture
def hermes_parser(qwen_tokenizer: AnyTokenizer) -> Hermes2ProToolParser:
return Hermes2ProToolParser(qwen_tokenizer)


@pytest.fixture
def any_chat_request() -> ChatCompletionRequest:
return ChatCompletionRequest(
seed=42,
model="Qwen/Qwen3-32B",
messages=[],
)


def test_hermes_parser_streaming_just_forward_text(
qwen_tokenizer: AnyTokenizer,
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
text = """This is some prior text that has nothing to do with tool calling."""
tokens = qwen_tokenizer.encode(text)
previous_text = ""
delta_messages = []
for token in tokens:
delta_text = qwen_tokenizer.decode([token])
current_text = previous_text + delta_text
delta = hermes_parser.extract_tool_calls_streaming(
previous_text=previous_text,
current_text=current_text,
delta_text=delta_text,
previous_token_ids=[],
current_token_ids=[],
delta_token_ids=[],
request=any_chat_request,
)
previous_text = current_text
delta_messages.append(delta)

for delta in delta_messages:
assert delta is not None
assert not delta.tool_calls

print(delta_messages)
assert "".join([delta.content for delta in delta_messages]) == text


def test_hermes_parser_streaming_failure_case_bug_19056(
qwen_tokenizer: AnyTokenizer,
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}
</tool_call>"""
tokens = qwen_tokenizer.encode(text)
previous_text = ""
delta_messages = []
for token in tokens:
text = qwen_tokenizer.decode([token])
current_text = previous_text + text
delta = hermes_parser.extract_tool_calls_streaming(
previous_text=previous_text,
current_text=current_text,
delta_text=text,
previous_token_ids=[],
current_token_ids=[],
delta_token_ids=[],
request=any_chat_request,
)
previous_text = current_text
if delta is not None:
delta_messages.append(delta)

assert delta_messages[0].tool_calls[0].function.name == "final_answer"
tool_call_args = "".join(
delta.tool_calls[0].function.arguments or "" for delta in delta_messages
)
assert tool_call_args == '{"trigger": true}'


def test_hermes_parser_streaming(
qwen_tokenizer: AnyTokenizer,
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
text = '<tool_call>\
{"name": "get_current_temperature",\
"arguments": {"location":\
"San Francisco, California, United States", "unit": "celsius"}}\
</tool_call>'

tokens = qwen_tokenizer.encode(text)
previous_text = ""
delta_messages = []
for token in tokens:
text = qwen_tokenizer.decode([token])
current_text = previous_text + text
delta = hermes_parser.extract_tool_calls_streaming(
previous_text=previous_text,
current_text=current_text,
delta_text=text,
previous_token_ids=[],
current_token_ids=[],
delta_token_ids=[],
request=any_chat_request,
)
previous_text = current_text
if delta is not None:
delta_messages.append(delta)
print(delta_messages)
assert delta_messages[0].tool_calls[0].function.name == "get_current_temperature"
tool_call_args = "".join(
delta.tool_calls[0].function.arguments or "" for delta in delta_messages
)
assert tool_call_args == (
'{"location":"San Francisco, California, United States", "unit": "celsius"}'
)


def test_hermes_parser_non_streaming_no_tool_call(
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
text = """This is not a tool call."""
tool_call = hermes_parser.extract_tool_calls(
model_output=text,
request=any_chat_request,
)

assert tool_call is not None
assert not tool_call.tools_called


def test_hermes_parser_non_streaming_tool_call_between_tags(
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}
</tool_call>"""
tool_call = hermes_parser.extract_tool_calls(
model_output=text,
request=any_chat_request,
)

assert tool_call is not None
assert tool_call.tools_called
assert tool_call.tool_calls[0].function.name == "final_answer"
assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}'


def test_hermes_parser_non_streaming_tool_call_until_eos(
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}"""
tool_call = hermes_parser.extract_tool_calls(
model_output=text,
request=any_chat_request,
)

assert tool_call is not None
assert tool_call.tools_called
assert tool_call.tool_calls[0].function.name == "final_answer"
assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}'


def test_hermes_parser_non_streaming_tool_call_invalid_json(
hermes_parser: Hermes2ProToolParser,
any_chat_request: ChatCompletionRequest,
) -> None:
# Missing closing brace to trigger exception
text = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}"""
tool_call = hermes_parser.extract_tool_calls(
model_output=text,
request=any_chat_request,
)

assert tool_call is not None
assert not tool_call.tools_called