From 3dfa4d894a6471ff394e4ef05b1a4f1977a288b7 Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Mon, 6 Oct 2025 17:55:10 -0400
Subject: [PATCH 1/2] [CI/Build] Add common tool call parser test suite

This adds a common test suite for tool call parsers and wires all of
the existing tool call parsers that had no tests into the common suite.
It doesn't yet adapt existing tool call parser tests to the common
suite, nor does it augment parsers that already had tests with the new
set of common tests. Those tasks can come later, as this PR is already
quite large.

Not all of the existing tool call parsers can pass every test in the
common test suite. The ones that do not pass today are marked as xfail,
and those marks represent opportunities to identify and fix gaps in
these tool call parsers until each parser has zero expected-to-fail
tests in the common suite.

Given how many tests are here, the default_tokenizer fixture used by
tests when tokenizing strings was also adjusted to be module-scoped, so
we don't construct a new tokenizer for every single test.

Signed-off-by: Ben Browning
---
 .../openai/tool_parsers/common_tests.py       | 369 ++++++++++++++++++
 .../openai/tool_parsers/conftest.py           |   2 +-
 .../test_deepseekv3_tool_parser.py            |  92 +++++
 .../test_granite_20b_fc_tool_parser.py        |  76 ++++
 .../tool_parsers/test_granite_tool_parser.py  | 118 ++++++
 .../test_internlm2_tool_parser.py             | 122 ++++++
 .../tool_parsers/test_longcat_tool_parser.py  | 101 +++++
 .../tool_parsers/test_phi4mini_tool_parser.py | 110 ++++++
 .../tool_parsers/test_qwen3xml_tool_parser.py |  74 ++++
 .../tool_parsers/test_step3_tool_parser.py    | 111 ++++++
 10 files changed, 1174 insertions(+), 1 deletion(-)
 create mode 100644 tests/entrypoints/openai/tool_parsers/common_tests.py
 create mode 100644 tests/entrypoints/openai/tool_parsers/test_deepseekv3_tool_parser.py
 create mode 100644 tests/entrypoints/openai/tool_parsers/test_granite_20b_fc_tool_parser.py
 create mode 100644 tests/entrypoints/openai/tool_parsers/test_granite_tool_parser.py
 create mode 100644 tests/entrypoints/openai/tool_parsers/test_internlm2_tool_parser.py
 create mode 100644 tests/entrypoints/openai/tool_parsers/test_longcat_tool_parser.py
 create mode 100644 tests/entrypoints/openai/tool_parsers/test_phi4mini_tool_parser.py
 create mode 100644 tests/entrypoints/openai/tool_parsers/test_qwen3xml_tool_parser.py
 create mode 100644 tests/entrypoints/openai/tool_parsers/test_step3_tool_parser.py

diff --git a/tests/entrypoints/openai/tool_parsers/common_tests.py b/tests/entrypoints/openai/tool_parsers/common_tests.py
new file mode 100644
index 000000000000..3bdefd867dc5
--- /dev/null
+++ b/tests/entrypoints/openai/tool_parsers/common_tests.py
@@ -0,0 +1,369 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+
+from tests.entrypoints.openai.tool_parsers.utils import run_tool_extraction
+from vllm.entrypoints.openai.tool_parsers import ToolParserManager
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+
+@dataclass
+class ToolParserTestConfig:
+    """Configuration for a tool parser's common tests.
+
+    This dataclass contains all the test data and expected results needed
+    to run the common test suite for a parser. Each parser test file
+    creates one instance of this config with parser-specific values.
+
+    Attributes:
+        parser_name: Name used with ToolParserManager (e.g., "mistral")
+
+    Test data (model outputs):
+        no_tool_calls_output: Plain text without any tool syntax
+        single_tool_call_output: One tool call with simple arguments
+        parallel_tool_calls_output: Multiple tool calls in one response
+        various_data_types_output: Tool call with various argument data types
+        empty_arguments_output: Tool call with no parameters
+        surrounding_text_output: Tool call mixed with regular text
+        escaped_strings_output: Tool call with escaped chars
+        malformed_input_outputs: List of invalid inputs
+
+    Expected results:
+        single_tool_call_expected_name: Expected function name
+        single_tool_call_expected_args: Expected arguments dict
+        parallel_tool_calls_count: Number of tools in parallel test
+        parallel_tool_calls_names: Function names in order
+        single_tool_call_expected_content: Content field when tool called
+        parallel_tool_calls_expected_content: Content for parallel test
+
+    xfail markers:
+        xfail_streaming: Mapping test name to xfail reason (streaming only)
+        xfail_nonstreaming: Mapping test name to xfail reason (non-streaming)
+
+    Special flags:
+        allow_empty_or_json_empty_args: True if "" or "{}" both valid for empty args
+    """
+
+    # Parser identification
+    parser_name: str
+
+    # Test data - model outputs for each common test
+    no_tool_calls_output: str
+    single_tool_call_output: str
+    parallel_tool_calls_output: str
+    various_data_types_output: str
+    empty_arguments_output: str
+    surrounding_text_output: str
+    escaped_strings_output: str
+    malformed_input_outputs: list[str]
+
+    # Expected results for specific tests (optional overrides)
+    single_tool_call_expected_name: str = "get_weather"
+    single_tool_call_expected_args: dict[str, Any] = field(
+        default_factory=lambda: {"city": "Tokyo"}
+    )
+    parallel_tool_calls_count: int = 2
+    parallel_tool_calls_names: list[str] = field(
+        default_factory=lambda: ["get_weather", "get_time"]
+    )
+
+    # xfail configuration - maps test name to xfail reason
+    xfail_streaming: dict[str, str] = field(default_factory=dict)
+    xfail_nonstreaming: dict[str, str] = field(default_factory=dict)
+
+    # Content expectations (some parsers strip content, others don't)
+    single_tool_call_expected_content: str | None = None
+    parallel_tool_calls_expected_content: str | None = None
+
+    # Special assertions for edge cases
+    allow_empty_or_json_empty_args: bool = True  # "{}" or "" for empty args
+
+
+class ToolParserTests:
+    """Mixin class providing a common test suite for tool parsers.
+
+    To use this mixin in a parser test file:
+
+    1. Create a test_config fixture that returns a ToolParserTestConfig instance
+    2. Inherit from this class
+    3. Add parser-specific tests as additional methods
+
+    Example:
+        class TestMistralToolParser(ToolParserTests):
+            @pytest.fixture
+            def test_config(self) -> ToolParserTestConfig:
+                return ToolParserTestConfig(
+                    parser_name="mistral",
+                    no_tool_calls_output="Plain text...",
+                    # ... other config ...
+                )
+
+            # Parser-specific tests
+            def test_mistral_specific_feature(self, tool_parser):
+                # Custom test logic
+                pass
+    """
+
+    @pytest.fixture
+    def test_config(self) -> ToolParserTestConfig:
+        """Override this to provide parser-specific configuration."""
+        raise NotImplementedError(
+            "Subclass must provide test_config fixture returning ToolParserTestConfig"
+        )
+
+    @pytest.fixture
+    def tokenizer(self, default_tokenizer: AnyTokenizer) -> AnyTokenizer:
+        """Override this to provide parser-specific tokenizer."""
+        return default_tokenizer
+
+    @pytest.fixture
+    def tool_parser(self, test_config: ToolParserTestConfig, tokenizer: AnyTokenizer):
+        return ToolParserManager.get_tool_parser(test_config.parser_name)(tokenizer)
+
+    @pytest.fixture(params=[True, False])
+    def streaming(self, request: pytest.FixtureRequest) -> bool:
+        return request.param
+
+    def test_no_tool_calls(
+        self,
+        request: pytest.FixtureRequest,
+        tool_parser: Any,
+        test_config: ToolParserTestConfig,
+        streaming: bool,
+    ):
+        """Verify parser handles plain text without tool syntax."""
+        # Apply xfail markers if configured
+        test_name = "test_no_tool_calls"
+        self.apply_xfail_mark(request, test_config, test_name, streaming)
+
+        content, tool_calls = run_tool_extraction(
+            tool_parser, test_config.no_tool_calls_output, streaming=streaming
+        )
+        assert content == test_config.no_tool_calls_output, (
+            f"Expected content to match input, got {content}"
+        )
+        assert len(tool_calls) == 0, f"Expected no tool calls, got {len(tool_calls)}"
+
+    def test_single_tool_call_simple_args(
+        self,
+        request: pytest.FixtureRequest,
+        tool_parser: Any,
+        test_config: ToolParserTestConfig,
+        streaming: bool,
+    ):
+        """Verify parser extracts one tool with simple arguments."""
+        # Apply xfail markers if configured
+        test_name = "test_single_tool_call_simple_args"
+        self.apply_xfail_mark(request, test_config, test_name, streaming)
+
+        content, tool_calls = run_tool_extraction(
+            tool_parser, test_config.single_tool_call_output, streaming=streaming
+        )
+
+        # Content check (some parsers strip it)
+        if test_config.single_tool_call_expected_content is not None:
+            assert content == test_config.single_tool_call_expected_content
+
+        assert len(tool_calls) == 1, f"Expected 1 tool call, got {len(tool_calls)}"
+        assert tool_calls[0].type == "function"
+        assert tool_calls[0].function.name == test_config.single_tool_call_expected_name
+
+        args = json.loads(tool_calls[0].function.arguments)
+        for key, value in test_config.single_tool_call_expected_args.items():
+            assert args.get(key) == value, (
+                f"Expected {key}={value}, got {args.get(key)}"
+            )
+
+    def test_parallel_tool_calls(
+        self,
+        request: pytest.FixtureRequest,
+        tool_parser: Any,
+        test_config: ToolParserTestConfig,
+        streaming: bool,
+    ):
+        """Verify parser handles multiple tools in one response."""
+        # Apply xfail markers if configured
+        test_name = "test_parallel_tool_calls"
+        self.apply_xfail_mark(request, test_config, test_name, streaming)
+
+        content, tool_calls = run_tool_extraction(
+            tool_parser,
+            test_config.parallel_tool_calls_output,
+            streaming=streaming,
+        )
+
+        assert len(tool_calls) == test_config.parallel_tool_calls_count, (
+            f"Expected {test_config.parallel_tool_calls_count} "
+            f"tool calls, got {len(tool_calls)}"
+        )
+
+        # Verify tool names match expected
+        for i, expected_name in enumerate(test_config.parallel_tool_calls_names):
+            assert tool_calls[i].type == "function"
+            assert tool_calls[i].function.name == expected_name
+
+        # Verify unique IDs
+        ids = [tc.id for tc in tool_calls]
+        assert len(ids) == len(set(ids)), "Tool call IDs should be unique"
+
+    def test_various_data_types(
+        self,
+        request: pytest.FixtureRequest,
+        tool_parser: Any,
+        test_config: ToolParserTestConfig,
+        streaming: bool,
+    ):
+        """Verify parser handles all JSON types in arguments."""
+        # Apply xfail markers if configured
+        test_name = "test_various_data_types"
+        self.apply_xfail_mark(request, test_config, test_name, streaming)
+
+        content, tool_calls = run_tool_extraction(
+            tool_parser,
+            test_config.various_data_types_output,
+            streaming=streaming,
+        )
+        assert len(tool_calls) == 1, f"Expected 1 tool call, got {len(tool_calls)}"
+
+        args = json.loads(tool_calls[0].function.arguments)
+        # Verify all expected fields present
+        required_fields = [
+            "string_field",
+            "int_field",
+            "float_field",
+            "bool_field",
+            "null_field",
+            "array_field",
+            "object_field",
+        ]
+        for required_field in required_fields:
+            assert required_field in args, (
+                f"Expected field '{required_field}' in arguments"
+            )
+
+    def test_empty_arguments(
+        self,
+        request: pytest.FixtureRequest,
+        tool_parser: Any,
+        test_config: ToolParserTestConfig,
+        streaming: bool,
+    ):
+        """Verify parser handles parameterless tool calls."""
+        # Apply xfail markers if configured
+        test_name = "test_empty_arguments"
+        self.apply_xfail_mark(request, test_config, test_name, streaming)
+
+        content, tool_calls = run_tool_extraction(
+            tool_parser, test_config.empty_arguments_output, streaming=streaming
+        )
+        assert len(tool_calls) == 1, f"Expected 1 tool call, got {len(tool_calls)}"
+
+        args = tool_calls[0].function.arguments
+        if test_config.allow_empty_or_json_empty_args:
+            assert args in ["{}", ""], f"Expected empty args, got {args}"
+        else:
+            assert args == "{}", f"Expected {{}}, got {args}"
+
+    def test_surrounding_text(
+        self,
+        request: pytest.FixtureRequest,
+        tool_parser: Any,
+        test_config: ToolParserTestConfig,
+        streaming: bool,
+    ):
+        """Verify parser extracts tools from mixed content."""
+        # Apply xfail markers if configured
+        test_name = "test_surrounding_text"
+        self.apply_xfail_mark(request, test_config, test_name, streaming)
+
+        content, tool_calls = run_tool_extraction(
+            tool_parser, test_config.surrounding_text_output, streaming=streaming
+        )
+        assert len(tool_calls) >= 1, (
+            f"Expected at least 1 tool call, got {len(tool_calls)}"
+        )
+
+    def test_escaped_strings(
+        self,
+        request: pytest.FixtureRequest,
+        tool_parser: Any,
+        test_config: ToolParserTestConfig,
+        streaming: bool,
+    ):
+        """Verify parser handles escaped characters in arguments."""
+        # Apply xfail markers if configured
+        test_name = "test_escaped_strings"
+        self.apply_xfail_mark(request, test_config, test_name, streaming)
+
+        content, tool_calls = run_tool_extraction(
+            tool_parser, test_config.escaped_strings_output, streaming=streaming
+        )
+        assert len(tool_calls) == 1, f"Expected 1 tool call, got {len(tool_calls)}"
+
+        args = json.loads(tool_calls[0].function.arguments)
+        # At minimum, verify we can parse and have expected fields
+        # Exact escaping behavior varies by parser
+        assert len(args) > 0, "Expected some arguments with escaped strings"
+
+    def test_malformed_input(
+        self,
+        request: pytest.FixtureRequest,
+        tool_parser: Any,
+        test_config: ToolParserTestConfig,
+        streaming: bool,
+    ):
+        """Verify parser gracefully handles invalid syntax."""
+        # Apply xfail markers if configured
+        test_name = "test_malformed_input"
+        self.apply_xfail_mark(request, test_config, test_name, streaming)
+
+        for malformed_input in test_config.malformed_input_outputs:
+            # Should not raise exception
+            content, tool_calls = run_tool_extraction(
+                tool_parser, malformed_input, streaming=streaming
+            )
+            # Parser should handle gracefully (exact behavior varies)
+
+    def test_streaming_reconstruction(
+        self,
+        request: pytest.FixtureRequest,
+        tool_parser: Any,
+        test_config: ToolParserTestConfig,
+    ):
+        """Verify streaming produces same result as non-streaming."""
+        test_name = "test_streaming_reconstruction"
+        self.apply_xfail_mark(request, test_config, test_name, True)
+
+        test_output = test_config.single_tool_call_output
+
+        # Non-streaming result
+        content_non, tools_non = run_tool_extraction(
+            tool_parser, test_output, streaming=False
+        )
+
+        # Streaming result
+        content_stream, tools_stream = run_tool_extraction(
+            tool_parser, test_output, streaming=True
+        )
+
+        # Compare results
+        assert content_non == content_stream, "Content should match between modes"
+        assert len(tools_non) == len(tools_stream), "Tool count should match"
+        if len(tools_non) > 0:
+            assert tools_non[0].function.name == tools_stream[0].function.name
+            assert tools_non[0].function.arguments == tools_stream[0].function.arguments
+
+    def apply_xfail_mark(self, request, test_config, test_name, streaming):
+        reason = None
+        if streaming and test_name in test_config.xfail_streaming:
+            reason = test_config.xfail_streaming[test_name]
+        elif not streaming and test_name in test_config.xfail_nonstreaming:
+            reason = test_config.xfail_nonstreaming[test_name]
+        if reason is not None:
+            mark = pytest.mark.xfail(reason=reason, strict=True)
+            request.node.add_marker(mark)
diff --git a/tests/entrypoints/openai/tool_parsers/conftest.py b/tests/entrypoints/openai/tool_parsers/conftest.py
index f2ac5e5b9a8f..4a1e7d08c2fa 100644
--- a/tests/entrypoints/openai/tool_parsers/conftest.py
+++ b/tests/entrypoints/openai/tool_parsers/conftest.py
@@ -7,6 +7,6 @@
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="module")
 def default_tokenizer() -> AnyTokenizer:
     return AutoTokenizer.from_pretrained("gpt2")
diff --git a/tests/entrypoints/openai/tool_parsers/test_deepseekv3_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_deepseekv3_tool_parser.py
new file mode 100644
index 000000000000..c130f446dddf
--- /dev/null
+++ b/tests/entrypoints/openai/tool_parsers/test_deepseekv3_tool_parser.py
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+
+from tests.entrypoints.openai.tool_parsers.common_tests import (
+    ToolParserTestConfig,
+    ToolParserTests,
+)
+from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
+
+
+class TestDeepSeekV3ToolParser(ToolParserTests):
+    @pytest.fixture(scope="class")
+    def tokenizer(self) -> AnyTokenizer:
+        return get_tokenizer("deepseek-ai/DeepSeek-V3")
+
+    @pytest.fixture
+    def test_config(self) -> ToolParserTestConfig:
+        return ToolParserTestConfig(
+            parser_name="deepseek_v3",
+            # Test data
+            no_tool_calls_output=(
+                "How can I help you today? I can check weather for you."
+            ),
+            single_tool_call_output="""<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_weather
+```json
+{"city": "Tokyo", "unit": "celsius"}
+```<|tool▁call▁end|><|tool▁calls▁end|>""",
+            parallel_tool_calls_output="""<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_weather
+```json
+{"city": "Tokyo", "unit": "celsius"}
+```<|tool▁call▁end|><|tool▁call▁begin|>function<|tool▁sep|>search_hotels
+```json
+{"location": "Tokyo", "check_in": "2025-01-15"}
+```<|tool▁call▁end|><|tool▁calls▁end|>""",
+            various_data_types_output=(
+                """<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>test_function
+```json
+"""
+                """{"string_field": "hello", "int_field": 42, "float_field": 3.14, """
+                """"bool_field": true, "null_field": null, """
+                """"array_field": ["a", "b", "c"], """
+                """"object_field": {"nested": "value"}, """
+                """"empty_array": [], "empty_object": {}}
+```<|tool▁call▁end|><|tool▁calls▁end|>"""
+            ),
+            empty_arguments_output="""<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_current_time
+```json
+{}
+```<|tool▁call▁end|><|tool▁calls▁end|>""",
+            surrounding_text_output=(
+                """Let me check the weather for you."""
+                """<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_weather
+```json
+{"city": "Paris"}
+```<|tool▁call▁end|><|tool▁calls▁end|>"""
+            ),
+            escaped_strings_output=(
+                """<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>send_message
+```json
+"""
+                """{"text": "He said \\"hello\\"", "path": "C:\\\\Users\\\\file", """
+                """"newline": "line1\\nline2"}
+```<|tool▁call▁end|><|tool▁calls▁end|>"""
+            ),
+            malformed_input_outputs=[
+                """<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_weather
+```json
+{"city": "Tokyo"
+```<|tool▁call▁end|><|tool▁calls▁end|>""",
+                """<|tool▁calls▁begin|>function<|tool▁sep|>get_weather
+```json
+{"city": "Tokyo"}
+```<|tool▁calls▁end|>""",
+            ],
+            # Expected results
+            single_tool_call_expected_name="get_weather",
+            single_tool_call_expected_args={"city": "Tokyo", "unit": "celsius"},
+            single_tool_call_expected_content=None,
+            parallel_tool_calls_count=2,
+            parallel_tool_calls_names=["get_weather", "search_hotels"],
+            # xfail markers
+            xfail_streaming={},
+            xfail_nonstreaming={
+                "test_malformed_input": (
+                    "Parser sets tools_called=True even when tool_calls is "
+                    "empty (detects start token but fails to parse)"
+                ),
+            },
+        )
diff --git a/tests/entrypoints/openai/tool_parsers/test_granite_20b_fc_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_granite_20b_fc_tool_parser.py
new file mode 100644
index 000000000000..d25d2ffd2910
--- /dev/null
+++ b/tests/entrypoints/openai/tool_parsers/test_granite_20b_fc_tool_parser.py
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from tests.entrypoints.openai.tool_parsers.common_tests import (
+    ToolParserTestConfig,
+    ToolParserTests,
+)
+
+
+class TestGranite20bFcToolParser(ToolParserTests):
+    @pytest.fixture
+    def test_config(self) -> ToolParserTestConfig:
+        return ToolParserTestConfig(
+            parser_name="granite-20b-fc",
+            # Test data
+            no_tool_calls_output="This is a regular response without any tool calls.",
+            single_tool_call_output=(
+                '<function_call> {"name": "get_weather", '
+                '"arguments": {"city": "Tokyo"}}'
+            ),
+            parallel_tool_calls_output=(
+                '<function_call> {"name": "get_weather", '
+                '"arguments": {"city": "Tokyo"}}\n'
+                '<function_call> {"name": "get_time", '
+                '"arguments": {"timezone": "Asia/Tokyo"}}'
+            ),
+            various_data_types_output="""<function_call> {
+    "name": "test_function",
+    "arguments": {
+        "string_field": "hello",
+        "int_field": 42,
+        "float_field": 3.14,
+        "bool_field": true,
+        "null_field": null,
+        "array_field": ["a", "b", "c"],
+        "object_field": {"nested": "value"},
+        "empty_array": [],
+        "empty_object": {}
+    }
+}""",
+            empty_arguments_output=(
+                '<function_call> {"name": "refresh", "arguments": {}}'
+            ),
+            surrounding_text_output="""Let me check the weather for you.
+<function_call> {"name": "get_weather", "arguments": {"city": "Tokyo"}}""",
+            escaped_strings_output="""<function_call> {
+    "name": "test_function",
+    "arguments": {
+        "quoted": "He said \\"hello\\"",
+        "path": "C:\\\\Users\\\\file.txt",
+        "newline": "line1\\nline2",
+        "unicode": "emoji: 🎉"
+    }
+}""",
+            malformed_input_outputs=[
+                '<function_call> {"name": "func", "arguments": {',
+                '<function_call> [{"name": "func", "arguments": {}}]',
+                '{"name": "func", "arguments": {}}',
+                '<function_call> {"name": 123}',
+            ],
+            # Expected results
+            single_tool_call_expected_name="get_weather",
+            single_tool_call_expected_args={"city": "Tokyo"},
+            single_tool_call_expected_content=None,
+            parallel_tool_calls_count=2,
+            parallel_tool_calls_names=["get_weather", "get_time"],
+            # xfail markers
+            xfail_streaming={
+                "test_surrounding_text": (
+                    "Granite 20B FC streaming requires <function_call> at start"
+                ),
+            },
+            xfail_nonstreaming={},
+        )
diff --git a/tests/entrypoints/openai/tool_parsers/test_granite_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_granite_tool_parser.py
new file mode 100644
index 000000000000..3ebc03e821b4
--- /dev/null
+++ b/tests/entrypoints/openai/tool_parsers/test_granite_tool_parser.py
@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+
+from tests.entrypoints.openai.tool_parsers.common_tests import (
+    ToolParserTestConfig,
+    ToolParserTests,
+)
+from tests.entrypoints.openai.tool_parsers.utils import run_tool_extraction
+
+
+class TestGraniteToolParser(ToolParserTests):
+    @pytest.fixture
+    def test_config(self) -> ToolParserTestConfig:
+        return ToolParserTestConfig(
+            parser_name="granite",
+            # Test data
+            no_tool_calls_output="This is a regular response without any tool calls.",
+            single_tool_call_output=(
+                '<|tool_call|> [{"name": "get_weather", '
+                '"arguments": {"city": "Tokyo"}}]'
+            ),
+            parallel_tool_calls_output="""<|tool_call|> [
+    {"name": "get_weather", "arguments": {"city": "Tokyo"}},
+    {"name": "get_time", "arguments": {"timezone": "Asia/Tokyo"}}
+]""",
+            various_data_types_output="""<tool_call> [{
+    "name": "test_function",
+    "arguments": {
+        "string_field": "hello",
+        "int_field": 42,
+        "float_field": 3.14,
+        "bool_field": true,
+        "null_field": null,
+        "array_field": ["a", "b", "c"],
+        "object_field": {"nested": "value"},
+        "empty_array": [],
+        "empty_object": {}
+    }
+}]""",
+            empty_arguments_output=(
+                '<|tool_call|> [{"name": "refresh", "arguments": {}}]'
+            ),
+            surrounding_text_output="""Let me check the weather for you.
+<|tool_call|> [{"name": "get_weather", "arguments": {"city": "Tokyo"}}]
+I'll get that information.""",
+            escaped_strings_output="""<tool_call> [{
+    "name": "test_function",
+    "arguments": {
+        "quoted": "He said \\"hello\\"",
+        "path": "C:\\\\Users\\\\file.txt",
+        "newline": "line1\\nline2",
+        "unicode": "emoji: 🎉"
+    }
+}]""",
+            malformed_input_outputs=[
+                '<|tool_call|> [{"name": "func", "arguments": {',
+                '<|tool_call|> {"name": "func", "arguments": {}}',  # Not an array
+                '[{"name": "func", "arguments": "not a dict"}]',
+                'Some text [{"name": "func"}]',  # JSON but not tool call format
+            ],
+            # Expected results
+            single_tool_call_expected_name="get_weather",
+            single_tool_call_expected_args={"city": "Tokyo"},
+            # Granite strips content when tool calls present
+            single_tool_call_expected_content=None,
+            parallel_tool_calls_count=2,
+            parallel_tool_calls_names=["get_weather", "get_time"],
+            # xfail markers
+            xfail_streaming={
+                "test_malformed_input": (
+                    "Streaming mode incorrectly creates tool call from malformed JSON"
+                ),
+                "test_surrounding_text": (
+                    "Parser doesn't handle surrounding text correctly in streaming"
+                ),
+                "test_streaming_reconstruction": (
+                    "Streaming mode doesn't strip <|tool_call|> marker from content"
+                ),
+            },
+            xfail_nonstreaming={
+                "test_surrounding_text": (
+                    "Parser doesn't handle surrounding text correctly in non-streaming"
+                ),
+            },
+        )
+
+    # Granite-Specific Tests
+
+    @pytest.mark.parametrize("streaming", [True, False])
+    def test_granite_token_prefix_format(self, tool_parser, streaming):
+        """Verify parser handles Granite 3.0 <|tool_call|> token format."""
+        single_tool_call_token = (
+            '<|tool_call|> [{"name": "get_weather", "arguments": {"city": "Tokyo"}}]'
+        )
+        content, tool_calls = run_tool_extraction(
+            tool_parser, single_tool_call_token, streaming=streaming
+        )
+        assert len(tool_calls) == 1, (
+            f"Expected 1 tool call from token format, got {len(tool_calls)}"
+        )
+        assert tool_calls[0].function.name == "get_weather"
+
+    @pytest.mark.parametrize("streaming", [True, False])
+    def test_granite_string_prefix_format(self, tool_parser, streaming):
+        """Verify parser handles Granite 3.1 <tool_call> string format."""
+        single_tool_call_string = (
+            '<tool_call> [{"name": "get_weather", "arguments": {"city": "Tokyo"}}]'
+        )
+        content, tool_calls = run_tool_extraction(
+            tool_parser, single_tool_call_string, streaming=streaming
+        )
+        assert len(tool_calls) == 1, (
+            f"Expected 1 tool call from string format, got {len(tool_calls)}"
+        )
+        assert tool_calls[0].function.name == "get_weather"
diff --git a/tests/entrypoints/openai/tool_parsers/test_internlm2_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_internlm2_tool_parser.py
new file mode 100644
index 000000000000..3c9a9aaf993e
--- /dev/null
+++ b/tests/entrypoints/openai/tool_parsers/test_internlm2_tool_parser.py
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from tests.entrypoints.openai.tool_parsers.common_tests import (
+    ToolParserTestConfig,
+    ToolParserTests,
+)
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+
+class TestInternLM2ToolParser(ToolParserTests):
+    @pytest.fixture
+    def tokenizer(self, default_tokenizer: AnyTokenizer) -> AnyTokenizer:
+        """Add some internlm2 specific tokens to the default vocab."""
+
+        tokenizer_vocab = default_tokenizer.get_vocab()
+        default_tokenizer.get_vocab = MagicMock()
+        tokenizer_vocab.update(
+            {
+                "<|action_start|>": 92540,
"<|plugin|>": 92541, + "<|action_end|>": 92542, + } + ) + default_tokenizer.get_vocab.return_value = tokenizer_vocab + return default_tokenizer + + @pytest.fixture + def test_config(self) -> ToolParserTestConfig: + return ToolParserTestConfig( + parser_name="internlm", + # Test data + no_tool_calls_output="This is a regular response without any tool calls.", + single_tool_call_output=( + '<|action_start|><|plugin|>{"name": "get_weather", ' + '"parameters": {"city": "Tokyo"}}<|action_end|>' + ), + # InternLM2 doesn't support parallel calls + parallel_tool_calls_output=( + '<|action_start|><|plugin|>{"name": "get_weather", ' + '"parameters": {"city": "Tokyo"}}<|action_end|>' + ), + various_data_types_output="""<|action_start|><|plugin|>{ + "name": "test_function", + "parameters": { + "string_field": "hello", + "int_field": 42, + "float_field": 3.14, + "bool_field": true, + "null_field": null, + "array_field": ["a", "b", "c"], + "object_field": {"nested": "value"}, + "empty_array": [], + "empty_object": {} + } +}<|action_end|>""", + empty_arguments_output=( + '<|action_start|><|plugin|>{"name": "refresh", ' + '"parameters": {}}<|action_end|>' + ), + surrounding_text_output=( + "Let me check the weather for you. " + '<|action_start|><|plugin|>{"name": "get_weather", ' + '"parameters": {"city": "Tokyo"}}<|action_end|>' + ), + escaped_strings_output="""<|action_start|><|plugin|>{ + "name": "test_function", + "parameters": { + "quoted": "He said \\"hello\\"", + "path": "C:\\\\Users\\\\file.txt", + "newline": "line1\\nline2", + "unicode": "emoji: 🎉" + } +}<|action_end|>""", + malformed_input_outputs=[ + '<|action_start|><|plugin|>{"name": "func", "parameters": {', + ( + '<|action_start|><|plugin|>{"name": "func", ' + '"parameters": "not a dict"}<|action_end|>' + ), + "<|action_start|><|plugin|>not json<|action_end|>", + "<|action_start|><|plugin|>", + '<|action_start|>{"name": "func"}', + ], + # Expected results + single_tool_call_expected_name="get_weather", + single_tool_call_expected_args={"city": "Tokyo"}, + single_tool_call_expected_content=None, + parallel_tool_calls_count=1, # InternLM2 only supports single tool calls + parallel_tool_calls_names=["get_weather"], + # Parser-specific settings + allow_empty_or_json_empty_args=True, + # xfail markers + xfail_streaming={ + "test_single_tool_call_simple_args": ( + "InternLM2 streaming not fully implemented" + ), + "test_parallel_tool_calls": ( + "InternLM2 streaming not fully implemented" + ), + "test_various_data_types": ( + "InternLM2 streaming not fully implemented" + ), + "test_empty_arguments": ("InternLM2 streaming not fully implemented"), + "test_surrounding_text": ("InternLM2 streaming not fully implemented"), + "test_escaped_strings": ("InternLM2 streaming not fully implemented"), + "test_streaming_reconstruction": ( + "InternLM2 streaming parser returns '<|action_start|' as " + "content instead of None - streaming/non-streaming inconsistency" + ), + }, + xfail_nonstreaming={ + "test_malformed_input": ( + "InternLM2 parser raises JSONDecodeError on malformed JSON " + "instead of gracefully handling it" + ), + }, + ) diff --git a/tests/entrypoints/openai/tool_parsers/test_longcat_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_longcat_tool_parser.py new file mode 100644 index 000000000000..f934c7f6c69b --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_longcat_tool_parser.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from 
unittest.mock import MagicMock + +import pytest + +from tests.entrypoints.openai.tool_parsers.common_tests import ( + ToolParserTestConfig, + ToolParserTests, +) +from vllm.transformers_utils.tokenizer import AnyTokenizer + + +class TestLongCatToolParser(ToolParserTests): + @pytest.fixture + def tokenizer(self, default_tokenizer: AnyTokenizer) -> AnyTokenizer: + """Add some longcat specific tokens to the default vocab.""" + tokenizer = default_tokenizer + tokenizer_vocab = tokenizer.get_vocab() + tokenizer.get_vocab = MagicMock() + tokenizer_vocab.update( + { + "": 32000, + "": 32001, + } + ) + tokenizer.get_vocab.return_value = tokenizer_vocab + return tokenizer + + @pytest.fixture + def test_config(self) -> ToolParserTestConfig: + return ToolParserTestConfig( + parser_name="longcat", + # Test data + no_tool_calls_output="This is a regular response without any tool calls.", + single_tool_call_output=( + '{"name": "get_weather", ' + '"arguments": {"city": "Tokyo"}}' + ), + parallel_tool_calls_output=( + '{"name": "get_weather", ' + '"arguments": {"city": "Tokyo"}}\n' + '{"name": "get_time", ' + '"arguments": {"timezone": "Asia/Tokyo"}}' + ), + various_data_types_output="""{ + "name": "test_function", + "arguments": { + "string_field": "hello", + "int_field": 42, + "float_field": 3.14, + "bool_field": true, + "null_field": null, + "array_field": ["a", "b", "c"], + "object_field": {"nested": "value"}, + "empty_array": [], + "empty_object": {} + } +}""", + empty_arguments_output=( + '{"name": "refresh", "arguments": {}}' + "" + ), + surrounding_text_output=( + "Let me check the weather for you.\n" + '{"name": "get_weather", ' + '"arguments": {"city": "Tokyo"}}\n' + "Here is the result." + ), + escaped_strings_output="""{ + "name": "test_function", + "arguments": { + "quoted": "He said \\"hello\\"", + "path": "C:\\\\Users\\\\file.txt", + "newline": "line1\\nline2", + "unicode": "emoji: 🎉" + } +}""", + malformed_input_outputs=[ + '{"name": "func", "arguments": {', + ( + '{"name": "func", ' + '"arguments": "not a dict"}' + ), + "Some text with invalid json", + ], + # Expected results + single_tool_call_expected_name="get_weather", + single_tool_call_expected_args={"city": "Tokyo"}, + single_tool_call_expected_content=None, + parallel_tool_calls_count=2, + parallel_tool_calls_names=["get_weather", "get_time"], + # xfail markers + xfail_streaming={ + "test_malformed_input": "Streaming has complex buffering behavior", + }, + xfail_nonstreaming={}, + # Configuration + allow_empty_or_json_empty_args=True, + ) diff --git a/tests/entrypoints/openai/tool_parsers/test_phi4mini_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_phi4mini_tool_parser.py new file mode 100644 index 000000000000..85dfe2ffc5a4 --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_phi4mini_tool_parser.py @@ -0,0 +1,110 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import MagicMock + +import pytest + +from tests.entrypoints.openai.tool_parsers.common_tests import ( + ToolParserTestConfig, + ToolParserTests, +) +from vllm.transformers_utils.tokenizer import AnyTokenizer + + +class TestPhi4MiniToolParser(ToolParserTests): + @pytest.fixture + def tokenizer(self, default_tokenizer: AnyTokenizer) -> AnyTokenizer: + """Add some phi4mini specific tokens to the default vocab.""" + + tokenizer = default_tokenizer + tokenizer_vocab = tokenizer.get_vocab() + tokenizer.get_vocab = MagicMock() + tokenizer_vocab.update( + { + "functools": 
32000, + } + ) + tokenizer.get_vocab.return_value = tokenizer_vocab + return tokenizer + + @pytest.fixture + def test_config(self) -> ToolParserTestConfig: + return ToolParserTestConfig( + parser_name="phi4_mini_json", + # Test data + no_tool_calls_output="This is a regular response without any tool calls.", + single_tool_call_output=( + 'functools[{"name": "get_weather", "arguments": {"city": "Tokyo"}}]' + ), + parallel_tool_calls_output="""functools[ + {"name": "get_weather", "arguments": {"city": "Tokyo"}}, + {"name": "get_time", "arguments": {"timezone": "Asia/Tokyo"}} +]""", + various_data_types_output="""functools[{ + "name": "test_function", + "arguments": { + "string_field": "hello", + "int_field": 42, + "float_field": 3.14, + "bool_field": true, + "null_field": null, + "array_field": ["a", "b", "c"], + "object_field": {"nested": "value"}, + "empty_array": [], + "empty_object": {} + } +}]""", + empty_arguments_output='functools[{"name": "refresh", "arguments": {}}]', + surrounding_text_output="""Let me check the weather for you. +functools[{"name": "get_weather", "arguments": {"city": "Tokyo"}}] +Would you like to know more?""", + escaped_strings_output="""functools[{ + "name": "test_function", + "arguments": { + "quoted": "He said \\"hello\\"", + "path": "C:\\\\Users\\\\file.txt", + "newline": "line1\\nline2", + "unicode": "emoji: 🎉" + } +}]""", + malformed_input_outputs=[ + 'functools[{"name": "func", "arguments": {', + 'functools[{"name": "func", "arguments": "not a dict"}]', + 'functools{"name": "func"}', # Missing brackets + 'functools[{"name": "func"}]', # Missing arguments/parameters + "functools[] This is just text", # Empty functools + "functools[ This is just text ]", # functools with invalid JSON + ], + # Expected results + single_tool_call_expected_name="get_weather", + single_tool_call_expected_args={"city": "Tokyo"}, + # Phi-4 Mini strips content when tool calls present + single_tool_call_expected_content=None, + parallel_tool_calls_count=2, + parallel_tool_calls_names=["get_weather", "get_time"], + parallel_tool_calls_expected_content=None, + # xfail markers + xfail_streaming={ + "test_no_tool_calls": "Phi4 Mini streaming not implemented", + "test_single_tool_call_simple_args": ( + "Phi4 Mini streaming not implemented" + ), + "test_parallel_tool_calls": "Phi4 Mini streaming not implemented", + "test_various_data_types": "Phi4 Mini streaming not implemented", + "test_empty_arguments": "Phi4 Mini streaming not implemented", + "test_surrounding_text": "Phi4 Mini streaming not implemented", + "test_escaped_strings": "Phi4 Mini streaming not implemented", + "test_streaming_reconstruction": "Phi4 Mini streaming not implemented", + }, + xfail_nonstreaming={ + "test_various_data_types": ( + "Phi4MiniJsonToolParser regex has nesting limitations " + "with nested objects" + ), + "test_malformed_input": ( + "Phi4MiniJsonToolParser incorrectly sets " + "tools_called=True on empty array" + ), + }, + ) diff --git a/tests/entrypoints/openai/tool_parsers/test_qwen3xml_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_qwen3xml_tool_parser.py new file mode 100644 index 000000000000..011df64a4733 --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_qwen3xml_tool_parser.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import pytest + +from tests.entrypoints.openai.tool_parsers.common_tests import ( + ToolParserTestConfig, + ToolParserTests, +) + + +class 
+    @pytest.fixture
+    def test_config(self) -> ToolParserTestConfig:
+        return ToolParserTestConfig(
+            parser_name="qwen3_xml",
+            # Test data
+            no_tool_calls_output="This is a regular response without any tool calls.",
+            single_tool_call_output="<tool_call>\n<function=get_weather>\n<parameter=city>Tokyo</parameter>\n</function>\n</tool_call>",
+            parallel_tool_calls_output="<tool_call>\n<function=get_weather>\n<parameter=city>Tokyo</parameter>\n</function>\n</tool_call><tool_call>\n<function=get_time>\n<parameter=timezone>Asia/Tokyo</parameter>\n</function>\n</tool_call>",
+            various_data_types_output=(
+                "<tool_call>\n<function=test_function>\n"
+                "<parameter=string_field>hello</parameter>\n"
+                "<parameter=int_field>42</parameter>\n"
+                "<parameter=float_field>3.14</parameter>\n"
+                "<parameter=bool_field>true</parameter>\n"
+                "<parameter=null_field>null</parameter>\n"
+                '<parameter=array_field>["a", "b", "c"]</parameter>\n'
+                '<parameter=object_field>{"nested": "value"}</parameter>\n'
+                "</function>\n</tool_call>"
+            ),
+            empty_arguments_output="<tool_call>\n<function=refresh>\n</function>\n</tool_call>",
+            surrounding_text_output=(
+                "Let me check the weather for you.\n\n"
+                "<tool_call>\n<function=get_weather>\n"
+                "<parameter=city>Tokyo</parameter>\n"
+                "</function>\n</tool_call>\n\n"
+                "I will get that information."
+            ),
+            escaped_strings_output=(
+                "<tool_call>\n<function=test_function>\n"
+                '<parameter=quoted>He said "hello"</parameter>\n'
+                "<parameter=path>C:\\Users\\file.txt</parameter>\n"
+                "<parameter=newline>line1\nline2</parameter>\n"
+                "</function>\n</tool_call>"
+            ),
+            malformed_input_outputs=[
+                "<tool_call><function=get_weather><parameter=city>",
+                "<tool_call></function></tool_call>",
+            ],
+            # Expected results
+            single_tool_call_expected_name="get_weather",
+            single_tool_call_expected_args={"city": "Tokyo"},
+            parallel_tool_calls_count=2,
+            parallel_tool_calls_names=["get_weather", "get_time"],
+            # xfail markers - Qwen3XML has systematic streaming issues
+            xfail_streaming={
+                "test_single_tool_call_simple_args": (
+                    "Qwen3XML streaming has systematic issues"
+                ),
+                "test_parallel_tool_calls": "Qwen3XML streaming has systematic issues",
+                "test_various_data_types": "Qwen3XML streaming has systematic issues",
+                "test_empty_arguments": "Qwen3XML streaming has systematic issues",
+                "test_surrounding_text": "Qwen3XML streaming has systematic issues",
+                "test_escaped_strings": "Qwen3XML streaming has systematic issues",
+                "test_malformed_input": (
+                    "Qwen3XML parser is lenient with malformed input"
+                ),
+                "test_streaming_reconstruction": (
+                    "Qwen3XML streaming reconstruction has known issues"
+                ),
+            },
+        )
diff --git a/tests/entrypoints/openai/tool_parsers/test_step3_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_step3_tool_parser.py
new file mode 100644
index 000000000000..d2ae93aeff65
--- /dev/null
+++ b/tests/entrypoints/openai/tool_parsers/test_step3_tool_parser.py
@@ -0,0 +1,111 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+import pytest
+
+from tests.entrypoints.openai.tool_parsers.common_tests import (
+    ToolParserTestConfig,
+    ToolParserTests,
+)
+from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
+
+
+class TestStep3ToolParser(ToolParserTests):
+    @pytest.fixture(scope="class")
+    def tokenizer(self) -> AnyTokenizer:
+        return get_tokenizer("stepfun-ai/step3")
+
+    @pytest.fixture
+    def test_config(self) -> ToolParserTestConfig:
+        return ToolParserTestConfig(
+            parser_name="step3",
+            # Test data
+            no_tool_calls_output="This is a regular response without any tool calls.",
+            single_tool_call_output=(
+                "<|tool_calls_begin|><|tool_call_begin|>"
+                '<steptml:invoke name="get_weather">'
+                '<steptml:parameter name="city">Tokyo</steptml:parameter>'
+                "</steptml:invoke><|tool_call_end|><|tool_calls_end|>"
+            ),
+            parallel_tool_calls_output=(
+                "<|tool_calls_begin|><|tool_call_begin|>"
+                '<steptml:invoke name="get_weather">'
+                '<steptml:parameter name="city">Tokyo</steptml:parameter>'
+                "</steptml:invoke><|tool_call_end|><|tool_sep|>"
+                '<|tool_call_begin|><steptml:invoke name="get_time">'
+                '<steptml:parameter name="timezone">Asia/Tokyo</steptml:parameter>'
+                "</steptml:invoke><|tool_call_end|><|tool_calls_end|>"
+            ),
+            various_data_types_output=(
+                "<|tool_calls_begin|><|tool_call_begin|>"
+                '<steptml:invoke name="test_function">'
+                '<steptml:parameter name="string_field">hello</steptml:parameter>'
+                '<steptml:parameter name="int_field">42</steptml:parameter>'
+                '<steptml:parameter name="float_field">3.14</steptml:parameter>'
+                '<steptml:parameter name="bool_field">true</steptml:parameter>'
+                '<steptml:parameter name="null_field">null</steptml:parameter>'
+                '<steptml:parameter name="array_field">'
+                '["a", "b", "c"]</steptml:parameter>'
+                '<steptml:parameter name="object_field">'
+                '{"nested": "value"}</steptml:parameter></steptml:invoke>'
+                "<|tool_call_end|><|tool_calls_end|>"
+            ),
+            empty_arguments_output=(
+                "<|tool_calls_begin|><|tool_call_begin|>"
+                '<steptml:invoke name="refresh"></steptml:invoke>'
+                "<|tool_call_end|><|tool_calls_end|>"
+            ),
+            surrounding_text_output=(
+                "Let me check the weather for you.\n\n"
+                "<|tool_calls_begin|><|tool_call_begin|>"
+                '<steptml:invoke name="get_weather">'
+                '<steptml:parameter name="city">Tokyo</steptml:parameter>'
+                "</steptml:invoke><|tool_call_end|><|tool_calls_end|>\n\n"
+                "I'll get that information."
+            ),
+            escaped_strings_output=(
+                "<|tool_calls_begin|><|tool_call_begin|>"
+                '<steptml:invoke name="test_function">'
+                '<steptml:parameter name="quoted">He said "hello"</steptml:parameter>'
+                '<steptml:parameter name="path">C:\\Users\\file.txt</steptml:parameter>'
+                '<steptml:parameter name="newline">line1\nline2</steptml:parameter>'
+                "</steptml:invoke><|tool_call_end|><|tool_calls_end|>"
+            ),
+            malformed_input_outputs=[
+                (
+                    "<|tool_calls_begin|><|tool_call_begin|>"
+                    '<steptml:invoke name="get_weather">'
+                ),
+                (
+                    '<|tool_call_begin|><steptml:invoke name="get_weather">'
+                    "<|tool_call_end|>"
+                ),
+            ],
+            # Expected results
+            single_tool_call_expected_name="get_weather",
+            single_tool_call_expected_args={"city": "Tokyo"},
+            parallel_tool_calls_count=2,
+            parallel_tool_calls_names=["get_weather", "get_time"],
+            # xfail markers
+            xfail_nonstreaming={
+                "test_single_tool_call_simple_args": (
+                    "Step3 parser non-streaming has bugs"
+                ),
+                "test_parallel_tool_calls": ("Step3 parser non-streaming has bugs"),
+                "test_various_data_types": "Step3 parser non-streaming has bugs",
+                "test_empty_arguments": "Step3 parser non-streaming has bugs",
+                "test_surrounding_text": "Step3 parser non-streaming has bugs",
+                "test_escaped_strings": "Step3 parser non-streaming has bugs",
+            },
+            xfail_streaming={
+                "test_parallel_tool_calls": (
+                    "Step3 parser has significant bugs in both streaming "
+                    "and non-streaming"
+                ),
+                "test_streaming_reconstruction": (
+                    "Step3 parser non-streaming has bugs, so streaming "
+                    "doesn't match non-streaming"
+                ),
+            },
+        )

From 44855807ec60687fd53d35f4d9b6b8b88051babd Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Thu, 30 Oct 2025 11:59:33 -0400
Subject: [PATCH 2/2] [CI/Build] More strict data type checking in common tool
 parser suite

This tightens up the data type checking in the common tool call parser
test suite to ensure parsers not only parse function arguments of
various data types, but also parse them into the expected Python types.

The XML-based parsers do not support parsing values into any data type
other than string, so a flag was added to control this stricter
behavior: tool call parsers that cannot parse arguments into their
non-string native Python types are excluded from the type checks.
Signed-off-by: Ben Browning
---
 .../openai/tool_parsers/common_tests.py       | 29 ++++++++++++-------
 .../tool_parsers/test_qwen3xml_tool_parser.py |  1 +
 .../tool_parsers/test_step3_tool_parser.py    |  1 +
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/tests/entrypoints/openai/tool_parsers/common_tests.py b/tests/entrypoints/openai/tool_parsers/common_tests.py
index 3bdefd867dc5..c45a19efbe51 100644
--- a/tests/entrypoints/openai/tool_parsers/common_tests.py
+++ b/tests/entrypoints/openai/tool_parsers/common_tests.py
@@ -3,6 +3,7 @@
 
 import json
 from dataclasses import dataclass, field
+from types import NoneType
 from typing import Any
 
 import pytest
@@ -47,6 +48,7 @@
     Special flags:
         allow_empty_or_json_empty_args: True if "" or "{}" both valid for empty args
+        supports_typed_arguments: True if the parser supports typed function arguments
     """
 
     # Parser identification
     parser_name: str
@@ -82,6 +84,7 @@
 
     # Special assertions for edge cases
     allow_empty_or_json_empty_args: bool = True  # "{}" or "" for empty args
+    supports_typed_arguments: bool = True
 
 
 class ToolParserTests:
@@ -232,19 +235,25 @@ def test_various_data_types(
 
         args = json.loads(tool_calls[0].function.arguments)
         # Verify all expected fields present
-        required_fields = [
-            "string_field",
-            "int_field",
-            "float_field",
-            "bool_field",
-            "null_field",
-            "array_field",
-            "object_field",
-        ]
-        for required_field in required_fields:
+        required_fields_types = {
+            "string_field": str,
+            "int_field": int,
+            "float_field": float,
+            "bool_field": bool,
+            "null_field": NoneType,
+            "array_field": list,
+            "object_field": dict,
+        }
+        for required_field, expected_type in required_fields_types.items():
             assert required_field in args, (
                 f"Expected field '{required_field}' in arguments"
             )
+            if test_config.supports_typed_arguments:
+                found_type = type(args[required_field])
+                assert found_type is expected_type, (
+                    f"Expected field '{required_field}' to have type {expected_type}, "
+                    f"got {found_type}"
+                )
 
     def test_empty_arguments(
         self,
diff --git a/tests/entrypoints/openai/tool_parsers/test_qwen3xml_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_qwen3xml_tool_parser.py
index 011df64a4733..7efa222a2031 100644
--- a/tests/entrypoints/openai/tool_parsers/test_qwen3xml_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_qwen3xml_tool_parser.py
@@ -71,4 +71,5 @@ def test_config(self) -> ToolParserTestConfig:
                     "Qwen3XML streaming reconstruction has known issues"
                 ),
             },
+            supports_typed_arguments=False,
         )
diff --git a/tests/entrypoints/openai/tool_parsers/test_step3_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_step3_tool_parser.py
index d2ae93aeff65..4a77891ac8ad 100644
--- a/tests/entrypoints/openai/tool_parsers/test_step3_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_step3_tool_parser.py
@@ -108,4 +108,5 @@ def test_config(self) -> ToolParserTestConfig:
                     "doesn't match non-streaming"
                 ),
             },
+            supports_typed_arguments=False,
         )