From ac48ccaf8b201bf02f6ec2e652fc7d46f4d4a9df Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 29 Oct 2025 15:43:54 +0100
Subject: [PATCH 1/4] Clean up entrypoint test organisation

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml                 | 36 ++++++++++---------
 tests/entrypoints/llm/test_collective_rpc.py  |  4 +--
 .../{ => openai}/test_chat_utils.py           |  4 +--
 .../{openai/tool_parsers => unit}/__init__.py |  0
 .../test_api_server_process_manager.py        |  0
 tests/entrypoints/{ => unit}/test_context.py  |  0
 .../{ => unit}/test_harmony_utils.py          |  0
 tests/entrypoints/{ => unit}/test_renderer.py |  0
 .../{ => unit}/test_ssl_cert_refresher.py     |  0
 .../entrypoints/unit/tool_parsers/__init__.py |  0
 .../{openai => unit}/tool_parsers/conftest.py |  0
 .../tool_parsers/test_hermes_tool_parser.py   |  0
 .../test_hunyuan_a13b_tool_parser.py          |  0
 .../test_llama3_json_tool_parser.py           |  0
 .../test_llama4_pythonic_tool_parser.py       |  0
 .../tool_parsers/test_olmo3_tool_parser.py    |  0
 .../tool_parsers/test_pythonic_tool_parser.py |  0
 .../{openai => unit}/tool_parsers/utils.py    |  0
 18 files changed, 24 insertions(+), 20 deletions(-)
 rename tests/entrypoints/{ => openai}/test_chat_utils.py (99%)
 rename tests/entrypoints/{openai/tool_parsers => unit}/__init__.py (100%)
 rename tests/entrypoints/{ => unit}/test_api_server_process_manager.py (100%)
 rename tests/entrypoints/{ => unit}/test_context.py (100%)
 rename tests/entrypoints/{ => unit}/test_harmony_utils.py (100%)
 rename tests/entrypoints/{ => unit}/test_renderer.py (100%)
 rename tests/entrypoints/{ => unit}/test_ssl_cert_refresher.py (100%)
 create mode 100644 tests/entrypoints/unit/tool_parsers/__init__.py
 rename tests/entrypoints/{openai => unit}/tool_parsers/conftest.py (100%)
 rename tests/entrypoints/{openai => unit}/tool_parsers/test_hermes_tool_parser.py (100%)
 rename tests/entrypoints/{openai => unit}/tool_parsers/test_hunyuan_a13b_tool_parser.py (100%)
 rename tests/entrypoints/{openai => unit}/tool_parsers/test_llama3_json_tool_parser.py (100%)
 rename tests/entrypoints/{openai => unit}/tool_parsers/test_llama4_pythonic_tool_parser.py (100%)
 rename tests/entrypoints/{openai => unit}/tool_parsers/test_olmo3_tool_parser.py (100%)
 rename tests/entrypoints/{openai => unit}/tool_parsers/test_pythonic_tool_parser.py (100%)
 rename tests/entrypoints/{openai => unit}/tool_parsers/utils.py (100%)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index e166f320f9c3..6d5fb045d8f3 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -89,14 +89,10 @@ steps:
   torch_nightly: true
   source_file_dependencies:
   - vllm/
-  - tests/basic_correctness/test_basic_correctness
-  - tests/basic_correctness/test_cpu_offload
-  - tests/basic_correctness/test_cumem.py
+  - tests/basic_correctness/
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s basic_correctness/test_cumem.py
-  - pytest -v -s basic_correctness/test_basic_correctness.py
-  - pytest -v -s basic_correctness/test_cpu_offload.py
+  - pytest -v -s basic_correctness
 
 - label: Entrypoints Unit Tests # 5min
   timeout_in_minutes: 10
@@ -104,10 +100,9 @@ steps:
   fast_check: true
   source_file_dependencies:
   - vllm/entrypoints
-  - tests/entrypoints/
+  - tests/entrypoints/unit
   commands:
-  - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/unit
 
 - label: Entrypoints Integration Test (LLM) # 30min
   timeout_in_minutes: 40
@@ -121,12 +116,12 @@ steps:
   - tests/entrypoints/offline_mode
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
-- label: Entrypoints Integration Test (API Server) # 100min
-  timeout_in_minutes: 130
+- label: Entrypoints Integration Test (API Server) %N # 50min each
+  timeout_in_minutes: 65
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -134,12 +129,21 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/entrypoints/openai
-  - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
-  - pytest -v -s entrypoints/test_chat_utils.py
+  # PYTHONPATH is needed to import custom Worker extension
+  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py \
+    --shard-id=$$BUILDKITE_PARALLEL_JOB \
+    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  - pytest -v -s entrypoints/openai \
+    --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py \
+    --ignore=entrypoints/openai/test_oot_registration.py \
+    --ignore=entrypoints/openai/test_tensorizer_entrypoint.py \
+    --ignore=entrypoints/openai/correctness/ \
+    --ignore=entrypoints/openai/test_collective_rpc.py \
+    --shard-id=$$BUILDKITE_PARALLEL_JOB \
+    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
 
 - label: Entrypoints Integration Test (Pooling)
   timeout_in_minutes: 50
diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py
index 747676ac9567..6329542cc443 100644
--- a/tests/entrypoints/llm/test_collective_rpc.py
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -8,13 +8,13 @@
 
 from ...utils import create_new_process_for_each_test
 
+pytestmark = pytest.mark.multi_gpu_test(num_gpus=2)
+
 
 @pytest.mark.parametrize("tp_size", [1, 2])
 @pytest.mark.parametrize("backend", ["mp", "ray"])
 @create_new_process_for_each_test()
 def test_collective_rpc(tp_size, backend, monkeypatch):
-    if torch.cuda.device_count() < tp_size:
-        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
     if tp_size == 1 and backend == "ray":
         pytest.skip("Skip duplicate test case")
     if tp_size == 1:
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/openai/test_chat_utils.py
similarity index 99%
rename from tests/entrypoints/test_chat_utils.py
rename to tests/entrypoints/openai/test_chat_utils.py
index ca87b3e76b3f..b48fd21eb132 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/openai/test_chat_utils.py
@@ -31,8 +31,8 @@
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 
-from ..models.registry import HF_EXAMPLE_MODELS
-from ..utils import VLLM_PATH
+from ...models.registry import HF_EXAMPLE_MODELS
+from ...utils import VLLM_PATH
 
 EXAMPLES_DIR = VLLM_PATH / "examples"
 
diff --git a/tests/entrypoints/openai/tool_parsers/__init__.py b/tests/entrypoints/unit/__init__.py
similarity index 100%
rename from tests/entrypoints/openai/tool_parsers/__init__.py
rename to tests/entrypoints/unit/__init__.py
diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/unit/test_api_server_process_manager.py
similarity index 100%
rename from tests/entrypoints/test_api_server_process_manager.py
rename to tests/entrypoints/unit/test_api_server_process_manager.py
diff --git a/tests/entrypoints/test_context.py b/tests/entrypoints/unit/test_context.py
similarity index 100%
rename from tests/entrypoints/test_context.py
rename to tests/entrypoints/unit/test_context.py
diff --git a/tests/entrypoints/test_harmony_utils.py b/tests/entrypoints/unit/test_harmony_utils.py
similarity index 100%
rename from tests/entrypoints/test_harmony_utils.py
rename to tests/entrypoints/unit/test_harmony_utils.py
diff --git a/tests/entrypoints/test_renderer.py b/tests/entrypoints/unit/test_renderer.py
similarity index 100%
rename from tests/entrypoints/test_renderer.py
rename to tests/entrypoints/unit/test_renderer.py
diff --git a/tests/entrypoints/test_ssl_cert_refresher.py b/tests/entrypoints/unit/test_ssl_cert_refresher.py
similarity index 100%
rename from tests/entrypoints/test_ssl_cert_refresher.py
rename to tests/entrypoints/unit/test_ssl_cert_refresher.py
diff --git a/tests/entrypoints/unit/tool_parsers/__init__.py b/tests/entrypoints/unit/tool_parsers/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/entrypoints/openai/tool_parsers/conftest.py b/tests/entrypoints/unit/tool_parsers/conftest.py
similarity index 100%
rename from tests/entrypoints/openai/tool_parsers/conftest.py
rename to tests/entrypoints/unit/tool_parsers/conftest.py
diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/unit/tool_parsers/test_hermes_tool_parser.py
similarity index 100%
rename from tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
rename to tests/entrypoints/unit/tool_parsers/test_hermes_tool_parser.py
diff --git a/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py b/tests/entrypoints/unit/tool_parsers/test_hunyuan_a13b_tool_parser.py
similarity index 100%
rename from tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
rename to tests/entrypoints/unit/tool_parsers/test_hunyuan_a13b_tool_parser.py
diff --git a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py b/tests/entrypoints/unit/tool_parsers/test_llama3_json_tool_parser.py
similarity index 100%
rename from tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
rename to tests/entrypoints/unit/tool_parsers/test_llama3_json_tool_parser.py
diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/unit/tool_parsers/test_llama4_pythonic_tool_parser.py
similarity index 100%
rename from tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
rename to tests/entrypoints/unit/tool_parsers/test_llama4_pythonic_tool_parser.py
diff --git a/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py b/tests/entrypoints/unit/tool_parsers/test_olmo3_tool_parser.py
similarity index 100%
rename from tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
rename to tests/entrypoints/unit/tool_parsers/test_olmo3_tool_parser.py
diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/unit/tool_parsers/test_pythonic_tool_parser.py
similarity index 100%
rename from tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
rename to tests/entrypoints/unit/tool_parsers/test_pythonic_tool_parser.py
diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/unit/tool_parsers/utils.py
similarity index 100%
rename from tests/entrypoints/openai/tool_parsers/utils.py
rename to tests/entrypoints/unit/tool_parsers/utils.py

From 04e0269c5e119b097b9493d2141a837e4c5aa955 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 29 Oct 2025 15:47:10 +0100
Subject: [PATCH 2/4] Remove unused import

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/entrypoints/llm/test_collective_rpc.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py
index 6329542cc443..d9bf99cb8d35 100644
--- a/tests/entrypoints/llm/test_collective_rpc.py
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
-import torch
 
 from vllm import LLM
 

From fe584b35a5c9d25a80214d73c12b90e3bce7b9f9 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 29 Oct 2025 16:27:30 +0100
Subject: [PATCH 3/4] Move the non-unit tests to `tool_use`

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .../tool_parsers/test_hermes_tool_parser.py   | 261 -----------------
 tests/tool_use/test_hermes_tool_parser.py     | 265 ++++++++++++++++++
 2 files changed, 265 insertions(+), 261 deletions(-)
 create mode 100644 tests/tool_use/test_hermes_tool_parser.py

diff --git a/tests/entrypoints/unit/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/unit/tool_parsers/test_hermes_tool_parser.py
index 38008dafe32b..14c9db4adbae 100644
--- a/tests/entrypoints/unit/tool_parsers/test_hermes_tool_parser.py
+++ b/tests/entrypoints/unit/tool_parsers/test_hermes_tool_parser.py
@@ -1,273 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import json
-
 import pytest
 
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
-from ....utils import RemoteOpenAIServer
-
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
-LORA_MODEL = "minpeter/LoRA-Llama-3.2-1B-tool-vllm-ci"
-
-SERVER_ARGS = [
-    "--enforce-eager",
-    "--enable-auto-tool-choice",
-    "--tool-call-parser",
-    "hermes",
-    "--enable-lora",
-    "--lora-modules",
-    f"{LORA_MODEL}={LORA_MODEL}",
-    "--tokenizer",
-    f"{LORA_MODEL}",
-]
-
-TOOLS = [
-    {
-        "type": "function",
-        "function": {
-            "name": "get_current_weather",
-            "description": "Get the current weather in a given location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {
-                        "type": "string",
-                        "description": "The city and state, e.g. San Francisco, CA",
-                    },
-                    "unit": {
-                        "type": "string",
-                        "enum": ["celsius", "fahrenheit"],
-                    },
-                },
-                "required": ["location"],
-            },
-        },
-    }
-]
-
-PRODUCT_TOOLS = [
-    {
-        "type": "function",
-        "function": {
-            "name": "get_product_info",
-            "description": "Get detailed information of a product based on its "
-            "product ID.",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "inserted": {
-                        "type": "boolean",
-                        "description": "inserted.",
-                    },
-                    "product_id": {
-                        "type": "integer",
-                        "description": "The product ID of the product.",
-                    },
-                },
-                "required": ["product_id", "inserted"],
-            },
-        },
-    }
-]
-
-MESSAGES = [{"role": "user", "content": "What's the weather like in Boston?"}]
-
-PRODUCT_MESSAGES = [
-    {
-        "role": "user",
-        "content": "Hi! Do you have any detailed information about the product id "
-        "7355608 and inserted true?",
-    }
-]
-
-
-@pytest.mark.asyncio
-async def test_non_streaming_tool_call():
-    """Test tool call in non-streaming mode."""
-    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
-        client = server.get_async_client()
-
-        response = await client.chat.completions.create(
-            model=LORA_MODEL,
-            messages=MESSAGES,
-            tools=TOOLS,
-            tool_choice="auto",
-            temperature=0.0,
-        )
-
-        assert response.choices
-        choice = response.choices[0]
-        message = choice.message
-
-        assert choice.finish_reason == "tool_calls"
-        assert message.tool_calls is not None
-
-        tool_call = message.tool_calls[0]
-        assert tool_call.type == "function"
-        assert tool_call.function.name == "get_current_weather"
-
-        arguments = json.loads(tool_call.function.arguments)
-        assert "location" in arguments
-        assert "Boston" in arguments["location"]
-        print("\n[Non-Streaming Test Passed]")
-        print(f"Tool Call: {tool_call.function.name}")
-        print(f"Arguments: {arguments}")
-
-
-@pytest.mark.asyncio
-async def test_streaming_tool_call():
-    """Test tool call in streaming mode."""
-    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
-        client = server.get_async_client()
-
-        stream = await client.chat.completions.create(
-            model=LORA_MODEL,
-            messages=MESSAGES,
-            tools=TOOLS,
-            tool_choice="auto",
-            temperature=0.0,
-            stream=True,
-        )
-
-        tool_call_chunks = {}
-        async for chunk in stream:
-            if not chunk.choices:
-                continue
-
-            delta = chunk.choices[0].delta
-            if not delta or not delta.tool_calls:
-                continue
-
-            for tool_chunk in delta.tool_calls:
-                index = tool_chunk.index
-                if index not in tool_call_chunks:
-                    tool_call_chunks[index] = {"name": "", "arguments": ""}
-
-                if tool_chunk.function.name:
-                    tool_call_chunks[index]["name"] += tool_chunk.function.name
-                if tool_chunk.function.arguments:
-                    tool_call_chunks[index]["arguments"] += (
-                        tool_chunk.function.arguments
-                    )
-
-        assert len(tool_call_chunks) == 1
-        reconstructed_tool_call = tool_call_chunks[0]
-
-        assert reconstructed_tool_call["name"] == "get_current_weather"
-
-        arguments = json.loads(reconstructed_tool_call["arguments"])
-        assert "location" in arguments
-        assert "Boston" in arguments["location"]
-        print("\n[Streaming Test Passed]")
-        print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
-        print(f"Reconstructed Arguments: {arguments}")
-
-
-@pytest.mark.asyncio
-async def test_non_streaming_product_tool_call():
-    """Test tool call integer and boolean parameters in non-streaming mode."""
-    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
-        client = server.get_async_client()
-
-        response = await client.chat.completions.create(
-            model=LORA_MODEL,
-            messages=PRODUCT_MESSAGES,
-            tools=PRODUCT_TOOLS,
-            tool_choice="auto",
-            temperature=0.66,
-        )
-
-        assert response.choices
-        choice = response.choices[0]
-        message = choice.message
-
-        assert choice.finish_reason == "tool_calls"
-        assert message.tool_calls is not None
-
-        tool_call = message.tool_calls[0]
-        assert tool_call.type == "function"
-        assert tool_call.function.name == "get_product_info"
-
-        arguments = json.loads(tool_call.function.arguments)
-        assert "product_id" in arguments
-        assert "inserted" in arguments
-
-        product_id = arguments.get("product_id")
-        inserted = arguments.get("inserted")
-
-        assert isinstance(product_id, int)
-        assert product_id == 7355608
-        assert isinstance(inserted, bool)
-        assert inserted is True
-
-        print("\n[Non-Streaming Product Test Passed]")
-        print(f"Tool Call: {tool_call.function.name}")
-        print(f"Arguments: {arguments}")
-
-
-@pytest.mark.asyncio
-async def test_streaming_product_tool_call():
-    """Test tool call integer and boolean parameters in streaming mode."""
-    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
-        client = server.get_async_client()
-
-        stream = await client.chat.completions.create(
-            model=LORA_MODEL,
-            messages=PRODUCT_MESSAGES,
-            tools=PRODUCT_TOOLS,
-            tool_choice="auto",
-            temperature=0.66,
-            stream=True,
-        )
-
-        tool_call_chunks = {}
-        async for chunk in stream:
-            if not chunk.choices:
-                continue
-
-            delta = chunk.choices[0].delta
-            if not delta or not delta.tool_calls:
-                continue
-
-            for tool_chunk in delta.tool_calls:
-                index = tool_chunk.index
-                if index not in tool_call_chunks:
-                    tool_call_chunks[index] = {"name": "", "arguments": ""}
-
-                if tool_chunk.function.name:
-                    tool_call_chunks[index]["name"] += tool_chunk.function.name
-                if tool_chunk.function.arguments:
-                    tool_call_chunks[index]["arguments"] += (
-                        tool_chunk.function.arguments
-                    )
-
-        assert len(tool_call_chunks) == 1
-        reconstructed_tool_call = tool_call_chunks[0]
-
-        assert reconstructed_tool_call["name"] == "get_product_info"
-
-        arguments = json.loads(reconstructed_tool_call["arguments"])
-        assert "product_id" in arguments
-        assert "inserted" in arguments
-
-        # Handle type coercion for streaming test as well
-        product_id = arguments.get("product_id")
-        inserted = arguments.get("inserted")
-
-        assert isinstance(product_id, int)
-        assert product_id == 7355608
-        assert isinstance(inserted, bool)
-        assert inserted is True
-
-        print("\n[Streaming Product Test Passed]")
-        print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
-        print(f"Reconstructed Arguments: {arguments}")
-
 
 @pytest.fixture
 def qwen_tokenizer() -> AnyTokenizer:
diff --git a/tests/tool_use/test_hermes_tool_parser.py b/tests/tool_use/test_hermes_tool_parser.py
new file mode 100644
index 000000000000..af103c7d06f6
--- /dev/null
+++ b/tests/tool_use/test_hermes_tool_parser.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import pytest
+
+from .utils import RemoteOpenAIServer
+
+MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+LORA_MODEL = "minpeter/LoRA-Llama-3.2-1B-tool-vllm-ci"
+
+SERVER_ARGS = [
+    "--enforce-eager",
+    "--enable-auto-tool-choice",
+    "--tool-call-parser",
+    "hermes",
+    "--enable-lora",
+    "--lora-modules",
+    f"{LORA_MODEL}={LORA_MODEL}",
+    "--tokenizer",
+    f"{LORA_MODEL}",
+]
+
+TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
+                },
+                "required": ["location"],
+            },
+        },
+    }
+]
+
+PRODUCT_TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_product_info",
+            "description": "Get detailed information of a product based on its "
+            "product ID.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "inserted": {
+                        "type": "boolean",
+                        "description": "inserted.",
+                    },
+                    "product_id": {
+                        "type": "integer",
+                        "description": "The product ID of the product.",
+                    },
+                },
+                "required": ["product_id", "inserted"],
+            },
+        },
+    }
+]
+
+MESSAGES = [{"role": "user", "content": "What's the weather like in Boston?"}]
+
+PRODUCT_MESSAGES = [
+    {
+        "role": "user",
+        "content": "Hi! Do you have any detailed information about the product id "
+        "7355608 and inserted true?",
+    }
+]
+
+
+@pytest.mark.asyncio
+async def test_non_streaming_tool_call():
+    """Test tool call in non-streaming mode."""
+    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
+        client = server.get_async_client()
+
+        response = await client.chat.completions.create(
+            model=LORA_MODEL,
+            messages=MESSAGES,
+            tools=TOOLS,
+            tool_choice="auto",
+            temperature=0.0,
+        )
+
+        assert response.choices
+        choice = response.choices[0]
+        message = choice.message
+
+        assert choice.finish_reason == "tool_calls"
+        assert message.tool_calls is not None
+
+        tool_call = message.tool_calls[0]
+        assert tool_call.type == "function"
+        assert tool_call.function.name == "get_current_weather"
+
+        arguments = json.loads(tool_call.function.arguments)
+        assert "location" in arguments
+        assert "Boston" in arguments["location"]
+        print("\n[Non-Streaming Test Passed]")
+        print(f"Tool Call: {tool_call.function.name}")
+        print(f"Arguments: {arguments}")
+
+
+@pytest.mark.asyncio
+async def test_streaming_tool_call():
+    """Test tool call in streaming mode."""
+    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
+        client = server.get_async_client()
+
+        stream = await client.chat.completions.create(
+            model=LORA_MODEL,
+            messages=MESSAGES,
+            tools=TOOLS,
+            tool_choice="auto",
+            temperature=0.0,
+            stream=True,
+        )
+
+        tool_call_chunks = {}
+        async for chunk in stream:
+            if not chunk.choices:
+                continue
+
+            delta = chunk.choices[0].delta
+            if not delta or not delta.tool_calls:
+                continue
+
+            for tool_chunk in delta.tool_calls:
+                index = tool_chunk.index
+                if index not in tool_call_chunks:
+                    tool_call_chunks[index] = {"name": "", "arguments": ""}
+
+                if tool_chunk.function.name:
+                    tool_call_chunks[index]["name"] += tool_chunk.function.name
+                if tool_chunk.function.arguments:
+                    tool_call_chunks[index]["arguments"] += (
+                        tool_chunk.function.arguments
+                    )
+
+        assert len(tool_call_chunks) == 1
+        reconstructed_tool_call = tool_call_chunks[0]
+
+        assert reconstructed_tool_call["name"] == "get_current_weather"
+
+        arguments = json.loads(reconstructed_tool_call["arguments"])
+        assert "location" in arguments
+        assert "Boston" in arguments["location"]
+        print("\n[Streaming Test Passed]")
+        print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
+        print(f"Reconstructed Arguments: {arguments}")
+
+
+@pytest.mark.asyncio
+async def test_non_streaming_product_tool_call():
+    """Test tool call integer and boolean parameters in non-streaming mode."""
+    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
+        client = server.get_async_client()
+
+        response = await client.chat.completions.create(
+            model=LORA_MODEL,
+            messages=PRODUCT_MESSAGES,
+            tools=PRODUCT_TOOLS,
+            tool_choice="auto",
+            temperature=0.66,
+        )
+
+        assert response.choices
+        choice = response.choices[0]
+        message = choice.message
+
+        assert choice.finish_reason == "tool_calls"
+        assert message.tool_calls is not None
+
+        tool_call = message.tool_calls[0]
+        assert tool_call.type == "function"
+        assert tool_call.function.name == "get_product_info"
+
+        arguments = json.loads(tool_call.function.arguments)
+        assert "product_id" in arguments
+        assert "inserted" in arguments
+
+        product_id = arguments.get("product_id")
+        inserted = arguments.get("inserted")
+
+        assert isinstance(product_id, int)
+        assert product_id == 7355608
+        assert isinstance(inserted, bool)
+        assert inserted is True
+
+        print("\n[Non-Streaming Product Test Passed]")
+        print(f"Tool Call: {tool_call.function.name}")
+        print(f"Arguments: {arguments}")
+
+
+@pytest.mark.asyncio
+async def test_streaming_product_tool_call():
+    """Test tool call integer and boolean parameters in streaming mode."""
+    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
+        client = server.get_async_client()
+
+        stream = await client.chat.completions.create(
+            model=LORA_MODEL,
+            messages=PRODUCT_MESSAGES,
+            tools=PRODUCT_TOOLS,
+            tool_choice="auto",
+            temperature=0.66,
+            stream=True,
+        )
+
+        tool_call_chunks = {}
+        async for chunk in stream:
+            if not chunk.choices:
+                continue
+
+            delta = chunk.choices[0].delta
+            if not delta or not delta.tool_calls:
+                continue
+
+            for tool_chunk in delta.tool_calls:
+                index = tool_chunk.index
+                if index not in tool_call_chunks:
+                    tool_call_chunks[index] = {"name": "", "arguments": ""}
+
+                if tool_chunk.function.name:
+                    tool_call_chunks[index]["name"] += tool_chunk.function.name
+                if tool_chunk.function.arguments:
+                    tool_call_chunks[index]["arguments"] += (
+                        tool_chunk.function.arguments
+                    )
+
+        assert len(tool_call_chunks) == 1
+        reconstructed_tool_call = tool_call_chunks[0]
+
+        assert reconstructed_tool_call["name"] == "get_product_info"
+
+        arguments = json.loads(reconstructed_tool_call["arguments"])
+        assert "product_id" in arguments
+        assert "inserted" in arguments
+
+        # Handle type coercion for streaming test as well
+        product_id = arguments.get("product_id")
+        inserted = arguments.get("inserted")
+
+        assert isinstance(product_id, int)
+        assert product_id == 7355608
+        assert isinstance(inserted, bool)
+        assert inserted is True
+
+        print("\n[Streaming Product Test Passed]")
+        print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
+        print(f"Reconstructed Arguments: {arguments}")

From 183ff2eda1ff274a21580d767f3e40e39be91407 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 29 Oct 2025 16:42:50 +0100
Subject: [PATCH 4/4] Import from the correct utils

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/tool_use/test_hermes_tool_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/tool_use/test_hermes_tool_parser.py b/tests/tool_use/test_hermes_tool_parser.py
index af103c7d06f6..e396ab5d8dbb 100644
--- a/tests/tool_use/test_hermes_tool_parser.py
+++ b/tests/tool_use/test_hermes_tool_parser.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-from .utils import RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
 
 MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
 LORA_MODEL = "minpeter/LoRA-Llama-3.2-1B-tool-vllm-ci"