From 9b5a034837fd63f2b036cd0bb8a8e0ee9ce58ecf Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 2 Mar 2026 03:02:47 +0000 Subject: [PATCH 1/2] Initial plan From 746cc97ae57ece197febf8dcd6f55deb8dc4e74f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 2 Mar 2026 03:11:29 +0000 Subject: [PATCH 2/2] Add llamacpp support with LlamaCpp model and from_llamacpp factory Co-authored-by: Ki-Seki <60967965+Ki-Seki@users.noreply.github.com> --- pyproject.toml | 3 ++ src/gimkit/__init__.py | 3 +- src/gimkit/models/__init__.py | 3 +- src/gimkit/models/llamacpp.py | 58 +++++++++++++++++++++++ tests/models/test_llamacpp.py | 89 +++++++++++++++++++++++++++++++++++ uv.lock | 18 ++++++- 6 files changed, 171 insertions(+), 3 deletions(-) create mode 100644 src/gimkit/models/llamacpp.py create mode 100644 tests/models/test_llamacpp.py diff --git a/pyproject.toml b/pyproject.toml index f32cb17..90f3bc9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,9 @@ dependencies = [ ] [project.optional-dependencies] +llamacpp = [ + "llama-cpp-python>=0.3.0", +] vllm = [ "vllm>=0.14.0", ] diff --git a/src/gimkit/__init__.py b/src/gimkit/__init__.py index 37e53fe..505d1fb 100644 --- a/src/gimkit/__init__.py +++ b/src/gimkit/__init__.py @@ -1,7 +1,7 @@ from importlib.metadata import PackageNotFoundError, version from gimkit.guides import guide -from gimkit.models import from_openai, from_vllm, from_vllm_offline +from gimkit.models import from_llamacpp, from_openai, from_vllm, from_vllm_offline try: @@ -11,6 +11,7 @@ __all__ = [ + "from_llamacpp", "from_openai", "from_vllm", "from_vllm_offline", diff --git a/src/gimkit/models/__init__.py b/src/gimkit/models/__init__.py index 26d35ce..31daa9b 100644 --- a/src/gimkit/models/__init__.py +++ b/src/gimkit/models/__init__.py @@ -1,6 +1,7 @@ +from .llamacpp import from_llamacpp from .openai import from_openai from .vllm import from_vllm from .vllm_offline import from_vllm_offline -__all__ = ["from_openai", "from_vllm", "from_vllm_offline"] +__all__ = ["from_llamacpp", "from_openai", "from_vllm", "from_vllm_offline"] diff --git a/src/gimkit/models/llamacpp.py b/src/gimkit/models/llamacpp.py new file mode 100644 index 0000000..3b39ee2 --- /dev/null +++ b/src/gimkit/models/llamacpp.py @@ -0,0 +1,58 @@ +# Adapted from https://github.com/dottxt-ai/outlines/blob/main/outlines/models/llamacpp.py + + +from typing import TYPE_CHECKING, Any, Literal, cast + +from outlines.generator import Generator +from outlines.models.llamacpp import LlamaCpp as OutlinesLlamaCpp + +from gimkit.contexts import Query, Result +from gimkit.log import get_logger +from gimkit.models.utils import get_outlines_model_input, get_outlines_output_type, infill_responses +from gimkit.schemas import RESPONSE_SUFFIX, ContextInput + + +logger = get_logger(__name__) + +if TYPE_CHECKING: + from llama_cpp import Llama + + +class LlamaCpp(OutlinesLlamaCpp): + def __call__( + self, + model_input: ContextInput | Query, + output_type: Literal["cfg", "json"] | None = "cfg", + backend: str | None = None, + use_gim_prompt: bool = False, + **inference_kwargs: Any, + ) -> Result | list[Result]: + # Using `stop=RESPONSE_SUFFIX` is preferred for two reasons: + # 1. The model might not be trained well enough to generate EOS tokens immediately after RESPONSE_SUFFIX. + # 2. Even with CFG, inference engines may not guarantee termination when the CFG is satisfied. 
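+        # Merge RESPONSE_SUFFIX into the caller's `stop` sequences before generating (see _ensure_response_suffix below).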
+        inference_kwargs = self._ensure_response_suffix(inference_kwargs)
+
+        outlines_model_input = get_outlines_model_input(model_input, output_type, use_gim_prompt)
+        outlines_output_type = get_outlines_output_type(model_input, output_type)
+        generator = Generator(self, outlines_output_type, backend)
+        raw_responses = generator(outlines_model_input, **inference_kwargs)
+        logger.debug(f"Raw responses of {self}: {raw_responses}")
+        return infill_responses(
+            model_input,
+            cast("str | list[str]", raw_responses),
+            json_responses=(output_type == "json"),
+        )
+
+    def _ensure_response_suffix(self, inference_kwargs: dict[str, Any]) -> dict[str, Any]:
+        stop = inference_kwargs.get("stop")
+        if stop is None:
+            inference_kwargs["stop"] = [RESPONSE_SUFFIX]
+        elif isinstance(stop, list) and RESPONSE_SUFFIX not in stop:
+            inference_kwargs["stop"] = [*stop, RESPONSE_SUFFIX]
+        elif isinstance(stop, str) and stop != RESPONSE_SUFFIX:
+            inference_kwargs["stop"] = [stop, RESPONSE_SUFFIX]
+        return inference_kwargs
+
+
+def from_llamacpp(model: "Llama") -> LlamaCpp:
+    return LlamaCpp(model)
diff --git a/tests/models/test_llamacpp.py b/tests/models/test_llamacpp.py
new file mode 100644
index 0000000..6f69040
--- /dev/null
+++ b/tests/models/test_llamacpp.py
@@ -0,0 +1,89 @@
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from outlines.models.llamacpp import LlamaCpp as OutlinesLlamaCpp
+
+from gimkit.contexts import Result
+from gimkit.models.llamacpp import LlamaCpp as GIMLlamaCpp
+from gimkit.models.llamacpp import from_llamacpp
+from gimkit.schemas import RESPONSE_SUFFIX, MaskedTag
+
+
+@pytest.fixture(autouse=True)
+def patch_tokenizer():
+    """Patch LlamaCppTokenizer so tests run without llama-cpp-python installed."""
+    with patch("outlines.models.llamacpp.LlamaCppTokenizer"):
+        yield
+
+
+def test_from_llamacpp():
+    mock_llama = MagicMock()
+    model = from_llamacpp(mock_llama)
+    assert type(model) is GIMLlamaCpp
+    assert type(model) is not OutlinesLlamaCpp
+    assert model.model is mock_llama
+
+
+def test_llamacpp_call():
+    mock_llama = MagicMock()
+    model = from_llamacpp(mock_llama)
+
+    with patch("gimkit.models.llamacpp.Generator") as mock_generator:
+        generator_instance = MagicMock()
+        generator_instance.return_value = '<|MASKED id="m_0"|>hi<|/MASKED|>'
+        mock_generator.return_value = generator_instance
+
+        returned = model(MaskedTag())
+        assert isinstance(returned, Result)
+        assert returned.tags[0].content == "hi"
+
+        # Verify RESPONSE_SUFFIX is added to stop
+        call_kwargs = generator_instance.call_args[1]
+        assert RESPONSE_SUFFIX in call_kwargs["stop"]
+
+
+def test_llamacpp_call_invalid_response():
+    mock_llama = MagicMock()
+    model = from_llamacpp(mock_llama)
+
+    with patch("gimkit.models.llamacpp.Generator") as mock_generator:
+        generator_instance = MagicMock()
+        generator_instance.return_value = set()
+        mock_generator.return_value = generator_instance
+        with pytest.raises(TypeError, match="Expected responses to be str or list of str, got"):
+            model(MaskedTag())
+
+    with patch("gimkit.models.llamacpp.Generator") as mock_generator:
+        generator_instance = MagicMock()
+        generator_instance.return_value = []
+        mock_generator.return_value = generator_instance
+        with pytest.raises(ValueError, match="Response list is empty"):
+            model(MaskedTag())
+
+
+def test_ensure_response_suffix():
+    mock_llama = MagicMock()
+    model = from_llamacpp(mock_llama)
+
+    # No stop provided — should add RESPONSE_SUFFIX
+    kwargs = model._ensure_response_suffix({})
+    assert kwargs["stop"] == [RESPONSE_SUFFIX]
+
+    # stop is a list without RESPONSE_SUFFIX — should append it
+    kwargs = model._ensure_response_suffix({"stop": ["other"]})
+    assert RESPONSE_SUFFIX in kwargs["stop"]
+    assert "other" in kwargs["stop"]
+
+    # stop is a list already containing RESPONSE_SUFFIX — unchanged
+    kwargs = model._ensure_response_suffix({"stop": [RESPONSE_SUFFIX]})
+    assert kwargs["stop"] == [RESPONSE_SUFFIX]
+
+    # stop is a string different from RESPONSE_SUFFIX — should wrap both
+    kwargs = model._ensure_response_suffix({"stop": "other"})
+    assert RESPONSE_SUFFIX in kwargs["stop"]
+    assert "other" in kwargs["stop"]
+
+    # stop is already RESPONSE_SUFFIX string — unchanged
+    kwargs = model._ensure_response_suffix({"stop": RESPONSE_SUFFIX})
+    assert kwargs["stop"] == RESPONSE_SUFFIX
diff --git a/uv.lock b/uv.lock
index e30cf5d..15ee81c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1485,6 +1485,9 @@ dependencies = [
 ]
 
 [package.optional-dependencies]
+llamacpp = [
+    { name = "llama-cpp-python" },
+]
 vllm = [
     { name = "vllm" },
 ]
@@ -1503,11 +1506,12 @@ dev = [
 [package.metadata]
 requires-dist = [
     { name = "json-repair", specifier = ">=0.55.1" },
+    { name = "llama-cpp-python", marker = "extra == 'llamacpp'", specifier = ">=0.3.0" },
     { name = "llguidance", specifier = ">=1.3.0" },
     { name = "outlines", extras = ["openai"], specifier = ">=1.2.9" },
     { name = "vllm", marker = "extra == 'vllm'", specifier = ">=0.14.0" },
 ]
-provides-extras = ["vllm"]
+provides-extras = ["llamacpp", "vllm"]
 
 [package.metadata.requires-dev]
 dev = [
@@ -2235,6 +2239,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fc/85/69f92b2a7b3c0f88ffe107c86b952b397004b5b8ea5a81da3d9c04c04422/librt-0.7.8-cp314-cp314t-win_arm64.whl", hash = "sha256:8766ece9de08527deabcd7cb1b4f1a967a385d26e33e536d6d8913db6ef74f06", size = 40550, upload-time = "2026-01-14T12:56:01.542Z" },
 ]
 
+[[package]]
+name = "llama-cpp-python"
+version = "0.3.16"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "diskcache" },
+    { name = "jinja2" },
+    { name = "numpy" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e4/b4/c8cd17629ced0b9644a71d399a91145aedef109c0333443bef015e45b704/llama_cpp_python-0.3.16.tar.gz", hash = "sha256:34ed0f9bd9431af045bb63d9324ae620ad0536653740e9bb163a2e1fcb973be6", size = 50688636, upload-time = "2025-08-15T04:58:29.212Z" }
+
 [[package]]
 name = "llguidance"
 version = "1.3.0"
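
Usage note (not part of the patch): a minimal sketch of how the new factory might be used, assuming llama-cpp-python is installed via the new gimkit[llamacpp] extra; the model path, n_ctx, and max_tokens values below are placeholders, and the gimkit calls mirror what the tests above exercise.

from llama_cpp import Llama

from gimkit import from_llamacpp
from gimkit.schemas import MaskedTag

# Load a local GGUF model with llama.cpp (the path is a placeholder).
llama = Llama(model_path="/path/to/model.gguf", n_ctx=4096)

# Wrap it so gimkit can drive constrained generation through outlines.
model = from_llamacpp(llama)

# Fill a single masked tag; extra kwargs are forwarded to llama.cpp, and
# RESPONSE_SUFFIX is merged into `stop` by _ensure_response_suffix.
result = model(MaskedTag(), max_tokens=64)
print(result.tags[0].content)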