After switching the benchmarking model from Qwen 3 to gpt-oss 120b, the LLM generates invalid JSON in its tool-call arguments (e.g., a missing quote before an object key), causing Ollama to fail with a parsing error before the tool call ever reaches the agent. This is likely the same class of issue as #348, where models hallucinate malformed tool calls. Qwen 3 did not exhibit this problem.
Traceback (most recent call last):
File "/root/archi/src/bin/service_benchmark.py", line 635, in <module>
benchmarker.run()
File "/root/archi/src/bin/service_benchmark.py", line 481, in run
result = self.chain(history=formatted_question)
File "/usr/local/lib/python3.10/site-packages/src/archi/archi.py", line 114, in __call__
return self.invoke(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/src/archi/archi.py", line 90, in invoke
result = self.pipeline.invoke(*args, **call_kwargs)
File "/usr/local/lib/python3.10/site-packages/src/archi/pipelines/agents/base_react.py", line 261, in invoke
answer_output = self.agent.invoke(agent_inputs, {"recursion_limit": recursion_limit})
File "/usr/local/lib/python3.10/site-packages/langgraph/pregel/main.py", line 3094, in invoke
for chunk in self.stream(
File "/usr/local/lib/python3.10/site-packages/langgraph/pregel/main.py", line 2679, in stream
for _ in runner.tick(
File "/usr/local/lib/python3.10/site-packages/langgraph/pregel/_runner.py", line 167, in tick
run_with_retry(
File "/usr/local/lib/python3.10/site-packages/langgraph/pregel/_retry.py", line 42, in run_with_retry
return task.proc.invoke(task.input, config)
File "/usr/local/lib/python3.10/site-packages/langgraph/_internal/_runnable.py", line 656, in invoke
input = context.run(step.invoke, input, config, **kwargs)
File "/usr/local/lib/python3.10/site-packages/langgraph/_internal/_runnable.py", line 400, in invoke
ret = self.func(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/langchain/agents/factory.py", line 1065, in model_node
response = _execute_model_sync(request)
File "/usr/local/lib/python3.10/site-packages/langchain/agents/factory.py", line 1038, in _execute_model_sync
output = model_.invoke(messages)
File "/usr/local/lib/python3.10/site-packages/langchain_core/runnables/base.py", line 5695, in invoke
return self.bound.invoke(
File "/usr/local/lib/python3.10/site-packages/langchain_core/language_models/chat_models.py", line 402, in invoke
self.generate_prompt(
File "/usr/local/lib/python3.10/site-packages/langchain_core/language_models/chat_models.py", line 1121, in generate_prompt
return self.generate(prompt_messages, stop=stop, callbacks=callbacks, **kwargs)
File "/usr/local/lib/python3.10/site-packages/langchain_core/language_models/chat_models.py", line 931, in generate
self._generate_with_cache(
File "/usr/local/lib/python3.10/site-packages/langchain_core/language_models/chat_models.py", line 1233, in _generate_with_cache
result = self._generate(
File "/usr/local/lib/python3.10/site-packages/langchain_ollama/chat_models.py", line 1025, in _generate
final_chunk = self._chat_stream_with_aggregation(
File "/usr/local/lib/python3.10/site-packages/langchain_ollama/chat_models.py", line 960, in _chat_stream_with_aggregation
for chunk in self._iterate_over_stream(messages, stop, **kwargs):
File "/usr/local/lib/python3.10/site-packages/langchain_ollama/chat_models.py", line 1049, in _iterate_over_stream
for stream_resp in self._create_chat_stream(messages, stop, **kwargs):
File "/usr/local/lib/python3.10/site-packages/langchain_ollama/chat_models.py", line 947, in _create_chat_stream
yield from self._client.chat(**chat_params)
File "/usr/local/lib/python3.10/site-packages/ollama/_client.py", line 184, in inner
raise ResponseError(err)
ollama._types.ResponseError: error parsing tool call: raw='{"after":0,"before":0,"case_sensitive":false,"max_matches_per_file":20,"max_results_override":1,"query":"Resolution",regex":false}', err=invalid character 'r' looking for beginning of object key string (status code: -1)
In summary: gpt-oss 120b intermittently emits malformed tool-call JSON (see the `raw=` payload above, where the `regex` key is missing its opening quote), and Ollama raises a `ResponseError` instead of surfacing a recoverable tool-call failure. This appears to be the same class of issue as #348; the previous benchmarking model (Qwen 3) did not exhibit it.