From 6dd03d7831d96de97e6401f141c2953bc3f0ee26 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Wed, 8 Oct 2025 16:27:27 +0200
Subject: [PATCH 01/35] use claude 4.5

---
 cog_safe_push/ai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index db4ec3b..e559f1f 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -73,7 +73,7 @@ async def call(
     if not api_key:
         raise ArgumentError("ANTHROPIC_API_KEY is not defined")
 
-    model = "claude-sonnet-4-20250514"
+    model = "claude-sonnet-4-5"
     client = anthropic.AsyncAnthropic(api_key=api_key)
 
     try:

From 872c500f4d9396dfa5efa3fbaafad59edb5c5ccf Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Wed, 8 Oct 2025 16:27:34 +0200
Subject: [PATCH 02/35] stricter boolean prompt

---
 cog_safe_push/ai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index e559f1f..c78ff3c 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -38,7 +38,7 @@ async def wrapper_retry(*args, **kwargs):
 async def boolean(
     prompt: str, files: list[Path] | None = None, include_file_metadata: bool = False
 ) -> bool:
-    system_prompt = "You only answer YES or NO, and absolutely nothing else. Your response will be used in a programmatic context so it's important that you only ever answer with either the string YES or the string NO."
+    system_prompt = "You are a boolean classifier. You must only respond with either YES or NO, and absolutely nothing else. Your response will be used in a programmatic context so it is critical that you only ever answer with either the string YES or the string NO."
     # system_prompt = "You are a helpful assistant"
     output = await call(
         system_prompt=system_prompt,

From f1073c6fda7a37e2ed22697ed057ca2a34546897 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Fri, 10 Oct 2025 13:36:11 -0500
Subject: [PATCH 03/35] fix bool prompt

---
 cog_safe_push/ai.py                            | 1 -
 cog_safe_push/match_outputs.py                 | 8 +++++++-
 integration-test/test_output_matches_prompt.py | 4 ++--
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index c78ff3c..7345da0 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -39,7 +39,6 @@ async def boolean(
     prompt: str, files: list[Path] | None = None, include_file_metadata: bool = False
 ) -> bool:
     system_prompt = "You are a boolean classifier. You must only respond with either YES or NO, and absolutely nothing else. Your response will be used in a programmatic context so it is critical that you only ever answer with either the string YES or the string NO."
-    # system_prompt = "You are a helpful assistant"
     output = await call(
         system_prompt=system_prompt,
         prompt=prompt.strip(),
diff --git a/cog_safe_push/match_outputs.py b/cog_safe_push/match_outputs.py
index 09edce2..f7508a4 100644
--- a/cog_safe_push/match_outputs.py
+++ b/cog_safe_push/match_outputs.py
@@ -22,12 +22,18 @@ async def output_matches_prompt(output: Any, prompt: str) -> tuple[bool, str]:
         urls = output if isinstance(output, list) else list(output.values())
 
     with download_many(urls) as tmp_files:
-        claude_prompt = """You are part of an automatic evaluation that compares media (text, audio, image, video, etc.) to captions. I want to know if the caption matches the text or file..
+        claude_prompt = """You are part of an automatic evaluation that compares media (text, audio, image, video, etc.) to descriptions. I want to know if the description matches the text or file..
 
 """
         if urls:
             claude_prompt += f"""Does this file(s) and the attached content of the file(s) match the description? Pay close attention to the metadata about the attached files which is included below, especially if the description mentions file type, image dimensions, or any other aspect that is described in the metadata. Do not infer file type or image dimensions from the image content, but from the attached metadata.
 
+The description may be specific or vague, but you should match on whatever is in the description. For example:
+* If the description is 'a jpg image' and it's a jpg image of a cat, that's still a match.
+* If the description is 'an image of a cat' and the image is actually of a dog, it's not a match.
+* If the description is 'an audio file' it should match any audio files regardless of content.
+* etc.
+
 Description to evaluate: {prompt}
 
 Filename(s): {output}"""
diff --git a/integration-test/test_output_matches_prompt.py b/integration-test/test_output_matches_prompt.py
index 749124f..92e28d3 100644
--- a/integration-test/test_output_matches_prompt.py
+++ b/integration-test/test_output_matches_prompt.py
@@ -13,14 +13,14 @@
         "A webp image of a bird",
         "A webp image of a red bird",
     ],
-    "https://replicate.delivery/czjl/QFrZ9RF8VroFM5Ml9MKt3rm0vP8ZHTWaqfO1oT6bouj0m76JA/tmpn888w5a8.jpg": [
+    "https://replicate.delivery/xezq/7mpYHTkoCW5hFhYeyZHhc8pbNGjpZrSVypReBr4JsbXeLd7qA/tmpqsp1ykrz.jpg": [
         "A jpg image of a formula one car",
         "a jpg image of a car",
         "A jpg image",
         "Formula 1 car",
         "car",
     ],
-    "https://replicate.delivery/czjl/8C4OJCR6w7rQEFeernSerHH5e3xe2f9cYYsGTW8k5Eob57d9E/tmpjwitpu7f.png": [
+    "https://replicate.delivery/xezq/Gf7onwGGPDzgVaiQfARPWEOJZGzq94QKS3qsqlRfL9xUfi1VB/tmppfxwubub.png": [
         "480x320px png image",
         "480x320px image of a formula one car",
     ],

From 7f3b04ab1766dd22bed0a50d7c4fb9039782c1a0 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Fri, 10 Oct 2025 14:30:38 -0500
Subject: [PATCH 04/35] Strip wrapping backticks and json markers from AI
 response

---
 cog_safe_push/ai.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index 7345da0..8f66cf5 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -56,6 +56,15 @@ async def boolean(
 async def json_object(prompt: str, files: list[Path] | None = None) -> dict:
     system_prompt = "You always respond with valid JSON, and nothing else (no backticks, etc.). Your outputs will be used in a programmatic context."
     output = await call(system_prompt=system_prompt, prompt=prompt.strip(), files=files)
+    
+    if output.startswith("```json"):
+        output = output[7:]
+    elif output.startswith("```"):
+        output = output[3:]
+    if output.endswith("```"):
+        output = output[:-3]
+    output = output.strip()
+    
     try:
         return json.loads(output)
     except json.JSONDecodeError:

From 4aeec3b2b7b23e712eec37872cc9eca123e6f7b7 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Fri, 10 Oct 2025 14:36:11 -0500
Subject: [PATCH 05/35] lint

---
 cog_safe_push/ai.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index 8f66cf5..a9a1e87 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -56,7 +56,7 @@ async def boolean(
 async def json_object(prompt: str, files: list[Path] | None = None) -> dict:
     system_prompt = "You always respond with valid JSON, and nothing else (no backticks, etc.). Your outputs will be used in a programmatic context."
     output = await call(system_prompt=system_prompt, prompt=prompt.strip(), files=files)
-    
+
     if output.startswith("```json"):
         output = output[7:]
     elif output.startswith("```"):
@@ -64,7 +64,7 @@ async def json_object(prompt: str, files: list[Path] | None = None) -> dict:
     if output.endswith("```"):
         output = output[:-3]
     output = output.strip()
-    
+
     try:
         return json.loads(output)
     except json.JSONDecodeError:

From 7d1d9591360388edf7d7807637b9c7bd5100747a Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 07:59:06 -0500
Subject: [PATCH 06/35] Fuzz system prompt

---
 cog_safe_push/ai.py      |   6 +-
 cog_safe_push/predict.py | 125 ++++++++++++++++++---------------------
 cog_safe_push/tasks.py   |   6 +-
 test/test_predict.py     |  22 +++----
 4 files changed, 77 insertions(+), 82 deletions(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index a9a1e87..e49441a 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -53,8 +53,10 @@ async def boolean(
 
 
 @async_retry(3)
-async def json_object(prompt: str, files: list[Path] | None = None) -> dict:
-    system_prompt = "You always respond with valid JSON, and nothing else (no backticks, etc.). Your outputs will be used in a programmatic context."
+async def json_object(prompt: str, files: list[Path] | None = None, system_prompt: str = "") -> dict:
+    if system_prompt:
+        system_prompt = system_prompt.strip() + "\n\n"
+    system_prompt += "You always respond with valid JSON, and nothing else (no backticks, etc.). Your outputs will be used in a programmatic context."
     output = await call(system_prompt=system_prompt, prompt=prompt.strip(), files=files)
 
     if output.startswith("```json"):
diff --git a/cog_safe_push/predict.py b/cog_safe_push/predict.py
index b56ca96..8fba111 100644
--- a/cog_safe_push/predict.py
+++ b/cog_safe_push/predict.py
@@ -3,6 +3,7 @@
 import time
 from typing import Any, cast
 
+import httpx
 import replicate
 from replicate.exceptions import ReplicateError
 from replicate.model import Model
@@ -16,38 +17,19 @@
 from .utils import truncate
 
 
-async def make_predict_inputs(
-    schemas: dict,
-    train: bool,
-    only_required: bool,
-    seed: int | None,
-    fixed_inputs: dict[str, Any],
-    disabled_inputs: list[str],
-    fuzz_prompt: str | None,
-    inputs_history: list[dict] | None = None,
-    attempt=0,
-) -> tuple[dict, bool]:
-    input_name = "TrainingInput" if train else "Input"
-    input_schema = schemas[input_name]
-    properties = input_schema["properties"]
-    required = input_schema.get("required", [])
+async def make_fuzz_system_prompt() -> str:
+    async with httpx.AsyncClient() as client:
+        multimedia_example_files = await client.get("https://multimedia-example-files.replicate.dev/index.txt")
+    return """# Replicate model fuzzing inputs
 
-    is_deterministic = False
-    if "seed" in properties and seed is not None:
-        is_deterministic = True
-        del properties["seed"]
+Your task is to generate inputs for model fuzzing of a Replicate model.
 
-    fixed_inputs = {k: v for k, v in fixed_inputs.items() if k not in disabled_inputs}
+Given a model input JSON schema, return a valid JSON payload for this model.
 
-    schemas_str = json.dumps(schemas, indent=2)
-    prompt = (
-        '''
-Below is an example of an OpenAPI schema for a Cog model:
+For example,
 
 {
-  "'''
-        + input_name
-        + '''": {
+  "Input": {
     "properties": {
       "my_bool": {
         "description": "A bool.",
@@ -99,9 +81,7 @@ async def make_predict_inputs(
       "my_choice",
       "my_constrained_int"
     ],
-    "title": "'''
-        + input_name
-        + """",
+    "title": "Input",
     "type": "object"
   },
   "my_choice": {
@@ -116,7 +96,7 @@ async def make_predict_inputs(
   }
 }
 
-A valid json payload for that input schema would be:
+A valid JSON payload for that input schema would be:
 
 {
   "my_bool": true,
@@ -127,42 +107,54 @@ async def make_predict_inputs(
   "text": "world",
 }
 
-"""
-        + f"""
-Now, given the following OpenAPI schemas:
+## Respect constraints
+
+Be careful to respect constraints. For example:
+* If there is a "maximum" or "minimum" constraint on a number input, your generated input value must not be below the minimum or above the maximum
+* If there is an allOf constraint, your input values must be one of the valid enumeration values
+* If the description of an input describes constraints, your generated input must respect those constraints
+* etc.
+
+## Multimedia file inputs
+
+If an input have format=uri and you decide to populate that input, you should use one of the media URLs from the Multimedia example files section below.
+
+Make sure you pick an appropriate URL for the the input, e.g. pick one of the image examples below if the input expects represents an image.
+
+""" + multimedia_example_files
+
+
+async def make_fuzz_inputs(
+    schemas: dict,
+    train: bool,
+    only_required: bool,
+    seed: int | None,
+    fixed_inputs: dict[str, Any],
+    disabled_inputs: list[str],
+    fuzz_prompt: str | None,
+    inputs_history: list[dict] | None = None,
+    attempt=0,
+) -> tuple[dict, bool]:
+    input_name = "TrainingInput" if train else "Input"
+    input_schema = schemas[input_name]
+    properties = input_schema["properties"]
+    required = input_schema.get("required", [])
+
+    is_deterministic = False
+    if "seed" in properties and seed is not None:
+        is_deterministic = True
+        del properties["seed"]
+
+    fixed_inputs = {k: v for k, v in fixed_inputs.items() if k not in disabled_inputs}
+
+    schemas_str = json.dumps(schemas, indent=2)
+    prompt = f"""Given the following OpenAPI schemas:
 
 {schemas_str}
 
-Generate a json payload for the {input_name} schema.
-
-If an input have format=uri and you decide to populate that input, you should use one of the following media URLs. Make sure you pick an appropriate URL for the the input, e.g. pick one of the image examples below if the input expects represents an image.
-
-Image:
-* https://storage.googleapis.com/cog-safe-push-public/skull.jpg
-* https://storage.googleapis.com/cog-safe-push-public/fast-car.jpg
-* https://storage.googleapis.com/cog-safe-push-public/forest.png
-* https://storage.googleapis.com/cog-safe-push-public/face.gif
-Video:
-* https://storage.googleapis.com/cog-safe-push-public/harry-truman.webm
-* https://storage.googleapis.com/cog-safe-push-public/mariner-launch.ogv
-Music audio:
-* https://storage.googleapis.com/cog-safe-push-public/folk-music.mp3
-* https://storage.googleapis.com/cog-safe-push-public/ocarina.ogg
-* https://storage.googleapis.com/cog-safe-push-public/nu-style-kick.wav
-Test audio:
-* https://storage.googleapis.com/cog-safe-push-public/clap.ogg
-* https://storage.googleapis.com/cog-safe-push-public/beeps.mp3
-Long speech:
-* https://storage.googleapis.com/cog-safe-push-public/chekhov-article.ogg
-* https://storage.googleapis.com/cog-safe-push-public/momentos-spanish.ogg
-Short speech:
-* https://storage.googleapis.com/cog-safe-push-public/de-experiment-german-word.ogg
-* https://storage.googleapis.com/cog-safe-push-public/de-ionendosis-german-word.ogg
-
-If the schema has default values for some of the inputs, feel free to either use the defaults or come up with new values.
-
-    """
-    )
+Generate a valid JSON payload for the {input_name} schema.
+
+"""
 
     if fixed_inputs:
         fixed_inputs_str = json.dumps(fixed_inputs)
@@ -192,14 +184,15 @@ async def make_predict_inputs(
 
 You must follow these instructions: {fuzz_prompt}"""
 
-    inputs = await ai.json_object(prompt)
+    system_prompt = await make_fuzz_system_prompt()
+    inputs = await ai.json_object(prompt, system_prompt=system_prompt)
     if set(required) - set(inputs.keys()):
         max_attempts = 5
         if attempt == max_attempts:
             raise AIError(
                 f"Failed to generate a json payload with the correct keys after {max_attempts} attempts, giving up"
             )
-        return await make_predict_inputs(
+        return await make_fuzz_inputs(
             schemas=schemas,
             train=train,
             only_required=only_required,
diff --git a/cog_safe_push/tasks.py b/cog_safe_push/tasks.py
index af0ff32..7590034 100644
--- a/cog_safe_push/tasks.py
+++ b/cog_safe_push/tasks.py
@@ -11,7 +11,7 @@
 )
 from .match_outputs import outputs_match
 from .output_checkers import OutputChecker
-from .predict import make_predict_inputs, predict
+from .predict import make_fuzz_inputs, predict
 from .task_context import TaskContext
 
 
@@ -41,7 +41,7 @@ async def run(self) -> None:
             schemas = schema.get_schemas(
                 self.context.model, train=self.context.is_train()
             )
-            inputs, is_deterministic = await make_predict_inputs(
+            inputs, is_deterministic = await make_fuzz_inputs(
                 schemas,
                 train=self.context.is_train(),
                 only_required=True,
@@ -130,7 +130,7 @@ async def run(self) -> None:
         )
         inputs_history = []
         for _ in range(self.num_inputs):
-            inputs, _ = await make_predict_inputs(
+            inputs, _ = await make_fuzz_inputs(
                 schemas,
                 train=self.context.is_train(),
                 only_required=False,
diff --git a/test/test_predict.py b/test/test_predict.py
index 140874d..dfd894d 100644
--- a/test/test_predict.py
+++ b/test/test_predict.py
@@ -3,7 +3,7 @@
 import pytest
 
 from cog_safe_push.exceptions import AIError
-from cog_safe_push.predict import make_predict_inputs
+from cog_safe_push.predict import make_fuzz_inputs
 
 
 @pytest.fixture
@@ -34,7 +34,7 @@ def sample_schemas():
 async def test_make_predict_inputs_basic(mock_json_object, sample_schemas):
     mock_json_object.return_value = {"text": "hello", "number": 42, "choice": "A"}
 
-    inputs, is_deterministic = await make_predict_inputs(
+    inputs, is_deterministic = await make_fuzz_inputs(
         sample_schemas,
         train=False,
         only_required=True,
@@ -52,7 +52,7 @@ async def test_make_predict_inputs_with_seed(sample_schemas):
     with patch("cog_safe_push.predict.ai.json_object") as mock_json_object:
         mock_json_object.return_value = {"text": "hello", "number": 42, "choice": "A"}
 
-        inputs, is_deterministic = await make_predict_inputs(
+        inputs, is_deterministic = await make_fuzz_inputs(
             sample_schemas,
             train=False,
             only_required=True,
@@ -70,7 +70,7 @@ async def test_make_predict_inputs_with_fixed_inputs(sample_schemas):
     with patch("cog_safe_push.predict.ai.json_object") as mock_json_object:
         mock_json_object.return_value = {"text": "hello", "number": 42, "choice": "A"}
 
-        inputs, _ = await make_predict_inputs(
+        inputs, _ = await make_fuzz_inputs(
             sample_schemas,
             train=False,
             only_required=True,
@@ -92,7 +92,7 @@ async def test_make_predict_inputs_with_disabled_inputs(sample_schemas):
             "optional": True,
         }
 
-        inputs, _ = await make_predict_inputs(
+        inputs, _ = await make_fuzz_inputs(
             sample_schemas,
             train=False,
             only_required=False,
@@ -114,7 +114,7 @@ async def test_make_predict_inputs_with_inputs_history(sample_schemas):
             {"text": "older", "number": 21, "choice": "B"},
         ]
 
-        inputs, _ = await make_predict_inputs(
+        inputs, _ = await make_fuzz_inputs(
             sample_schemas,
             train=False,
             only_required=True,
@@ -136,7 +136,7 @@ async def test_make_predict_inputs_ai_error(sample_schemas):
             {"text": "hello", "number": 42, "choice": "A"},  # Correct input
         ]
 
-        inputs, _ = await make_predict_inputs(
+        inputs, _ = await make_fuzz_inputs(
             sample_schemas,
             train=False,
             only_required=True,
@@ -157,7 +157,7 @@ async def test_make_predict_inputs_max_attempts_reached(sample_schemas):
         }  # Always missing required fields
 
         with pytest.raises(AIError):
-            await make_predict_inputs(
+            await make_fuzz_inputs(
                 sample_schemas,
                 train=False,
                 only_required=True,
@@ -179,7 +179,7 @@ async def test_make_predict_inputs_filters_null_values(sample_schemas):
             "input_image": None,  # This should be filtered out
         }
 
-        inputs, _ = await make_predict_inputs(
+        inputs, _ = await make_fuzz_inputs(
             sample_schemas,
             train=False,
             only_required=False,
@@ -220,7 +220,7 @@ async def test_make_predict_inputs_filters_various_null_representations():
             "optional_field": None,  # Optional field with null that should be filtered
         }
 
-        inputs, _ = await make_predict_inputs(
+        inputs, _ = await make_fuzz_inputs(
             schemas,
             train=False,
             only_required=False,
@@ -263,7 +263,7 @@ async def test_make_predict_inputs_preserves_valid_values():
             "null_field": None,  # Should be filtered out
         }
 
-        inputs, _ = await make_predict_inputs(
+        inputs, _ = await make_fuzz_inputs(
             schemas,
             train=False,
             only_required=False,

From 16398d20fffe65f887550fcc584385e51b9d17a4 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 08:05:11 -0500
Subject: [PATCH 07/35] format

---
 cog_safe_push/ai.py      |  4 +++-
 cog_safe_push/predict.py | 11 ++++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index e49441a..3a23f50 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -53,7 +53,9 @@ async def boolean(
 
 
 @async_retry(3)
-async def json_object(prompt: str, files: list[Path] | None = None, system_prompt: str = "") -> dict:
+async def json_object(
+    prompt: str, files: list[Path] | None = None, system_prompt: str = ""
+) -> dict:
     if system_prompt:
         system_prompt = system_prompt.strip() + "\n\n"
     system_prompt += "You always respond with valid JSON, and nothing else (no backticks, etc.). Your outputs will be used in a programmatic context."
diff --git a/cog_safe_push/predict.py b/cog_safe_push/predict.py
index 8fba111..a44f6d6 100644
--- a/cog_safe_push/predict.py
+++ b/cog_safe_push/predict.py
@@ -19,8 +19,11 @@
 
 async def make_fuzz_system_prompt() -> str:
     async with httpx.AsyncClient() as client:
-        multimedia_example_files = await client.get("https://multimedia-example-files.replicate.dev/index.txt")
-    return """# Replicate model fuzzing inputs
+        multimedia_example_files = await client.get(
+            "https://multimedia-example-files.replicate.dev/index.txt"
+        )
+    return (
+        """# Replicate model fuzzing inputs
 
 Your task is to generate inputs for model fuzzing of a Replicate model.
 
@@ -121,7 +124,9 @@ async def make_fuzz_system_prompt() -> str:
 
 Make sure you pick an appropriate URL for the the input, e.g. pick one of the image examples below if the input expects represents an image.
 
-""" + multimedia_example_files
+"""
+        + multimedia_example_files
+    )
 
 
 async def make_fuzz_inputs(

From a11517730511bc0e5cd4aab67ba5d197837e0e0e Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 08:10:21 -0500
Subject: [PATCH 08/35] fix response issue

---
 cog_safe_push/predict.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cog_safe_push/predict.py b/cog_safe_push/predict.py
index a44f6d6..0e1086d 100644
--- a/cog_safe_push/predict.py
+++ b/cog_safe_push/predict.py
@@ -19,9 +19,10 @@
 
 async def make_fuzz_system_prompt() -> str:
     async with httpx.AsyncClient() as client:
-        multimedia_example_files = await client.get(
+        response = await client.get(
             "https://multimedia-example-files.replicate.dev/index.txt"
         )
+    multimedia_example_files = response.text
     return (
         """# Replicate model fuzzing inputs
 

From fa3af27e23b2bcb4143bf10a761831319f364344 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 08:50:20 -0500
Subject: [PATCH 09/35] Add integration test for non-matching
 replicate.delivery images

This test covers the reported case of two different image URLs from
replicate.delivery and asserts that they are identified as not similar
when compared using AI-based image matching. It also adds a sanity check
that a URL matches itself.
---
 integration-test/test_non_matching_images.py | 38 ++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 integration-test/test_non_matching_images.py

diff --git a/integration-test/test_non_matching_images.py b/integration-test/test_non_matching_images.py
new file mode 100644
index 0000000..d9ffb3a
--- /dev/null
+++ b/integration-test/test_non_matching_images.py
@@ -0,0 +1,38 @@
+import pytest
+
+from cog_safe_push.match_outputs import outputs_match
+
+
+@pytest.mark.asyncio
+async def test_non_matching_replicate_delivery_images():
+    """
+    Test that reproduces the issue where two different replicate.delivery URLs
+    for different images are correctly identified as not matching.
+    
+    This test uses the exact URLs from the reported error:
+    - test output: https://replicate.delivery/xezq/AwVT92BrC2LjMph3Qr84eoTOBfUY14ms10oN0pr6GhXqeB8qA/out.jpg
+    - model output: https://replicate.delivery/xezq/Nm38Rbi6wiqgJxTaqPw6Lwh58LWbJe8SruZnCVpD40HYfAeqA/out.jpg
+    
+    Expected behavior: Images should not match because they are different images.
+    """
+    test_output = "https://replicate.delivery/xezq/AwVT92BrC2LjMph3Qr84eoTOBfUY14ms10oN0pr6GhXqeB8qA/out.jpg"
+    model_output = "https://replicate.delivery/xezq/Nm38Rbi6wiqgJxTaqPw6Lwh58LWbJe8SruZnCVpD40HYfAeqA/out.jpg"
+    
+    # Test with is_deterministic=False (uses AI comparison)
+    matches, error_message = await outputs_match(test_output, model_output, is_deterministic=False)
+    
+    assert not matches, f"Images should not match. URLs are different: {test_output} vs {model_output}"
+    assert error_message == "Images are not similar", f"Expected 'Images are not similar' but got: {error_message}"
+
+
+@pytest.mark.asyncio
+async def test_matching_replicate_delivery_images():
+    """
+    Test that the same URL matches itself (sanity check).
+    """
+    url = "https://replicate.delivery/xezq/AwVT92BrC2LjMph3Qr84eoTOBfUY14ms10oN0pr6GhXqeB8qA/out.jpg"
+    
+    # Test with is_deterministic=False (uses AI comparison)
+    matches, error_message = await outputs_match(url, url, is_deterministic=False)
+    
+    assert matches, f"Same URL should match itself. Error: {error_message}"

From 3ab4b5bf923294a156315ce1d7b37de8c3fa0dd6 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:01:28 -0500
Subject: [PATCH 10/35] Implement normalize_suffix to normalize file extensions
 like jpeg to jpg

---
 cog_safe_push/match_outputs.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/cog_safe_push/match_outputs.py b/cog_safe_push/match_outputs.py
index f7508a4..21afa54 100644
--- a/cog_safe_push/match_outputs.py
+++ b/cog_safe_push/match_outputs.py
@@ -181,11 +181,19 @@ def is_video(url: str) -> bool:
 
 
 def extensions_match(url1: str, url2: str) -> bool:
-    ext1 = Path(urlparse(url1).path).suffix
-    ext2 = Path(urlparse(url2).path).suffix
+    ext1 = normalize_suffix(Path(urlparse(url1).path).suffix)
+    ext2 = normalize_suffix(Path(urlparse(url2).path).suffix)
     return ext1.lower() == ext2.lower()
 
 
+def normalize_suffix(suffix: str) -> str:
+    suffix = suffix.lower()
+    normalizations = {
+        ".jpeg": ".jpg",
+    }
+    return normalizations.get(suffix, suffix)
+
+
 def is_url(s: str) -> bool:
     return s.startswith(("http://", "https://"))
 

From 39a49c0d9c431a81192450379ecd4d58a25f95b9 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:01:55 -0500
Subject: [PATCH 11/35] Add more file extension normalizations for common
 variants

---
 cog_safe_push/match_outputs.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cog_safe_push/match_outputs.py b/cog_safe_push/match_outputs.py
index 21afa54..1bdde8f 100644
--- a/cog_safe_push/match_outputs.py
+++ b/cog_safe_push/match_outputs.py
@@ -190,6 +190,10 @@ def normalize_suffix(suffix: str) -> str:
     suffix = suffix.lower()
     normalizations = {
         ".jpeg": ".jpg",
+        ".jpe": ".jpg",
+        ".tiff": ".tif",
+        ".mpeg": ".mpg",
+        ".htm": ".html",
     }
     return normalizations.get(suffix, suffix)
 

From 0f7382beffce353cf2fcb02608569f70534ed167 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:13:58 -0500
Subject: [PATCH 12/35] Add thinking parameter to call() function signature

---
 cog_safe_push/ai.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index 3a23f50..2202276 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -38,13 +38,16 @@ async def wrapper_retry(*args, **kwargs):
 async def boolean(
     prompt: str, files: list[Path] | None = None, include_file_metadata: bool = False
 ) -> bool:
-    system_prompt = "You are a boolean classifier. You must only respond with either YES or NO, and absolutely nothing else. Your response will be used in a programmatic context so it is critical that you only ever answer with either the string YES or the string NO."
+    system_prompt = "You are a boolean classifier. You must only respond with either YES or NO, and absolutely nothing else. Your response will be used in a programmatic context so it is critical that you only ever answer with either the string YES or the string NO. After a newline, include a description of why you answered like you did."
     output = await call(
         system_prompt=system_prompt,
         prompt=prompt.strip(),
         files=files,
         include_file_metadata=include_file_metadata,
+        thinking=True,
     )
+    print(f"{output=}")  # TODO(andreas): remove debug
+
     if output == "YES":
         return True
     if output == "NO":
@@ -80,6 +83,7 @@ async def call(
     prompt: str,
     files: list[Path] | None = None,
     include_file_metadata: bool = False,
+    thinking: bool = False,
 ) -> str:
     api_key = os.environ.get("ANTHROPIC_API_KEY")
     if not api_key:

From cf4ec1a88cb3f8594be772e7e7367ae3cad676fb Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:14:05 -0500
Subject: [PATCH 13/35] Pass thinking parameter to Anthropic API with proper
 format

---
 cog_safe_push/ai.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index 2202276..15a2b78 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -112,6 +112,10 @@ async def call(
             {"role": "user", "content": content}
         ]
 
+        thinking_config = None
+        if thinking:
+            thinking_config = {"type": "enabled", "budget_tokens": 2048}
+
         response = await client.messages.create(
             model=model,
             messages=messages,
@@ -119,6 +123,7 @@ async def call(
             max_tokens=4096,
             stream=False,
             temperature=1.0,
+            thinking=thinking_config,
         )
         content = cast("anthropic.types.TextBlock", response.content[0])
 

From a8b87e4bbebbc2a6acd8a9ac0bbe7ceadb9b4868 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:14:19 -0500
Subject: [PATCH 14/35] Handle thinking blocks in response - extract only text
 content

---
 cog_safe_push/ai.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index 15a2b78..9b1feb2 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -125,7 +125,11 @@ async def call(
             temperature=1.0,
             thinking=thinking_config,
         )
-        content = cast("anthropic.types.TextBlock", response.content[0])
+        
+        text_blocks = [block for block in response.content if block.type == "text"]
+        if not text_blocks:
+            raise AIError("No text content in response")
+        content = cast("anthropic.types.TextBlock", text_blocks[0])
 
     finally:
         await client.close()

From 98bd8056cca6a7f03e6b782ed693fd886817cd2c Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:15:30 -0500
Subject: [PATCH 15/35] make non matching test smarter

---
 cog_safe_push/ai.py                          |  2 +-
 cog_safe_push/match_outputs.py               |  6 ++-
 integration-test/test_non_matching_images.py | 43 ++++++++------------
 3 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index 9b1feb2..6f8640f 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -38,7 +38,7 @@ async def wrapper_retry(*args, **kwargs):
 async def boolean(
     prompt: str, files: list[Path] | None = None, include_file_metadata: bool = False
 ) -> bool:
-    system_prompt = "You are a boolean classifier. You must only respond with either YES or NO, and absolutely nothing else. Your response will be used in a programmatic context so it is critical that you only ever answer with either the string YES or the string NO. After a newline, include a description of why you answered like you did."
+    system_prompt = "You are a boolean classifier. You must only respond with either YES or NO, and absolutely nothing else. Your response will be used in a programmatic context so it is critical that you only ever answer with either the string YES or the string NO."
     output = await call(
         system_prompt=system_prompt,
         prompt=prompt.strip(),
diff --git a/cog_safe_push/match_outputs.py b/cog_safe_push/match_outputs.py
index 1bdde8f..6bad888 100644
--- a/cog_safe_push/match_outputs.py
+++ b/cog_safe_push/match_outputs.py
@@ -222,7 +222,11 @@ async def images_match(
             return True, ""
 
         fuzzy_match = await ai.boolean(
-            "These two images have been generated by or modified by an AI model. Is it highly likely that those two predictions of the model had the same inputs?",
+            """I provide you with _two_ input images. These two images have been generated by or modified by an AI model. Is it highly likely that those two predictions of the model had the same inputs?
+
+* If the two images are identical, respond with YES.
+* If the two images have very similar subject matters that have probably been generated by the same prompt, respond with YES.
+            """,
             files=[tmp1, tmp2],
         )
         if fuzzy_match:
diff --git a/integration-test/test_non_matching_images.py b/integration-test/test_non_matching_images.py
index d9ffb3a..6162fce 100644
--- a/integration-test/test_non_matching_images.py
+++ b/integration-test/test_non_matching_images.py
@@ -4,35 +4,26 @@
 
 
 @pytest.mark.asyncio
-async def test_non_matching_replicate_delivery_images():
-    """
-    Test that reproduces the issue where two different replicate.delivery URLs
-    for different images are correctly identified as not matching.
-    
-    This test uses the exact URLs from the reported error:
-    - test output: https://replicate.delivery/xezq/AwVT92BrC2LjMph3Qr84eoTOBfUY14ms10oN0pr6GhXqeB8qA/out.jpg
-    - model output: https://replicate.delivery/xezq/Nm38Rbi6wiqgJxTaqPw6Lwh58LWbJe8SruZnCVpD40HYfAeqA/out.jpg
-    
-    Expected behavior: Images should not match because they are different images.
-    """
-    test_output = "https://replicate.delivery/xezq/AwVT92BrC2LjMph3Qr84eoTOBfUY14ms10oN0pr6GhXqeB8qA/out.jpg"
-    model_output = "https://replicate.delivery/xezq/Nm38Rbi6wiqgJxTaqPw6Lwh58LWbJe8SruZnCVpD40HYfAeqA/out.jpg"
-    
-    # Test with is_deterministic=False (uses AI comparison)
-    matches, error_message = await outputs_match(test_output, model_output, is_deterministic=False)
-    
-    assert not matches, f"Images should not match. URLs are different: {test_output} vs {model_output}"
-    assert error_message == "Images are not similar", f"Expected 'Images are not similar' but got: {error_message}"
+async def test_output_match_similar_images():
+    url1 = "https://replicate.delivery/xezq/AwVT92BrC2LjMph3Qr84eoTOBfUY14ms10oN0pr6GhXqeB8qA/out.jpg"
+    url2 = "https://replicate.delivery/xezq/Nm38Rbi6wiqgJxTaqPw6Lwh58LWbJe8SruZnCVpD40HYfAeqA/out.jpg"
+    matches, error_message = await outputs_match(url1, url2, is_deterministic=False)
+    assert matches, error_message
 
 
 @pytest.mark.asyncio
-async def test_matching_replicate_delivery_images():
-    """
-    Test that the same URL matches itself (sanity check).
-    """
+async def test_output_match_same_image():
     url = "https://replicate.delivery/xezq/AwVT92BrC2LjMph3Qr84eoTOBfUY14ms10oN0pr6GhXqeB8qA/out.jpg"
-    
-    # Test with is_deterministic=False (uses AI comparison)
     matches, error_message = await outputs_match(url, url, is_deterministic=False)
+    assert matches, error_message
+
+
+@pytest.mark.asyncio
+async def test_output_match_not_similar_images():
+    url1 = "https://replicate.delivery/xezq/FC8AoQT9RlL1LNxCdM1scYfBKsk4A1rmOb67lYxfLYNcYBeqA/out-0.webp"
+    url2 = "https://replicate.delivery/xezq/Zj0SX6yRmHbSM1SWXL583l4jg0N5UtiBPINOylKwq4zKWgXF/out-0.webp"
+    matches, error_message = await outputs_match(url1, url2, is_deterministic=False)
+    assert not matches
+    assert error_message == "Images are not similar", f"Expected 'Images are not similar' but got: {error_message}"
+
     
-    assert matches, f"Same URL should match itself. Error: {error_message}"

From b2c28a3457f85bbad9a99ad0847717f0dc5f8d61 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:16:17 -0500
Subject: [PATCH 16/35] Import anthropic types for thinking parameter

---
 cog_safe_push/ai.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index 6f8640f..c3db0d0 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -8,6 +8,8 @@
 from typing import cast
 
 import anthropic
+from anthropic._types import NOT_GIVEN, NotGiven
+from anthropic.types import ThinkingConfigParam
 
 from . import log
 from .exceptions import AIError, ArgumentError
@@ -125,7 +127,7 @@ async def call(
             temperature=1.0,
             thinking=thinking_config,
         )
-        
+
         text_blocks = [block for block in response.content if block.type == "text"]
         if not text_blocks:
             raise AIError("No text content in response")

From 83c4d24a114bde61457f52d472aee0a265dd5903 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:16:24 -0500
Subject: [PATCH 17/35] Fix thinking parameter types to use proper anthropic
 types

---
 cog_safe_push/ai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index c3db0d0..d1120c4 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -114,7 +114,7 @@ async def call(
             {"role": "user", "content": content}
         ]
 
-        thinking_config = None
+        thinking_config: ThinkingConfigParam | NotGiven = NOT_GIVEN
         if thinking:
             thinking_config = {"type": "enabled", "budget_tokens": 2048}
 

From c11c7c4c5cc614e7aee84ae8b4e98d218ee129ae Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:16:35 -0500
Subject: [PATCH 18/35] Move ThinkingConfigParam import into type-checking
 block

---
 cog_safe_push/ai.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index d1120c4..dc70a2b 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -5,11 +5,13 @@
 import os
 import subprocess
 from pathlib import Path
-from typing import cast
+from typing import TYPE_CHECKING, cast
 
 import anthropic
 from anthropic._types import NOT_GIVEN, NotGiven
-from anthropic.types import ThinkingConfigParam
+
+if TYPE_CHECKING:
+    from anthropic.types import ThinkingConfigParam
 
 from . import log
 from .exceptions import AIError, ArgumentError

From b333221c639a8572ccf22f076724434453bf0bd0 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:17:27 -0500
Subject: [PATCH 19/35] lint

---
 integration-test/test_non_matching_images.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/integration-test/test_non_matching_images.py b/integration-test/test_non_matching_images.py
index 6162fce..9601037 100644
--- a/integration-test/test_non_matching_images.py
+++ b/integration-test/test_non_matching_images.py
@@ -24,6 +24,6 @@ async def test_output_match_not_similar_images():
     url2 = "https://replicate.delivery/xezq/Zj0SX6yRmHbSM1SWXL583l4jg0N5UtiBPINOylKwq4zKWgXF/out-0.webp"
     matches, error_message = await outputs_match(url1, url2, is_deterministic=False)
     assert not matches
-    assert error_message == "Images are not similar", f"Expected 'Images are not similar' but got: {error_message}"
-
-    
+    assert error_message == "Images are not similar", (
+        f"Expected 'Images are not similar' but got: {error_message}"
+    )

From c7602dd009ab6858617ae6631d361559822f0ec1 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:20:31 -0500
Subject: [PATCH 20/35] Conditionally pass thinking parameter to avoid type
 issues

---
 cog_safe_push/ai.py | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index dc70a2b..e7f073c 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -116,19 +116,25 @@ async def call(
             {"role": "user", "content": content}
         ]
 
-        thinking_config: ThinkingConfigParam | NotGiven = NOT_GIVEN
         if thinking:
-            thinking_config = {"type": "enabled", "budget_tokens": 2048}
-
-        response = await client.messages.create(
-            model=model,
-            messages=messages,
-            system=system_prompt,
-            max_tokens=4096,
-            stream=False,
-            temperature=1.0,
-            thinking=thinking_config,
-        )
+            response = await client.messages.create(
+                model=model,
+                messages=messages,
+                system=system_prompt,
+                max_tokens=4096,
+                stream=False,
+                temperature=1.0,
+                thinking={"type": "enabled", "budget_tokens": 2048},
+            )
+        else:
+            response = await client.messages.create(
+                model=model,
+                messages=messages,
+                system=system_prompt,
+                max_tokens=4096,
+                stream=False,
+                temperature=1.0,
+            )
 
         text_blocks = [block for block in response.content if block.type == "text"]
         if not text_blocks:

From cf7399b1275d5822d8beb5c5bca3e2f72fddaf9b Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:20:43 -0500
Subject: [PATCH 21/35] Remove unused imports after refactoring thinking
 parameter

---
 cog_safe_push/ai.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index e7f073c..4b785af 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -5,13 +5,9 @@
 import os
 import subprocess
 from pathlib import Path
-from typing import TYPE_CHECKING, cast
+from typing import cast
 
 import anthropic
-from anthropic._types import NOT_GIVEN, NotGiven
-
-if TYPE_CHECKING:
-    from anthropic.types import ThinkingConfigParam
 
 from . import log
 from .exceptions import AIError, ArgumentError

From e297736e00a2a85f6b5168ade0c4c39334e66b6a Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:22:00 -0500
Subject: [PATCH 22/35] better string matching

---
 cog_safe_push/match_outputs.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cog_safe_push/match_outputs.py b/cog_safe_push/match_outputs.py
index 6bad888..a3dbae2 100644
--- a/cog_safe_push/match_outputs.py
+++ b/cog_safe_push/match_outputs.py
@@ -138,6 +138,9 @@ async def strings_match(s1: str, s2: str, is_deterministic: bool) -> tuple[bool,
         f"""
 Have these two strings been generated by the same generative AI model inputs/prompt?
 
+* If the two strings are identical, respond with YES
+* If the two strings have very similar content, respond with YES
+
 String 1: '{s1}'
 String 2: '{s2}'
     """

From ab1c717cb556675fe40ecf1e29fa93f943cbdb23 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:31:24 -0500
Subject: [PATCH 23/35] add debugging

---
 cog_safe_push/predict.py | 3 +++
 script/integration-test  | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/cog_safe_push/predict.py b/cog_safe_push/predict.py
index 0e1086d..3e97e22 100644
--- a/cog_safe_push/predict.py
+++ b/cog_safe_push/predict.py
@@ -224,6 +224,9 @@ async def make_fuzz_inputs(
     # Filter out null values as Replicate API doesn't accept null for optional fields
     inputs = {k: v for k, v in inputs.items() if v is not None}
 
+    print(f"{schemas_str=}, {inputs=}")  # TODO(andreas): remove debug
+
+
     return inputs, is_deterministic
 
 
diff --git a/script/integration-test b/script/integration-test
index c90b239..16da579 100755
--- a/script/integration-test
+++ b/script/integration-test
@@ -1,3 +1,3 @@
 #!/bin/bash -eu
 
-pytest -n4 -s integration-test/
+pytest -n8 -s integration-test/

From 8d934259ccc0eabb42b64e3dfc38bea59fc8fd25 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:42:19 -0500
Subject: [PATCH 24/35] thinking in fuzz generation

---
 cog_safe_push/ai.py      | 12 ++++++++++--
 cog_safe_push/predict.py |  7 ++-----
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index 4b785af..62dc67a 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -57,12 +57,20 @@ async def boolean(
 
 @async_retry(3)
 async def json_object(
-    prompt: str, files: list[Path] | None = None, system_prompt: str = ""
+    prompt: str,
+    files: list[Path] | None = None,
+    system_prompt: str = "",
+    thinking: bool = False,
 ) -> dict:
     if system_prompt:
         system_prompt = system_prompt.strip() + "\n\n"
     system_prompt += "You always respond with valid JSON, and nothing else (no backticks, etc.). Your outputs will be used in a programmatic context."
-    output = await call(system_prompt=system_prompt, prompt=prompt.strip(), files=files)
+    output = await call(
+        system_prompt=system_prompt,
+        prompt=prompt.strip(),
+        files=files,
+        thinking=thinking,
+    )
 
     if output.startswith("```json"):
         output = output[7:]
diff --git a/cog_safe_push/predict.py b/cog_safe_push/predict.py
index 3e97e22..91b5892 100644
--- a/cog_safe_push/predict.py
+++ b/cog_safe_push/predict.py
@@ -180,7 +180,7 @@ async def make_fuzz_inputs(
         inputs_history_str = "\n".join(["* " + json.dumps(i) for i in inputs_history])
         prompt += f"""
 
-Return a new combination of inputs that you haven't used before, ideally that's quite diverse from inputs you've used before. You have previously used these inputs:
+Return a new combination of inputs that you haven't used before, ideally that's quite diverse from inputs you've used before -- but still make sure you respect the constraints in the input schema (respecting those constraints is very important!). You have previously used these inputs:
 {inputs_history_str}"""
 
     if fuzz_prompt:
@@ -191,7 +191,7 @@ async def make_fuzz_inputs(
 You must follow these instructions: {fuzz_prompt}"""
 
     system_prompt = await make_fuzz_system_prompt()
-    inputs = await ai.json_object(prompt, system_prompt=system_prompt)
+    inputs = await ai.json_object(prompt, system_prompt=system_prompt, thinking=True)
     if set(required) - set(inputs.keys()):
         max_attempts = 5
         if attempt == max_attempts:
@@ -224,9 +224,6 @@ async def make_fuzz_inputs(
     # Filter out null values as Replicate API doesn't accept null for optional fields
     inputs = {k: v for k, v in inputs.items() if v is not None}
 
-    print(f"{schemas_str=}, {inputs=}")  # TODO(andreas): remove debug
-
-
     return inputs, is_deterministic
 
 

From 5489001de5eb4d5659e51988c1b765cd015f0d4c Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 09:42:51 -0500
Subject: [PATCH 25/35] increase max tokens

---
 cog_safe_push/ai.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index 62dc67a..9caccb4 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -12,6 +12,8 @@
 from . import log
 from .exceptions import AIError, ArgumentError
 
+MAX_TOKENS = 8192
+
 
 def async_retry(attempts=3):
     def decorator_retry(func):
@@ -125,7 +127,7 @@ async def call(
                 model=model,
                 messages=messages,
                 system=system_prompt,
-                max_tokens=4096,
+                max_tokens=MAX_TOKENS,
                 stream=False,
                 temperature=1.0,
                 thinking={"type": "enabled", "budget_tokens": 2048},
@@ -135,7 +137,7 @@ async def call(
                 model=model,
                 messages=messages,
                 system=system_prompt,
-                max_tokens=4096,
+                max_tokens=MAX_TOKENS,
                 stream=False,
                 temperature=1.0,
             )

From 85d6b12492dbc3971829fbb589b7f23e346a285e Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 15:40:34 -0500
Subject: [PATCH 26/35] Be even more explicit about constraints in fuzz prompt

---
 cog_safe_push/predict.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/cog_safe_push/predict.py b/cog_safe_push/predict.py
index 91b5892..50552ea 100644
--- a/cog_safe_push/predict.py
+++ b/cog_safe_push/predict.py
@@ -30,6 +30,8 @@ async def make_fuzz_system_prompt() -> str:
 
 Given a model input JSON schema, return a valid JSON payload for this model.
 
+## Example
+
 For example,
 
 {
@@ -111,6 +113,19 @@ async def make_fuzz_system_prompt() -> str:
   "text": "world",
 }
 
+The following is be be a valid JSON payload:
+
+{
+  "my_bool": true,
+  "my_choice": "foo",
+  "my_constrained_int": 11,
+  "my_float": 3.14,
+  "my_int": 10,
+  "text": "world",
+}
+
+...because my_constrained_int is greater than the maximum in the schema.
+
 ## Respect constraints
 
 Be careful to respect constraints. For example:

From f2e50b6ddb64c6721da4ec9dd6102eb46087769e Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 15:58:36 -0500
Subject: [PATCH 27/35] debugging

---
 cog_safe_push/predict.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cog_safe_push/predict.py b/cog_safe_push/predict.py
index 50552ea..1053899 100644
--- a/cog_safe_push/predict.py
+++ b/cog_safe_push/predict.py
@@ -278,6 +278,9 @@ async def predict(
                 # Assume it's an official model
                 prediction = replicate.predictions.create(model=model, input=inputs)
             else:
+                print(f"{version.openapi_schema=}")  # TODO(andreas): remove debug
+                print(f"{inputs=}")  # TODO(andreas): remove debug
+
                 raise
 
     log.v(f"{prefix}Prediction URL: https://replicate.com/p/{prediction.id}")

From 9b48ec0be79b128a6f1f19786ff1d3b9d64775cf Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 16:22:14 -0500
Subject: [PATCH 28/35] more debugging

---
 cog_safe_push/predict.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cog_safe_push/predict.py b/cog_safe_push/predict.py
index 1053899..288ab0c 100644
--- a/cog_safe_push/predict.py
+++ b/cog_safe_push/predict.py
@@ -281,6 +281,10 @@ async def predict(
                 print(f"{version.openapi_schema=}")  # TODO(andreas): remove debug
                 print(f"{inputs=}")  # TODO(andreas): remove debug
 
+                import sys
+
+                sys.exit(1)
+
                 raise
 
     log.v(f"{prefix}Prediction URL: https://replicate.com/p/{prediction.id}")

From f2930a781a75c1008dd150caab9b0dfa748e31d7 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sat, 11 Oct 2025 16:35:04 -0500
Subject: [PATCH 29/35] more debugging

---
 cog_safe_push/predict.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cog_safe_push/predict.py b/cog_safe_push/predict.py
index 288ab0c..ebc2781 100644
--- a/cog_safe_push/predict.py
+++ b/cog_safe_push/predict.py
@@ -278,7 +278,7 @@ async def predict(
                 # Assume it's an official model
                 prediction = replicate.predictions.create(model=model, input=inputs)
             else:
-                print(f"{version.openapi_schema=}")  # TODO(andreas): remove debug
+                print("version.openapi_schema:", json.dumps(version.openapi_schema, indent=2))  # TODO(andreas): remove debug
                 print(f"{inputs=}")  # TODO(andreas): remove debug
 
                 import sys

From 5390bfe3e7f6f2470b261e07450d20eb16ad3180 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Sun, 12 Oct 2025 04:35:08 -0500
Subject: [PATCH 30/35] more debugging

---
 cog_safe_push/predict.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cog_safe_push/predict.py b/cog_safe_push/predict.py
index ebc2781..230b116 100644
--- a/cog_safe_push/predict.py
+++ b/cog_safe_push/predict.py
@@ -282,6 +282,7 @@ async def predict(
                 print(f"{inputs=}")  # TODO(andreas): remove debug
 
                 import sys
+                sys.stdout.flush()
 
                 sys.exit(1)
 

From b4de425b134e0c8a4d1d392c22a94c72ce4b5165 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Mon, 13 Oct 2025 11:56:47 +0200
Subject: [PATCH 31/35] remove debugging, fix integration tests

---
 cog_safe_push/predict.py                     |  8 --------
 integration-test/test_non_matching_images.py | 10 +++++-----
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/cog_safe_push/predict.py b/cog_safe_push/predict.py
index 230b116..50552ea 100644
--- a/cog_safe_push/predict.py
+++ b/cog_safe_push/predict.py
@@ -278,14 +278,6 @@ async def predict(
                 # Assume it's an official model
                 prediction = replicate.predictions.create(model=model, input=inputs)
             else:
-                print("version.openapi_schema:", json.dumps(version.openapi_schema, indent=2))  # TODO(andreas): remove debug
-                print(f"{inputs=}")  # TODO(andreas): remove debug
-
-                import sys
-                sys.stdout.flush()
-
-                sys.exit(1)
-
                 raise
 
     log.v(f"{prefix}Prediction URL: https://replicate.com/p/{prediction.id}")
diff --git a/integration-test/test_non_matching_images.py b/integration-test/test_non_matching_images.py
index 9601037..68f1df0 100644
--- a/integration-test/test_non_matching_images.py
+++ b/integration-test/test_non_matching_images.py
@@ -5,23 +5,23 @@
 
 @pytest.mark.asyncio
 async def test_output_match_similar_images():
-    url1 = "https://replicate.delivery/xezq/AwVT92BrC2LjMph3Qr84eoTOBfUY14ms10oN0pr6GhXqeB8qA/out.jpg"
-    url2 = "https://replicate.delivery/xezq/Nm38Rbi6wiqgJxTaqPw6Lwh58LWbJe8SruZnCVpD40HYfAeqA/out.jpg"
+    url1 = "https://replicate.delivery/xezq/OrGhA2j4ACZ8FdbZgTxyaav6EKSxZ4jBnNzZwXIZZleq8TvKA/out-0.webp"
+    url2 = "https://replicate.delivery/xezq/Z4UKfUkAqp0RRaGQRIerW3ZGansA1Rqg6eodiOfYTfedZeTvKA/out-0.webp"
     matches, error_message = await outputs_match(url1, url2, is_deterministic=False)
     assert matches, error_message
 
 
 @pytest.mark.asyncio
 async def test_output_match_same_image():
-    url = "https://replicate.delivery/xezq/AwVT92BrC2LjMph3Qr84eoTOBfUY14ms10oN0pr6GhXqeB8qA/out.jpg"
+    url = "https://replicate.delivery/xezq/OrGhA2j4ACZ8FdbZgTxyaav6EKSxZ4jBnNzZwXIZZleq8TvKA/out-0.webp"
     matches, error_message = await outputs_match(url, url, is_deterministic=False)
     assert matches, error_message
 
 
 @pytest.mark.asyncio
 async def test_output_match_not_similar_images():
-    url1 = "https://replicate.delivery/xezq/FC8AoQT9RlL1LNxCdM1scYfBKsk4A1rmOb67lYxfLYNcYBeqA/out-0.webp"
-    url2 = "https://replicate.delivery/xezq/Zj0SX6yRmHbSM1SWXL583l4jg0N5UtiBPINOylKwq4zKWgXF/out-0.webp"
+    url1 = "https://replicate.delivery/xezq/OrGhA2j4ACZ8FdbZgTxyaav6EKSxZ4jBnNzZwXIZZleq8TvKA/out-0.webp"
+    url2 = "https://replicate.delivery/xezq/NtEEOzxwpTaFFF5fhalpLevI1HwrmGc3bNX799EzWmf51P9qA/out-0.webp"
     matches, error_message = await outputs_match(url1, url2, is_deterministic=False)
     assert not matches
     assert error_message == "Images are not similar", (

From 6a6a1267ebd569e71be8eb737b785a06d69b93ea Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Mon, 13 Oct 2025 11:57:42 +0200
Subject: [PATCH 32/35] remove debug

---
 cog_safe_push/ai.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cog_safe_push/ai.py b/cog_safe_push/ai.py
index 9caccb4..9ea7526 100644
--- a/cog_safe_push/ai.py
+++ b/cog_safe_push/ai.py
@@ -48,8 +48,6 @@ async def boolean(
         include_file_metadata=include_file_metadata,
         thinking=True,
     )
-    print(f"{output=}")  # TODO(andreas): remove debug
-
     if output == "YES":
         return True
     if output == "NO":

From 08b4076a051b84030bf3c4af7478a471cc38e80d Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Mon, 13 Oct 2025 12:28:03 +0200
Subject: [PATCH 33/35] document valid image types in fixture

---
 end-to-end-test/fixtures/image-base/predict.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/end-to-end-test/fixtures/image-base/predict.py b/end-to-end-test/fixtures/image-base/predict.py
index 8c92f78..8a90ac1 100644
--- a/end-to-end-test/fixtures/image-base/predict.py
+++ b/end-to-end-test/fixtures/image-base/predict.py
@@ -11,7 +11,7 @@ def setup(self):
 
     def predict(
         self,
-        image: Path = Input(description="Input image."),
+        image: Path = Input(description="Input image. Valid file types are: jpg, png, webp, bmp, gif (not animated)"),
         width: int = Input(description="New width.", ge=1, le=2000),
         height: int = Input(description="New height.", ge=1, le=1000),
     ) -> Path:

From 027129f496449129e3d2cfd52efeb579d7f3808a Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Mon, 13 Oct 2025 13:05:12 +0200
Subject: [PATCH 34/35] lint

---
 end-to-end-test/fixtures/image-base/predict.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/end-to-end-test/fixtures/image-base/predict.py b/end-to-end-test/fixtures/image-base/predict.py
index 8a90ac1..682f447 100644
--- a/end-to-end-test/fixtures/image-base/predict.py
+++ b/end-to-end-test/fixtures/image-base/predict.py
@@ -11,7 +11,9 @@ def setup(self):
 
     def predict(
         self,
-        image: Path = Input(description="Input image. Valid file types are: jpg, png, webp, bmp, gif (not animated)"),
+        image: Path = Input(
+            description="Input image. Valid file types are: jpg, png, webp, bmp, gif (not animated)"
+        ),
         width: int = Input(description="New width.", ge=1, le=2000),
         height: int = Input(description="New height.", ge=1, le=1000),
     ) -> Path:

From e8b7c0d8800ddef71ca274a504d4a41f35ba9af5 Mon Sep 17 00:00:00 2001
From: andreasjansson <andreas@replicate.ai>
Date: Mon, 13 Oct 2025 15:22:30 +0200
Subject: [PATCH 35/35] Fix nits

---
 cog_safe_push/predict.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cog_safe_push/predict.py b/cog_safe_push/predict.py
index 50552ea..82e23e1 100644
--- a/cog_safe_push/predict.py
+++ b/cog_safe_push/predict.py
@@ -113,7 +113,7 @@ async def make_fuzz_system_prompt() -> str:
   "text": "world",
 }
 
-The following is be be a valid JSON payload:
+The following is NOT a valid JSON payload:
 
 {
   "my_bool": true,
@@ -138,7 +138,7 @@ async def make_fuzz_system_prompt() -> str:
 
 If an input have format=uri and you decide to populate that input, you should use one of the media URLs from the Multimedia example files section below.
 
-Make sure you pick an appropriate URL for the the input, e.g. pick one of the image examples below if the input expects represents an image.
+Make sure you pick an appropriate URL for the the input, e.g. pick one of the image examples below if the input expects an image. Also make sure you respect any hints or documentation about file types.
 
 """
         + multimedia_example_files
@@ -221,6 +221,7 @@ async def make_fuzz_inputs(
             fixed_inputs=fixed_inputs,
             disabled_inputs=disabled_inputs,
             fuzz_prompt=fuzz_prompt,
+            inputs_history=inputs_history,
             attempt=attempt + 1,
         )