Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
6dd03d7
use claude 4.5
andreasjansson Oct 8, 2025
872c500
stricter boolean prompt
andreasjansson Oct 8, 2025
f1073c6
fix bool prompt
andreasjansson Oct 10, 2025
7f3b04a
Strip wrapping backticks and json markers from AI response
andreasjansson Oct 10, 2025
4aeec3b
lint
andreasjansson Oct 10, 2025
7d1d959
Fuzz system prompt
andreasjansson Oct 11, 2025
16398d2
format
andreasjansson Oct 11, 2025
a115177
fix response issue
andreasjansson Oct 11, 2025
fa3af27
Add integration test for non-matching replicate.delivery images
andreasjansson Oct 11, 2025
3ab4b5b
Implement normalize_suffix to normalize file extensions like jpeg to jpg
andreasjansson Oct 11, 2025
39a49c0
Add more file extension normalizations for common variants
andreasjansson Oct 11, 2025
0f7382b
Add thinking parameter to call() function signature
andreasjansson Oct 11, 2025
cf4ec1a
Pass thinking parameter to Anthropic API with proper format
andreasjansson Oct 11, 2025
a8b87e4
Handle thinking blocks in response - extract only text content
andreasjansson Oct 11, 2025
98bd805
make non matching test smarter
andreasjansson Oct 11, 2025
b2c28a3
Import anthropic types for thinking parameter
andreasjansson Oct 11, 2025
83c4d24
Fix thinking parameter types to use proper anthropic types
andreasjansson Oct 11, 2025
c11c7c4
Move ThinkingConfigParam import into type-checking block
andreasjansson Oct 11, 2025
b333221
lint
andreasjansson Oct 11, 2025
c7602dd
Conditionally pass thinking parameter to avoid type issues
andreasjansson Oct 11, 2025
cf7399b
Remove unused imports after refactoring thinking parameter
andreasjansson Oct 11, 2025
e297736
better string matching
andreasjansson Oct 11, 2025
ab1c717
add debugging
andreasjansson Oct 11, 2025
8d93425
thinking in fuzz generation
andreasjansson Oct 11, 2025
5489001
increase max tokens
andreasjansson Oct 11, 2025
85d6b12
Be even more explicit about constraints in fuzz prompt
andreasjansson Oct 11, 2025
f2e50b6
debugging
andreasjansson Oct 11, 2025
9b48ec0
more debugging
andreasjansson Oct 11, 2025
f2930a7
more debugging
andreasjansson Oct 11, 2025
5390bfe
more debugging
andreasjansson Oct 12, 2025
b4de425
remove debugging, fix integration tests
andreasjansson Oct 13, 2025
6a6a126
remove debug
andreasjansson Oct 13, 2025
08b4076
document valid image types in fixture
andreasjansson Oct 13, 2025
027129f
lint
andreasjansson Oct 13, 2025
e8b7c0d
Fix nits
andreasjansson Oct 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 54 additions & 15 deletions cog_safe_push/ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from . import log
from .exceptions import AIError, ArgumentError

MAX_TOKENS = 8192


def async_retry(attempts=3):
def decorator_retry(func):
Expand All @@ -38,13 +40,13 @@ async def wrapper_retry(*args, **kwargs):
async def boolean(
prompt: str, files: list[Path] | None = None, include_file_metadata: bool = False
) -> bool:
system_prompt = "You only answer YES or NO, and absolutely nothing else. Your response will be used in a programmatic context so it's important that you only ever answer with either the string YES or the string NO."
# system_prompt = "You are a helpful assistant"
system_prompt = "You are a boolean classifier. You must only respond with either YES or NO, and absolutely nothing else. Your response will be used in a programmatic context so it is critical that you only ever answer with either the string YES or the string NO."
output = await call(
system_prompt=system_prompt,
prompt=prompt.strip(),
files=files,
include_file_metadata=include_file_metadata,
thinking=True,
)
if output == "YES":
return True
Expand All @@ -54,9 +56,30 @@ async def boolean(


@async_retry(3)
async def json_object(prompt: str, files: list[Path] | None = None) -> dict:
system_prompt = "You always respond with valid JSON, and nothing else (no backticks, etc.). Your outputs will be used in a programmatic context."
output = await call(system_prompt=system_prompt, prompt=prompt.strip(), files=files)
async def json_object(
prompt: str,
files: list[Path] | None = None,
system_prompt: str = "",
thinking: bool = False,
) -> dict:
if system_prompt:
system_prompt = system_prompt.strip() + "\n\n"
system_prompt += "You always respond with valid JSON, and nothing else (no backticks, etc.). Your outputs will be used in a programmatic context."
output = await call(
system_prompt=system_prompt,
prompt=prompt.strip(),
files=files,
thinking=thinking,
)

if output.startswith("```json"):
output = output[7:]
elif output.startswith("```"):
output = output[3:]
if output.endswith("```"):
output = output[:-3]
output = output.strip()

try:
return json.loads(output)
except json.JSONDecodeError:
Expand All @@ -68,12 +91,13 @@ async def call(
prompt: str,
files: list[Path] | None = None,
include_file_metadata: bool = False,
thinking: bool = False,
) -> str:
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
raise ArgumentError("ANTHROPIC_API_KEY is not defined")

model = "claude-sonnet-4-20250514"
model = "claude-sonnet-4-5"
client = anthropic.AsyncAnthropic(api_key=api_key)

try:
Expand All @@ -96,15 +120,30 @@ async def call(
{"role": "user", "content": content}
]

response = await client.messages.create(
model=model,
messages=messages,
system=system_prompt,
max_tokens=4096,
stream=False,
temperature=1.0,
)
content = cast("anthropic.types.TextBlock", response.content[0])
if thinking:
response = await client.messages.create(
model=model,
messages=messages,
system=system_prompt,
max_tokens=MAX_TOKENS,
stream=False,
temperature=1.0,
thinking={"type": "enabled", "budget_tokens": 2048},
)
else:
response = await client.messages.create(
model=model,
messages=messages,
system=system_prompt,
max_tokens=MAX_TOKENS,
stream=False,
temperature=1.0,
)

text_blocks = [block for block in response.content if block.type == "text"]
if not text_blocks:
raise AIError("No text content in response")
content = cast("anthropic.types.TextBlock", text_blocks[0])

finally:
await client.close()
Expand Down
33 changes: 29 additions & 4 deletions cog_safe_push/match_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,18 @@ async def output_matches_prompt(output: Any, prompt: str) -> tuple[bool, str]:
urls = output if isinstance(output, list) else list(output.values())

with download_many(urls) as tmp_files:
claude_prompt = """You are part of an automatic evaluation that compares media (text, audio, image, video, etc.) to captions. I want to know if the caption matches the text or file..
claude_prompt = """You are part of an automatic evaluation that compares media (text, audio, image, video, etc.) to descriptions. I want to know if the description matches the text or file..

"""
if urls:
claude_prompt += f"""Does this file(s) and the attached content of the file(s) match the description? Pay close attention to the metadata about the attached files which is included below, especially if the description mentions file type, image dimensions, or any other aspect that is described in the metadata. Do not infer file type or image dimensions from the image content, but from the attached metadata.

The description may be specific or vague, but you should match on whatever is in the description. For example:
* If the description is 'a jpg image' and it's a jpg image of a cat, that's still a match.
* If the description is 'an image of a cat' and the image is actually of a dog, it's not a match.
* If the description is 'an audio file' it should match any audio files regardless of content.
* etc.

Description to evaluate: {prompt}

Filename(s): {output}"""
Expand Down Expand Up @@ -132,6 +138,9 @@ async def strings_match(s1: str, s2: str, is_deterministic: bool) -> tuple[bool,
f"""
Have these two strings been generated by the same generative AI model inputs/prompt?

* If the two strings are identical, respond with YES
* If the two strings have very similar content, respond with YES

String 1: '{s1}'
String 2: '{s2}'
"""
Expand Down Expand Up @@ -175,11 +184,23 @@ def is_video(url: str) -> bool:


def extensions_match(url1: str, url2: str) -> bool:
    """Return True if the two URLs' path file extensions are equivalent.

    Each URL's path component is parsed and its suffix extracted, then
    normalized via normalize_suffix, which lowercases and collapses
    variant spellings (e.g. ".jpeg" -> ".jpg") — so "a.JPEG" and "b.jpg"
    compare equal.
    """
    ext1 = normalize_suffix(Path(urlparse(url1).path).suffix)
    ext2 = normalize_suffix(Path(urlparse(url2).path).suffix)
    # normalize_suffix already lowercases its result, so the extra
    # .lower() calls from the previous version are redundant.
    return ext1 == ext2


def normalize_suffix(suffix: str) -> str:
    """Collapse equivalent file-extension spellings into one canonical form.

    The suffix is lowercased first; known variants (e.g. ".jpeg", ".jpe")
    are then mapped to their canonical extension (e.g. ".jpg"). A suffix
    with no known variant is returned lowercased but otherwise unchanged.
    """
    canonical = {
        ".jpeg": ".jpg",
        ".jpe": ".jpg",
        ".tiff": ".tif",
        ".mpeg": ".mpg",
        ".htm": ".html",
    }
    lowered = suffix.lower()
    if lowered in canonical:
        return canonical[lowered]
    return lowered


def is_url(s: str) -> bool:
    """Return True when *s* begins with an HTTP or HTTPS scheme prefix."""
    if s.startswith("http://"):
        return True
    return s.startswith("https://")

Expand All @@ -204,7 +225,11 @@ async def images_match(
return True, ""

fuzzy_match = await ai.boolean(
"These two images have been generated by or modified by an AI model. Is it highly likely that those two predictions of the model had the same inputs?",
"""I provide you with _two_ input images. These two images have been generated by or modified by an AI model. Is it highly likely that those two predictions of the model had the same inputs?

* If the two images are identical, respond with YES.
* If the two images have very similar subject matters that have probably been generated by the same prompt, respond with YES.
""",
files=[tmp1, tmp2],
)
if fuzzy_match:
Expand Down
Loading