From b86da4396f8291372f6d9fa1e7def299edc4bc86 Mon Sep 17 00:00:00 2001 From: Preetam Joshi Date: Mon, 13 Oct 2025 15:09:01 -0700 Subject: [PATCH 1/6] Corrected toxicity intepretation based on the new toxicity model behavior --- aimon/reprompting_api/reprompter.py | 6 +++++- aimon/reprompting_api/utils.py | 23 +++++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/aimon/reprompting_api/reprompter.py b/aimon/reprompting_api/reprompter.py index e229621..35ca1da 100644 --- a/aimon/reprompting_api/reprompter.py +++ b/aimon/reprompting_api/reprompter.py @@ -84,6 +84,9 @@ def get_toxicity_reprompt(self, result) -> str: Returns: str: Toxicity-specific feedback, or None if no toxicity detected. + + Note: + For toxicity, lower scores indicate higher toxicity. We invert the score to show confidence. """ try: failed_instructions = get_failed_toxicity_instructions(result) @@ -92,7 +95,8 @@ def get_toxicity_reprompt(self, result) -> str: logger.info(f"Toxicity violations detected: {len(failed_instructions)}") lines = ["Your reply contained toxic content. Remove any harmful, abusive, or unsafe language."] for i, failed_instruction in enumerate(failed_instructions, start=1): - confidence = failed_instruction.get("score", 0.0) * 100 + # For toxicity, lower score = more toxic, so invert to show confidence + confidence = (1.0 - failed_instruction.get("score", 0.0)) * 100 confidence_str = f"{confidence:.2f}%" lines.append( f"{i}. We are {confidence_str} confident that your response had the following issue:\n" diff --git a/aimon/reprompting_api/utils.py b/aimon/reprompting_api/utils.py index 20740fc..f7925f9 100644 --- a/aimon/reprompting_api/utils.py +++ b/aimon/reprompting_api/utils.py @@ -71,12 +71,13 @@ def f_retry(*args, **kwargs): return f_retry return deco_retry -# toxicity threshold for AIMon detection; Follow probabilities above this are considered failures -TOXICITY_THRESHOLD = 0.25 +# toxicity threshold for AIMon detection; Follow probabilities below this are considered failures (lower score = more toxic) +TOXICITY_THRESHOLD = 0.4 def _count_toxicity_failures(result) -> int: """ - Count the number of toxicity instructions whose follow probability exceeds the threshold. + Count the number of toxicity instructions whose follow probability is below the threshold. + Lower scores indicate higher toxicity. Args: result: AIMon detection result containing a `toxicity` section. @@ -87,25 +88,27 @@ def _count_toxicity_failures(result) -> int: return sum( 1 for inst in result.detect_response.toxicity.get("instructions_list", []) - if inst.get("follow_probability", 0.0) > TOXICITY_THRESHOLD + if inst.get("follow_probability", 0.0) < TOXICITY_THRESHOLD ) def toxicity_check(result) -> bool: """ - Check whether any toxicity instructions exceed the threshold. + Check whether any toxicity instructions fall below the threshold. + Lower scores indicate higher toxicity. Args: result: AIMon detection result containing a `toxicity` section. Returns: - bool: True if at least one toxicity instruction exceeds the threshold, False otherwise. + bool: True if at least one toxicity instruction is below the threshold, False otherwise. """ return _count_toxicity_failures(result) > 0 def get_failed_toxicity_instructions(result) -> List[dict]: """ - Extract failed toxicity instructions exceeding the threshold. + Extract failed toxicity instructions below the threshold. + Lower scores indicate higher toxicity. Args: result: AIMon detection result containing a `toxicity` section. 
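# --- Editor's illustrative sketch (not part of the patch series): a minimal,
# standalone rendering of the corrected toxicity convention described above,
# assuming the same instruction dict shape ({"follow_probability": float, ...}).
# Names ending in _sketch are hypothetical; the threshold value mirrors this
# commit (0.4) and is raised to 0.5 later in the series.
TOXICITY_THRESHOLD_SKETCH = 0.4

def count_toxic_sketch(instructions_list):
    # Lower follow_probability means more toxic, so failures are scores BELOW the threshold.
    return sum(
        1 for inst in instructions_list
        if inst.get("follow_probability", 0.0) < TOXICITY_THRESHOLD_SKETCH
    )

def toxicity_confidence_pct_sketch(score):
    # For user-facing feedback the score is inverted, as in get_toxicity_reprompt above.
    return (1.0 - score) * 100

if __name__ == "__main__":
    sample = [{"follow_probability": 0.1}, {"follow_probability": 0.9}]
    assert count_toxic_sketch(sample) == 1                              # only the 0.1 entry is flagged
    assert f"{toxicity_confidence_pct_sketch(0.1):.2f}%" == "90.00%"    # low score -> high confidence of toxicity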
@@ -120,7 +123,7 @@ def get_failed_toxicity_instructions(result) -> List[dict]: """ failed = [] for inst in result.detect_response.toxicity.get("instructions_list", []): - if inst.get("follow_probability", 0.0) > TOXICITY_THRESHOLD: + if inst.get("follow_probability", 0.0) < TOXICITY_THRESHOLD: failed.append({ "type": "toxicity_failure", "source": "toxicity", @@ -188,11 +191,11 @@ def get_residual_error_score(result): Compute a normalized residual error score (0–1) based on: - Groundedness follow probabilities - Instruction adherence follow probabilities - - Toxicity (inverted: 1 - follow_probability) + - Toxicity (inverted: 1 - follow_probability, since lower toxicity scores indicate higher toxicity) Logic: 1. Collect follow probabilities for groundedness & adherence. - 2. For toxicity, use 1 - follow_probability (since high follow = low error). + 2. For toxicity, use 1 - follow_probability (since lower scores = higher toxicity, inverting gives higher error). 3. Compute a penalized average using the helper. 4. Clamp the final score to [0,1]. """ From e3ecb99c9a1b78b38f1548cc847d476b80f78142 Mon Sep 17 00:00:00 2001 From: Preetam Joshi Date: Mon, 13 Oct 2025 15:10:27 -0700 Subject: [PATCH 2/6] Bumping version of the package --- aimon/_version.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aimon/_version.py b/aimon/_version.py index e52c251..02c60c0 100644 --- a/aimon/_version.py +++ b/aimon/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. __title__ = "aimon" -__version__ = "0.12.1" +__version__ = "0.12.2" diff --git a/setup.py b/setup.py index 43af0ff..1d5e681 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ name='aimon', python_requires='>3.8.0', packages=find_packages(), - version="0.12.1", + version="0.12.2", install_requires=[ "annotated-types~=0.6.0", "anyio~=4.9.0", From d0740b5c318d67e7bcdbfcc1d98de8eccc760ad2 Mon Sep 17 00:00:00 2001 From: Preetam Joshi Date: Mon, 13 Oct 2025 15:23:08 -0700 Subject: [PATCH 3/6] Corrected doc string for ttoxicity --- aimon/reprompting_api/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aimon/reprompting_api/utils.py b/aimon/reprompting_api/utils.py index f7925f9..2294941 100644 --- a/aimon/reprompting_api/utils.py +++ b/aimon/reprompting_api/utils.py @@ -11,7 +11,7 @@ - Guide corrective re-prompting logic. Key conventions: -- Toxicity failures are flagged when follow_probability > TOXICITY_THRESHOLD (default 0.25). +- Toxicity failures are flagged when follow_probability < TOXICITY_THRESHOLD (default 0.5). Lower scores indicate higher toxicity. - Residual error scoring penalizes low follow probabilities more heavily and adds a flat penalty for any toxicity failures. 
""" from typing import Callable, Type, Union, Tuple, Optional, List From 07dd1123a3d0427e5dab197f2eb726b37261baf3 Mon Sep 17 00:00:00 2001 From: Preetam Joshi Date: Mon, 13 Oct 2025 15:25:48 -0700 Subject: [PATCH 4/6] Added unit tests for reprompting utils --- tests/test_reprompting_utils.py | 436 ++++++++++++++++++++++++++++++++ 1 file changed, 436 insertions(+) create mode 100644 tests/test_reprompting_utils.py diff --git a/tests/test_reprompting_utils.py b/tests/test_reprompting_utils.py new file mode 100644 index 0000000..85a386c --- /dev/null +++ b/tests/test_reprompting_utils.py @@ -0,0 +1,436 @@ +import pytest +from unittest.mock import MagicMock +from aimon.reprompting_api.utils import ( + _count_toxicity_failures, + toxicity_check, + get_failed_toxicity_instructions, + get_failed_instructions, + get_failed_instructions_count, + get_residual_error_score, + penalized_average, + TOXICITY_THRESHOLD +) + + +class TestToxicityThreshold: + """Test suite for toxicity threshold logic.""" + + def test_toxicity_threshold_value(self): + """Test that TOXICITY_THRESHOLD is set to 0.5.""" + assert TOXICITY_THRESHOLD == 0.5 + + def test_toxic_score_below_threshold_is_flagged(self): + """Test that toxicity scores below 0.5 are flagged as failures.""" + # Create a mock result with toxicity score = 0.3 (toxic) + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "Do not use offensive language", + "follow_probability": 0.3, # Below threshold = toxic + "explanation": "Content contains inappropriate language" + } + ] + } + + # Should be flagged as toxic + assert toxicity_check(result) is True + assert _count_toxicity_failures(result) == 1 + + def test_non_toxic_score_above_threshold_not_flagged(self): + """Test that toxicity scores above 0.5 are NOT flagged as failures.""" + # Create a mock result with toxicity score = 0.7 (not toxic) + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "Do not use offensive language", + "follow_probability": 0.7, # Above threshold = not toxic + "explanation": "Content is appropriate" + } + ] + } + + # Should NOT be flagged as toxic + assert toxicity_check(result) is False + assert _count_toxicity_failures(result) == 0 + + def test_score_exactly_at_threshold(self): + """Test that a score exactly at 0.5 is NOT flagged (boundary test).""" + # Create a mock result with toxicity score = 0.5 (exactly at threshold) + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "Do not use offensive language", + "follow_probability": 0.5, # Exactly at threshold + "explanation": "Content is borderline" + } + ] + } + + # Should NOT be flagged (we use < not <=) + assert toxicity_check(result) is False + assert _count_toxicity_failures(result) == 0 + + def test_multiple_toxic_instructions(self): + """Test counting multiple toxic instructions.""" + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "Do not use offensive language", + "follow_probability": 0.2, # Toxic + "explanation": "Contains offensive terms" + }, + { + "instruction": "Avoid hate speech", + "follow_probability": 0.1, # Very toxic + "explanation": "Contains hate speech" + }, + { + "instruction": "Be respectful", + "follow_probability": 0.8, # Not toxic + "explanation": "Content is respectful" + } + ] + } + + # Should count 2 toxic instructions + assert toxicity_check(result) is True + assert 
_count_toxicity_failures(result) == 2 + + def test_no_toxicity_instructions(self): + """Test behavior when there are no toxicity instructions.""" + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [] + } + + assert toxicity_check(result) is False + assert _count_toxicity_failures(result) == 0 + + def test_get_failed_toxicity_instructions_structure(self): + """Test that get_failed_toxicity_instructions returns correct structure.""" + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "Do not use offensive language", + "follow_probability": 0.3, + "explanation": "Content contains inappropriate language" + }, + { + "instruction": "Avoid hate speech", + "follow_probability": 0.1, + "explanation": "Contains hate speech" + }, + { + "instruction": "Be respectful", + "follow_probability": 0.8, # Not toxic + "explanation": "Content is respectful" + } + ] + } + + failed = get_failed_toxicity_instructions(result) + + # Should return only the 2 failed instructions + assert len(failed) == 2 + + # Check structure of first failed instruction + assert failed[0]["type"] == "toxicity_failure" + assert failed[0]["source"] == "toxicity" + assert failed[0]["instruction"] == "Do not use offensive language" + assert failed[0]["score"] == 0.3 + assert failed[0]["explanation"] == "Content contains inappropriate language" + + # Check second failed instruction + assert failed[1]["instruction"] == "Avoid hate speech" + assert failed[1]["score"] == 0.1 + + +class TestResidualErrorScore: + """Test suite for residual error score calculation.""" + + def test_residual_error_with_toxic_content(self): + """Test residual error calculation with toxic content (low toxicity scores).""" + result = MagicMock() + + # Mock groundedness and instruction_adherence + result.detect_response.groundedness = { + "instructions_list": [ + {"follow_probability": 0.9} + ] + } + result.detect_response.instruction_adherence = { + "instructions_list": [ + {"follow_probability": 0.8} + ] + } + + # Mock toxicity with low score (high toxicity) + result.detect_response.toxicity = { + "instructions_list": [ + {"follow_probability": 0.2} # Low score = high toxicity + ] + } + + score = get_residual_error_score(result) + + # Score should be between 0 and 1 + assert 0 <= score <= 1 + # Should have some error due to toxic content + # The toxicity score gets inverted: 1 - 0.2 = 0.8 (high error) + assert score > 0 + + def test_residual_error_with_non_toxic_content(self): + """Test residual error calculation with non-toxic content (high toxicity scores).""" + result = MagicMock() + + # Mock all detectors with high scores (good content) + result.detect_response.groundedness = { + "instructions_list": [ + {"follow_probability": 0.9} + ] + } + result.detect_response.instruction_adherence = { + "instructions_list": [ + {"follow_probability": 0.95} + ] + } + result.detect_response.toxicity = { + "instructions_list": [ + {"follow_probability": 0.9} # High score = low toxicity + ] + } + + score = get_residual_error_score(result) + + # Score should be low (little error) + assert 0 <= score <= 1 + # Should have minimal error + assert score < 0.3 + + def test_residual_error_toxicity_inversion(self): + """Test that toxicity scores are correctly inverted in residual error calculation.""" + result = MagicMock() + + # Empty groundedness and instruction_adherence + result.detect_response.groundedness = {"instructions_list": []} + result.detect_response.instruction_adherence = 
{"instructions_list": []} + + # Only toxicity score + result.detect_response.toxicity = { + "instructions_list": [ + {"follow_probability": 0.1} # Very toxic (low score) + ] + } + + score = get_residual_error_score(result) + + # Inverted score should be: 1 - 0.1 = 0.9 + # penalized_average with p=0.9 (>= 0.5) gives penalty=0 + # Result should be 0 + assert score == 0.0 + + +class TestPenalizedAverage: + """Test suite for penalized average calculation.""" + + def test_penalized_average_with_high_probabilities(self): + """Test penalized average with probabilities above 0.5.""" + probs = [0.8, 0.9, 0.7] + result = penalized_average(probs) + + # All probs >= 0.5, so no penalty + assert result == 0.0 + + def test_penalized_average_with_low_probabilities(self): + """Test penalized average with probabilities below 0.5.""" + probs = [0.3, 0.2] + result = penalized_average(probs) + + # Penalties: (1-0.3)*2 = 1.4, (1-0.2)*2 = 1.6 + # Average: (1.4 + 1.6) / 2 = 1.5 + assert result == 1.5 + + def test_penalized_average_mixed_probabilities(self): + """Test penalized average with mixed probabilities.""" + probs = [0.8, 0.3] + result = penalized_average(probs) + + # Penalties: 0 (for 0.8), (1-0.3)*2 = 1.4 (for 0.3) + # Average: (0 + 1.4) / 2 = 0.7 + assert result == 0.7 + + def test_penalized_average_empty_list(self): + """Test penalized average with empty list.""" + probs = [] + result = penalized_average(probs) + + # Should return -1 for empty list + assert result == -1 + + +class TestGetFailedInstructions: + """Test suite for get_failed_instructions function.""" + + def test_get_failed_instructions_includes_toxicity(self): + """Test that get_failed_instructions includes toxicity failures from toxicity detector.""" + result = MagicMock() + + # Mock instruction_adherence with one failure + result.detect_response.instruction_adherence = { + "instructions_list": [ + { + "instruction": "Be concise", + "label": False, # Failed + "follow_probability": 0.3, + "explanation": "Response was too verbose" + } + ] + } + + # Mock groundedness with no failures + result.detect_response.groundedness = { + "instructions_list": [] + } + + # Mock toxicity with one failure (handled separately by get_failed_toxicity_instructions) + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "Do not use offensive language", + "follow_probability": 0.2, # Below threshold = toxic + "explanation": "Contains inappropriate language" + } + ] + } + + failed = get_failed_instructions(result) + + # Should return only the instruction_adherence failure + # (toxicity is handled separately by get_failed_toxicity_instructions) + assert len(failed) == 1 + assert failed[0]["type"] == "instruction_adherence_failure" + + def test_get_failed_instructions_count_includes_toxicity(self): + """Test that get_failed_instructions_count includes toxicity failures.""" + result = MagicMock() + + # Mock instruction_adherence with one failure + result.detect_response.instruction_adherence = { + "instructions_list": [ + { + "instruction": "Be concise", + "label": False, + "follow_probability": 0.3, + "explanation": "Too verbose" + } + ] + } + + # Mock groundedness with one failure + result.detect_response.groundedness = { + "instructions_list": [ + { + "instruction": "Base answer on context", + "label": False, + "follow_probability": 0.4, + "explanation": "Not grounded" + } + ] + } + + # Mock toxicity with two failures + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "No offensive language", + 
"follow_probability": 0.2, # Toxic + "explanation": "Offensive" + }, + { + "instruction": "Be respectful", + "follow_probability": 0.3, # Toxic + "explanation": "Disrespectful" + } + ] + } + + count = get_failed_instructions_count(result) + + # Should count all failures: 1 adherence + 1 groundedness + 2 toxicity = 4 + assert count == 4 + + +class TestEdgeCases: + """Test suite for edge cases and boundary conditions.""" + + def test_very_low_toxicity_score(self): + """Test with very low toxicity score (0.0).""" + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "No hate speech", + "follow_probability": 0.0, # Extremely toxic + "explanation": "Contains severe violations" + } + ] + } + + assert toxicity_check(result) is True + failed = get_failed_toxicity_instructions(result) + assert len(failed) == 1 + assert failed[0]["score"] == 0.0 + + def test_very_high_toxicity_score(self): + """Test with very high toxicity score (1.0).""" + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "No hate speech", + "follow_probability": 1.0, # Perfectly clean + "explanation": "No issues found" + } + ] + } + + assert toxicity_check(result) is False + failed = get_failed_toxicity_instructions(result) + assert len(failed) == 0 + + def test_missing_toxicity_field(self): + """Test behavior when toxicity field is missing.""" + result = MagicMock() + result.detect_response.toxicity = {} + + # Should handle missing instructions_list gracefully + assert toxicity_check(result) is False + assert _count_toxicity_failures(result) == 0 + assert get_failed_toxicity_instructions(result) == [] + + def test_missing_follow_probability(self): + """Test behavior when follow_probability is missing.""" + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "No hate speech", + # Missing follow_probability + "explanation": "Test" + } + ] + } + + # Should default to 0.0, which is below threshold (toxic) + assert toxicity_check(result) is True + assert _count_toxicity_failures(result) == 1 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) + From b77ab54163f62a0784989450809d99fb2e3c11da Mon Sep 17 00:00:00 2001 From: Preetam Joshi Date: Mon, 13 Oct 2025 15:35:15 -0700 Subject: [PATCH 5/6] Updated tests and fixed a few bugs --- aimon/reprompting_api/utils.py | 15 +++++++++------ tests/test_reprompting_utils.py | 28 +++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/aimon/reprompting_api/utils.py b/aimon/reprompting_api/utils.py index 2294941..504b642 100644 --- a/aimon/reprompting_api/utils.py +++ b/aimon/reprompting_api/utils.py @@ -72,7 +72,7 @@ def f_retry(*args, **kwargs): return deco_retry # toxicity threshold for AIMon detection; Follow probabilities below this are considered failures (lower score = more toxic) -TOXICITY_THRESHOLD = 0.4 +TOXICITY_THRESHOLD = 0.5 def _count_toxicity_failures(result) -> int: """ @@ -191,13 +191,16 @@ def get_residual_error_score(result): Compute a normalized residual error score (0–1) based on: - Groundedness follow probabilities - Instruction adherence follow probabilities - - Toxicity (inverted: 1 - follow_probability, since lower toxicity scores indicate higher toxicity) + - Toxicity follow probabilities (lower scores indicate higher toxicity) Logic: - 1. Collect follow probabilities for groundedness & adherence. - 2. 
For toxicity, use 1 - follow_probability (since lower scores = higher toxicity, inverting gives higher error). + 1. Collect follow probabilities for groundedness, adherence, and toxicity. + 2. For toxicity, use follow_probability directly (since lower scores = higher toxicity = higher error). 3. Compute a penalized average using the helper. 4. Clamp the final score to [0,1]. + + Note: Unlike groundedness/adherence where high scores are good, toxicity scores are already + in the "error" direction (low score = toxic = bad), so no inversion is needed. """ combined_probs = [] @@ -207,9 +210,9 @@ def get_residual_error_score(result): for item in getattr(result.detect_response, source, {}).get("instructions_list", []) ]) - # For toxicity, invert the follow probability + # For toxicity, use the follow probability directly (lower = more toxic = higher error) combined_probs.extend([ - 1 - item["follow_probability"] + item["follow_probability"] for item in getattr(result.detect_response, "toxicity", {}).get("instructions_list", []) ]) diff --git a/tests/test_reprompting_utils.py b/tests/test_reprompting_utils.py index 85a386c..14713c4 100644 --- a/tests/test_reprompting_utils.py +++ b/tests/test_reprompting_utils.py @@ -212,8 +212,8 @@ def test_residual_error_with_non_toxic_content(self): # Should have minimal error assert score < 0.3 - def test_residual_error_toxicity_inversion(self): - """Test that toxicity scores are correctly inverted in residual error calculation.""" + def test_residual_error_toxicity_no_inversion(self): + """Test that toxicity scores are used directly (not inverted) in residual error calculation.""" result = MagicMock() # Empty groundedness and instruction_adherence @@ -229,7 +229,29 @@ def test_residual_error_toxicity_inversion(self): score = get_residual_error_score(result) - # Inverted score should be: 1 - 0.1 = 0.9 + # Low toxicity score (0.1) means high toxicity, which should give high error + # penalized_average with p=0.1 (< 0.5) gives penalty=(1-0.1)*2=1.8 + # Result should be 1.8 + assert score == 1.8 + + def test_residual_error_high_toxicity_score(self): + """Test that high toxicity scores (non-toxic content) give low error.""" + result = MagicMock() + + # Empty groundedness and instruction_adherence + result.detect_response.groundedness = {"instructions_list": []} + result.detect_response.instruction_adherence = {"instructions_list": []} + + # Only toxicity score + result.detect_response.toxicity = { + "instructions_list": [ + {"follow_probability": 0.9} # Not toxic (high score) + ] + } + + score = get_residual_error_score(result) + + # High toxicity score (0.9) means NOT toxic, which should give low/no error # penalized_average with p=0.9 (>= 0.5) gives penalty=0 # Result should be 0 assert score == 0.0 From 1938353398dec81223c97f59c7384cedfef6f8ad Mon Sep 17 00:00:00 2001 From: Preetam Joshi Date: Mon, 13 Oct 2025 16:46:51 -0700 Subject: [PATCH 6/6] Fixing tests --- tests/test_detect.py | 86 ++++++++++++++++----------------- tests/test_reprompting_utils.py | 4 +- 2 files changed, 43 insertions(+), 47 deletions(-) diff --git a/tests/test_detect.py b/tests/test_detect.py index 8fc8dfc..06d932e 100644 --- a/tests/test_detect.py +++ b/tests/test_detect.py @@ -39,8 +39,8 @@ def log_info(self, title, data): def test_basic_detect_functionality(self, caplog): """Test that the Detect decorator works with basic functionality without raising exceptions.""" - # Create the decorator - config = {'hallucination': {'detector_name': 'default'}} + # Create the decorator (using 
groundedness instead of deprecated hallucination) + config = {'groundedness': {'detector_name': 'default'}} values_returned = ["context", "generated_text", "user_query"] self.log_info("TEST", "Basic detect functionality") @@ -71,11 +71,10 @@ def generate_summary(context, query): self.log_info("OUTPUT_GENERATED_TEXT", generated_text) self.log_info("OUTPUT_STATUS", result.status) - if hasattr(result.detect_response, 'hallucination'): - self.log_info("OUTPUT_HALLUCINATION", { - "is_hallucinated": result.detect_response.hallucination.get("is_hallucinated", ""), - "score": result.detect_response.hallucination.get("score", ""), - "sentences_count": len(result.detect_response.hallucination.get("sentences", [])) + if hasattr(result.detect_response, 'groundedness'): + self.log_info("OUTPUT_GROUNDEDNESS", { + "score": result.detect_response.groundedness.get("score", ""), + "instructions_list": result.detect_response.groundedness.get("instructions_list", []) }) # Verify return values @@ -86,16 +85,14 @@ def generate_summary(context, query): # Verify response structure assert isinstance(result, DetectResult) assert result.status == 200 - assert hasattr(result.detect_response, 'hallucination') - assert "is_hallucinated" in result.detect_response.hallucination - assert "score" in result.detect_response.hallucination - assert "sentences" in result.detect_response.hallucination + assert hasattr(result.detect_response, 'groundedness') + assert "score" in result.detect_response.groundedness def test_detect_with_multiple_detectors(self): """Test the Detect decorator with multiple detectors without raising exceptions.""" - # Create the decorator with multiple detectors + # Create the decorator with multiple detectors (using groundedness instead of deprecated hallucination) config = { - 'hallucination': {'detector_name': 'default'}, + 'groundedness': {'detector_name': 'default'}, 'instruction_adherence': {'detector_name': 'default'}, 'toxicity': {'detector_name': 'default'} } @@ -131,25 +128,25 @@ def generate_response(context, query, instructions): self.log_info("Output - Generated Text", generated_text) self.log_info("Output - Status", result.status) - for detector in ['hallucination', 'instruction_adherence', 'toxicity']: + for detector in ['groundedness', 'instruction_adherence', 'toxicity']: if hasattr(result.detect_response, detector): self.log_info(f"Output - {detector.capitalize()} Response", getattr(result.detect_response, detector)) # Verify response structure - assert hasattr(result.detect_response, 'hallucination') + assert hasattr(result.detect_response, 'groundedness') assert hasattr(result.detect_response, 'instruction_adherence') assert hasattr(result.detect_response, 'toxicity') # Check key fields without verifying values - assert "score" in result.detect_response.hallucination + assert "score" in result.detect_response.groundedness assert "instructions_list" in result.detect_response.instruction_adherence assert "score" in result.detect_response.toxicity def test_detect_with_different_iterables(self): """Test the Detect decorator with different iterable types for values_returned.""" # Create the decorator with a tuple for values_returned - config = {'hallucination': {'detector_name': 'default'}} + config = {'groundedness': {'detector_name': 'default'}} values_returned = ("context", "generated_text") self.log_info("Test", "Detect with different iterables (tuple)") @@ -176,16 +173,16 @@ def simple_function(): self.log_info("Output - Generated Text", generated_text) self.log_info("Output - 
Status", result.status) - if hasattr(result.detect_response, 'hallucination'): - self.log_info("Output - Hallucination Response", - result.detect_response.hallucination) + if hasattr(result.detect_response, 'groundedness'): + self.log_info("Output - Groundedness Response", + result.detect_response.groundedness) # Verify return values and structure assert "Python" in context assert "data science" in generated_text assert isinstance(result, DetectResult) - assert hasattr(result.detect_response, 'hallucination') - assert "score" in result.detect_response.hallucination + assert hasattr(result.detect_response, 'groundedness') + assert "score" in result.detect_response.groundedness def test_detect_with_non_tuple_return(self): """Test the Detect decorator when the wrapped function returns a single value.""" @@ -235,7 +232,7 @@ def test_validate_iterable_values_returned(self): detect_with_list = Detect( values_returned=list_values, api_key=self.api_key, - config={'hallucination': {'detector_name': 'default'}} + config={'groundedness': {'detector_name': 'default'}} ) # Test with a tuple @@ -245,7 +242,7 @@ def test_validate_iterable_values_returned(self): detect_with_tuple = Detect( values_returned=tuple_values, api_key=self.api_key, - config={'hallucination': {'detector_name': 'default'}} + config={'groundedness': {'detector_name': 'default'}} ) # Test with a custom iterable @@ -266,7 +263,7 @@ def __len__(self): detect_with_custom = Detect( values_returned=custom_iterable, api_key=self.api_key, - config={'hallucination': {'detector_name': 'default'}} + config={'groundedness': {'detector_name': 'default'}} ) # If we got here without exceptions, the test passes @@ -380,7 +377,7 @@ def test_missing_required_fields(self): values_returned=["context", "generated_text"], api_key=self.api_key, publish=True, # publish requires application_name and model_name - config={'hallucination': {'detector_name': 'default'}} + config={'groundedness': {'detector_name': 'default'}} ) self.log_info("Error message (publish)", str(exc_info1.value)) @@ -391,7 +388,7 @@ def test_missing_required_fields(self): values_returned=["context", "generated_text"], api_key=self.api_key, async_mode=True, # async_mode requires application_name and model_name - config={'hallucination': {'detector_name': 'default'}} + config={'groundedness': {'detector_name': 'default'}} ) self.log_info("Error message (async_mode)", str(exc_info2.value)) @@ -434,15 +431,15 @@ def generate_text(): assert hasattr(result.detect_response, 'toxicity') assert "score" in result.detect_response.toxicity - def test_hallucination_context_relevance_combination(self): - """Test the Detect decorator with a combination of hallucination and retrieval relevance detectors.""" + def test_groundedness_context_relevance_combination(self): + """Test the Detect decorator with a combination of groundedness and retrieval relevance detectors.""" config = { - 'hallucination': {'detector_name': 'default'}, + 'groundedness': {'detector_name': 'default'}, 'retrieval_relevance': {'detector_name': 'default'} } values_returned = ["context", "generated_text", "user_query", "task_definition"] - self.log_info("Test", "Hallucination and Retrieval Relevance combination") + self.log_info("Test", "Groundedness and Retrieval Relevance combination") self.log_info("Configuration", config) self.log_info("Values returned", values_returned) @@ -469,7 +466,7 @@ def generate_summary(context, query): self.log_info("Output - Generated Text", generated_text) self.log_info("Output - Status", 
result.status) - for detector in ['hallucination', 'retrieval_relevance']: + for detector in ['groundedness', 'retrieval_relevance']: if hasattr(result.detect_response, detector): self.log_info(f"Output - {detector.capitalize()} Response", getattr(result.detect_response, detector)) @@ -477,7 +474,7 @@ def generate_summary(context, query): # Verify response structure assert isinstance(result, DetectResult) assert result.status == 200 - assert hasattr(result.detect_response, 'hallucination') + assert hasattr(result.detect_response, 'groundedness') assert hasattr(result.detect_response, 'retrieval_relevance') def test_instruction_adherence_v1(self): @@ -593,7 +590,7 @@ def generate_with_instructions(context, instructions, query): def test_all_detectors_combination(self): """Test the Detect decorator with all available detectors.""" config = { - 'hallucination': {'detector_name': 'default'}, + 'groundedness': {'detector_name': 'default'}, 'toxicity': {'detector_name': 'default'}, 'instruction_adherence': {'detector_name': 'default'}, 'retrieval_relevance': {'detector_name': 'default'}, @@ -637,7 +634,7 @@ def comprehensive_response(context, query, instructions): self.log_info("Output - Status", result.status) # Log all detector responses - for detector in ['hallucination', 'toxicity', 'instruction_adherence', + for detector in ['groundedness', 'toxicity', 'instruction_adherence', 'retrieval_relevance', 'conciseness', 'completeness']: if hasattr(result.detect_response, detector): self.log_info(f"Output - {detector.capitalize()} Response", @@ -648,7 +645,7 @@ def comprehensive_response(context, query, instructions): assert result.status == 200 # Verify all detectors are present in the response - assert hasattr(result.detect_response, 'hallucination') + assert hasattr(result.detect_response, 'groundedness') assert hasattr(result.detect_response, 'toxicity') assert hasattr(result.detect_response, 'instruction_adherence') assert hasattr(result.detect_response, 'retrieval_relevance') @@ -772,7 +769,7 @@ def test_evaluate_with_new_model(self): # Configure evaluation eval_config = { - 'hallucination': {'detector_name': 'default'}, + 'groundedness': {'detector_name': 'default'}, 'toxicity': {'detector_name': 'default'} } @@ -829,9 +826,9 @@ def test_must_compute_validation(self): """Test that the must_compute parameter is properly validated.""" print("\n=== Testing must_compute validation ===") - # Test config with both hallucination and completeness + # Test config with both groundedness and completeness test_config = { - "hallucination": { + "groundedness": { "detector_name": "default" }, "completeness": { @@ -903,9 +900,9 @@ def test_must_compute_with_actual_service(self): """Test must_compute functionality with actual service calls.""" print("\n=== Testing must_compute with actual service ===") - # Test config with both hallucination and completeness + # Test config with both groundedness and completeness test_config = { - "hallucination": { + "groundedness": { "detector_name": "default" }, "completeness": { @@ -947,10 +944,9 @@ def generate_summary(context, query): print(f"Generated Text: {generated_text}") # Display response details - if hasattr(result.detect_response, 'hallucination'): - hallucination = result.detect_response.hallucination - print(f"Hallucination Score: {hallucination.get('score', 'N/A')}") - print(f"Is Hallucinated: {hallucination.get('is_hallucinated', 'N/A')}") + if hasattr(result.detect_response, 'groundedness'): + groundedness = result.detect_response.groundedness + 
print(f"Groundedness Score: {groundedness.get('score', 'N/A')}") if hasattr(result.detect_response, 'completeness'): completeness = result.detect_response.completeness diff --git a/tests/test_reprompting_utils.py b/tests/test_reprompting_utils.py index 14713c4..bca1582 100644 --- a/tests/test_reprompting_utils.py +++ b/tests/test_reprompting_utils.py @@ -231,8 +231,8 @@ def test_residual_error_toxicity_no_inversion(self): # Low toxicity score (0.1) means high toxicity, which should give high error # penalized_average with p=0.1 (< 0.5) gives penalty=(1-0.1)*2=1.8 - # Result should be 1.8 - assert score == 1.8 + # But get_residual_error_score clamps to max 1.0, so result is 1.0 + assert score == 1.0 def test_residual_error_high_toxicity_score(self): """Test that high toxicity scores (non-toxic content) give low error."""