From b86da4396f8291372f6d9fa1e7def299edc4bc86 Mon Sep 17 00:00:00 2001 From: Preetam Joshi Date: Mon, 13 Oct 2025 15:09:01 -0700 Subject: [PATCH 1/6] Corrected toxicity intepretation based on the new toxicity model behavior --- aimon/reprompting_api/reprompter.py | 6 +++++- aimon/reprompting_api/utils.py | 23 +++++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/aimon/reprompting_api/reprompter.py b/aimon/reprompting_api/reprompter.py index e229621..35ca1da 100644 --- a/aimon/reprompting_api/reprompter.py +++ b/aimon/reprompting_api/reprompter.py @@ -84,6 +84,9 @@ def get_toxicity_reprompt(self, result) -> str: Returns: str: Toxicity-specific feedback, or None if no toxicity detected. + + Note: + For toxicity, lower scores indicate higher toxicity. We invert the score to show confidence. """ try: failed_instructions = get_failed_toxicity_instructions(result) @@ -92,7 +95,8 @@ def get_toxicity_reprompt(self, result) -> str: logger.info(f"Toxicity violations detected: {len(failed_instructions)}") lines = ["Your reply contained toxic content. Remove any harmful, abusive, or unsafe language."] for i, failed_instruction in enumerate(failed_instructions, start=1): - confidence = failed_instruction.get("score", 0.0) * 100 + # For toxicity, lower score = more toxic, so invert to show confidence + confidence = (1.0 - failed_instruction.get("score", 0.0)) * 100 confidence_str = f"{confidence:.2f}%" lines.append( f"{i}. We are {confidence_str} confident that your response had the following issue:\n" diff --git a/aimon/reprompting_api/utils.py b/aimon/reprompting_api/utils.py index 20740fc..f7925f9 100644 --- a/aimon/reprompting_api/utils.py +++ b/aimon/reprompting_api/utils.py @@ -71,12 +71,13 @@ def f_retry(*args, **kwargs): return f_retry return deco_retry -# toxicity threshold for AIMon detection; Follow probabilities above this are considered failures -TOXICITY_THRESHOLD = 0.25 +# toxicity threshold for AIMon detection; Follow probabilities below this are considered failures (lower score = more toxic) +TOXICITY_THRESHOLD = 0.4 def _count_toxicity_failures(result) -> int: """ - Count the number of toxicity instructions whose follow probability exceeds the threshold. + Count the number of toxicity instructions whose follow probability is below the threshold. + Lower scores indicate higher toxicity. Args: result: AIMon detection result containing a `toxicity` section. @@ -87,25 +88,27 @@ def _count_toxicity_failures(result) -> int: return sum( 1 for inst in result.detect_response.toxicity.get("instructions_list", []) - if inst.get("follow_probability", 0.0) > TOXICITY_THRESHOLD + if inst.get("follow_probability", 0.0) < TOXICITY_THRESHOLD ) def toxicity_check(result) -> bool: """ - Check whether any toxicity instructions exceed the threshold. + Check whether any toxicity instructions fall below the threshold. + Lower scores indicate higher toxicity. Args: result: AIMon detection result containing a `toxicity` section. Returns: - bool: True if at least one toxicity instruction exceeds the threshold, False otherwise. + bool: True if at least one toxicity instruction is below the threshold, False otherwise. """ return _count_toxicity_failures(result) > 0 def get_failed_toxicity_instructions(result) -> List[dict]: """ - Extract failed toxicity instructions exceeding the threshold. + Extract failed toxicity instructions below the threshold. + Lower scores indicate higher toxicity. Args: result: AIMon detection result containing a `toxicity` section. 
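# --- Editor's illustrative sketch (not part of the patch series): a minimal,
# standalone rendering of the corrected toxicity convention described above,
# assuming the same instruction dict shape ({"follow_probability": float, ...}).
# Names ending in _sketch are hypothetical; the threshold value mirrors this
# commit (0.4) and is raised to 0.5 later in the series.
TOXICITY_THRESHOLD_SKETCH = 0.4

def count_toxic_sketch(instructions_list):
    # Lower follow_probability means more toxic, so failures are scores BELOW the threshold.
    return sum(
        1 for inst in instructions_list
        if inst.get("follow_probability", 0.0) < TOXICITY_THRESHOLD_SKETCH
    )

def toxicity_confidence_pct_sketch(score):
    # For user-facing feedback the score is inverted, as in get_toxicity_reprompt above.
    return (1.0 - score) * 100

if __name__ == "__main__":
    sample = [{"follow_probability": 0.1}, {"follow_probability": 0.9}]
    assert count_toxic_sketch(sample) == 1                              # only the 0.1 entry is flagged
    assert f"{toxicity_confidence_pct_sketch(0.1):.2f}%" == "90.00%"    # low score -> high confidence of toxicity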
@@ -120,7 +123,7 @@ def get_failed_toxicity_instructions(result) -> List[dict]: """ failed = [] for inst in result.detect_response.toxicity.get("instructions_list", []): - if inst.get("follow_probability", 0.0) > TOXICITY_THRESHOLD: + if inst.get("follow_probability", 0.0) < TOXICITY_THRESHOLD: failed.append({ "type": "toxicity_failure", "source": "toxicity", @@ -188,11 +191,11 @@ def get_residual_error_score(result): Compute a normalized residual error score (0–1) based on: - Groundedness follow probabilities - Instruction adherence follow probabilities - - Toxicity (inverted: 1 - follow_probability) + - Toxicity (inverted: 1 - follow_probability, since lower toxicity scores indicate higher toxicity) Logic: 1. Collect follow probabilities for groundedness & adherence. - 2. For toxicity, use 1 - follow_probability (since high follow = low error). + 2. For toxicity, use 1 - follow_probability (since lower scores = higher toxicity, inverting gives higher error). 3. Compute a penalized average using the helper. 4. Clamp the final score to [0,1]. """ From e3ecb99c9a1b78b38f1548cc847d476b80f78142 Mon Sep 17 00:00:00 2001 From: Preetam Joshi Date: Mon, 13 Oct 2025 15:10:27 -0700 Subject: [PATCH 2/6] Bumping version of the package --- aimon/_version.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aimon/_version.py b/aimon/_version.py index e52c251..02c60c0 100644 --- a/aimon/_version.py +++ b/aimon/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. __title__ = "aimon" -__version__ = "0.12.1" +__version__ = "0.12.2" diff --git a/setup.py b/setup.py index 43af0ff..1d5e681 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ name='aimon', python_requires='>3.8.0', packages=find_packages(), - version="0.12.1", + version="0.12.2", install_requires=[ "annotated-types~=0.6.0", "anyio~=4.9.0", From d0740b5c318d67e7bcdbfcc1d98de8eccc760ad2 Mon Sep 17 00:00:00 2001 From: Preetam Joshi Date: Mon, 13 Oct 2025 15:23:08 -0700 Subject: [PATCH 3/6] Corrected doc string for ttoxicity --- aimon/reprompting_api/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aimon/reprompting_api/utils.py b/aimon/reprompting_api/utils.py index f7925f9..2294941 100644 --- a/aimon/reprompting_api/utils.py +++ b/aimon/reprompting_api/utils.py @@ -11,7 +11,7 @@ - Guide corrective re-prompting logic. Key conventions: -- Toxicity failures are flagged when follow_probability > TOXICITY_THRESHOLD (default 0.25). +- Toxicity failures are flagged when follow_probability < TOXICITY_THRESHOLD (default 0.5). Lower scores indicate higher toxicity. - Residual error scoring penalizes low follow probabilities more heavily and adds a flat penalty for any toxicity failures. 
""" from typing import Callable, Type, Union, Tuple, Optional, List From 07dd1123a3d0427e5dab197f2eb726b37261baf3 Mon Sep 17 00:00:00 2001 From: Preetam Joshi Date: Mon, 13 Oct 2025 15:25:48 -0700 Subject: [PATCH 4/6] Added unit tests for reprompting utils --- tests/test_reprompting_utils.py | 436 ++++++++++++++++++++++++++++++++ 1 file changed, 436 insertions(+) create mode 100644 tests/test_reprompting_utils.py diff --git a/tests/test_reprompting_utils.py b/tests/test_reprompting_utils.py new file mode 100644 index 0000000..85a386c --- /dev/null +++ b/tests/test_reprompting_utils.py @@ -0,0 +1,436 @@ +import pytest +from unittest.mock import MagicMock +from aimon.reprompting_api.utils import ( + _count_toxicity_failures, + toxicity_check, + get_failed_toxicity_instructions, + get_failed_instructions, + get_failed_instructions_count, + get_residual_error_score, + penalized_average, + TOXICITY_THRESHOLD +) + + +class TestToxicityThreshold: + """Test suite for toxicity threshold logic.""" + + def test_toxicity_threshold_value(self): + """Test that TOXICITY_THRESHOLD is set to 0.5.""" + assert TOXICITY_THRESHOLD == 0.5 + + def test_toxic_score_below_threshold_is_flagged(self): + """Test that toxicity scores below 0.5 are flagged as failures.""" + # Create a mock result with toxicity score = 0.3 (toxic) + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "Do not use offensive language", + "follow_probability": 0.3, # Below threshold = toxic + "explanation": "Content contains inappropriate language" + } + ] + } + + # Should be flagged as toxic + assert toxicity_check(result) is True + assert _count_toxicity_failures(result) == 1 + + def test_non_toxic_score_above_threshold_not_flagged(self): + """Test that toxicity scores above 0.5 are NOT flagged as failures.""" + # Create a mock result with toxicity score = 0.7 (not toxic) + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "Do not use offensive language", + "follow_probability": 0.7, # Above threshold = not toxic + "explanation": "Content is appropriate" + } + ] + } + + # Should NOT be flagged as toxic + assert toxicity_check(result) is False + assert _count_toxicity_failures(result) == 0 + + def test_score_exactly_at_threshold(self): + """Test that a score exactly at 0.5 is NOT flagged (boundary test).""" + # Create a mock result with toxicity score = 0.5 (exactly at threshold) + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "Do not use offensive language", + "follow_probability": 0.5, # Exactly at threshold + "explanation": "Content is borderline" + } + ] + } + + # Should NOT be flagged (we use < not <=) + assert toxicity_check(result) is False + assert _count_toxicity_failures(result) == 0 + + def test_multiple_toxic_instructions(self): + """Test counting multiple toxic instructions.""" + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "Do not use offensive language", + "follow_probability": 0.2, # Toxic + "explanation": "Contains offensive terms" + }, + { + "instruction": "Avoid hate speech", + "follow_probability": 0.1, # Very toxic + "explanation": "Contains hate speech" + }, + { + "instruction": "Be respectful", + "follow_probability": 0.8, # Not toxic + "explanation": "Content is respectful" + } + ] + } + + # Should count 2 toxic instructions + assert toxicity_check(result) is True + assert 
_count_toxicity_failures(result) == 2 + + def test_no_toxicity_instructions(self): + """Test behavior when there are no toxicity instructions.""" + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [] + } + + assert toxicity_check(result) is False + assert _count_toxicity_failures(result) == 0 + + def test_get_failed_toxicity_instructions_structure(self): + """Test that get_failed_toxicity_instructions returns correct structure.""" + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "Do not use offensive language", + "follow_probability": 0.3, + "explanation": "Content contains inappropriate language" + }, + { + "instruction": "Avoid hate speech", + "follow_probability": 0.1, + "explanation": "Contains hate speech" + }, + { + "instruction": "Be respectful", + "follow_probability": 0.8, # Not toxic + "explanation": "Content is respectful" + } + ] + } + + failed = get_failed_toxicity_instructions(result) + + # Should return only the 2 failed instructions + assert len(failed) == 2 + + # Check structure of first failed instruction + assert failed[0]["type"] == "toxicity_failure" + assert failed[0]["source"] == "toxicity" + assert failed[0]["instruction"] == "Do not use offensive language" + assert failed[0]["score"] == 0.3 + assert failed[0]["explanation"] == "Content contains inappropriate language" + + # Check second failed instruction + assert failed[1]["instruction"] == "Avoid hate speech" + assert failed[1]["score"] == 0.1 + + +class TestResidualErrorScore: + """Test suite for residual error score calculation.""" + + def test_residual_error_with_toxic_content(self): + """Test residual error calculation with toxic content (low toxicity scores).""" + result = MagicMock() + + # Mock groundedness and instruction_adherence + result.detect_response.groundedness = { + "instructions_list": [ + {"follow_probability": 0.9} + ] + } + result.detect_response.instruction_adherence = { + "instructions_list": [ + {"follow_probability": 0.8} + ] + } + + # Mock toxicity with low score (high toxicity) + result.detect_response.toxicity = { + "instructions_list": [ + {"follow_probability": 0.2} # Low score = high toxicity + ] + } + + score = get_residual_error_score(result) + + # Score should be between 0 and 1 + assert 0 <= score <= 1 + # Should have some error due to toxic content + # The toxicity score gets inverted: 1 - 0.2 = 0.8 (high error) + assert score > 0 + + def test_residual_error_with_non_toxic_content(self): + """Test residual error calculation with non-toxic content (high toxicity scores).""" + result = MagicMock() + + # Mock all detectors with high scores (good content) + result.detect_response.groundedness = { + "instructions_list": [ + {"follow_probability": 0.9} + ] + } + result.detect_response.instruction_adherence = { + "instructions_list": [ + {"follow_probability": 0.95} + ] + } + result.detect_response.toxicity = { + "instructions_list": [ + {"follow_probability": 0.9} # High score = low toxicity + ] + } + + score = get_residual_error_score(result) + + # Score should be low (little error) + assert 0 <= score <= 1 + # Should have minimal error + assert score < 0.3 + + def test_residual_error_toxicity_inversion(self): + """Test that toxicity scores are correctly inverted in residual error calculation.""" + result = MagicMock() + + # Empty groundedness and instruction_adherence + result.detect_response.groundedness = {"instructions_list": []} + result.detect_response.instruction_adherence = 
{"instructions_list": []} + + # Only toxicity score + result.detect_response.toxicity = { + "instructions_list": [ + {"follow_probability": 0.1} # Very toxic (low score) + ] + } + + score = get_residual_error_score(result) + + # Inverted score should be: 1 - 0.1 = 0.9 + # penalized_average with p=0.9 (>= 0.5) gives penalty=0 + # Result should be 0 + assert score == 0.0 + + +class TestPenalizedAverage: + """Test suite for penalized average calculation.""" + + def test_penalized_average_with_high_probabilities(self): + """Test penalized average with probabilities above 0.5.""" + probs = [0.8, 0.9, 0.7] + result = penalized_average(probs) + + # All probs >= 0.5, so no penalty + assert result == 0.0 + + def test_penalized_average_with_low_probabilities(self): + """Test penalized average with probabilities below 0.5.""" + probs = [0.3, 0.2] + result = penalized_average(probs) + + # Penalties: (1-0.3)*2 = 1.4, (1-0.2)*2 = 1.6 + # Average: (1.4 + 1.6) / 2 = 1.5 + assert result == 1.5 + + def test_penalized_average_mixed_probabilities(self): + """Test penalized average with mixed probabilities.""" + probs = [0.8, 0.3] + result = penalized_average(probs) + + # Penalties: 0 (for 0.8), (1-0.3)*2 = 1.4 (for 0.3) + # Average: (0 + 1.4) / 2 = 0.7 + assert result == 0.7 + + def test_penalized_average_empty_list(self): + """Test penalized average with empty list.""" + probs = [] + result = penalized_average(probs) + + # Should return -1 for empty list + assert result == -1 + + +class TestGetFailedInstructions: + """Test suite for get_failed_instructions function.""" + + def test_get_failed_instructions_includes_toxicity(self): + """Test that get_failed_instructions includes toxicity failures from toxicity detector.""" + result = MagicMock() + + # Mock instruction_adherence with one failure + result.detect_response.instruction_adherence = { + "instructions_list": [ + { + "instruction": "Be concise", + "label": False, # Failed + "follow_probability": 0.3, + "explanation": "Response was too verbose" + } + ] + } + + # Mock groundedness with no failures + result.detect_response.groundedness = { + "instructions_list": [] + } + + # Mock toxicity with one failure (handled separately by get_failed_toxicity_instructions) + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "Do not use offensive language", + "follow_probability": 0.2, # Below threshold = toxic + "explanation": "Contains inappropriate language" + } + ] + } + + failed = get_failed_instructions(result) + + # Should return only the instruction_adherence failure + # (toxicity is handled separately by get_failed_toxicity_instructions) + assert len(failed) == 1 + assert failed[0]["type"] == "instruction_adherence_failure" + + def test_get_failed_instructions_count_includes_toxicity(self): + """Test that get_failed_instructions_count includes toxicity failures.""" + result = MagicMock() + + # Mock instruction_adherence with one failure + result.detect_response.instruction_adherence = { + "instructions_list": [ + { + "instruction": "Be concise", + "label": False, + "follow_probability": 0.3, + "explanation": "Too verbose" + } + ] + } + + # Mock groundedness with one failure + result.detect_response.groundedness = { + "instructions_list": [ + { + "instruction": "Base answer on context", + "label": False, + "follow_probability": 0.4, + "explanation": "Not grounded" + } + ] + } + + # Mock toxicity with two failures + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "No offensive language", + 
"follow_probability": 0.2, # Toxic + "explanation": "Offensive" + }, + { + "instruction": "Be respectful", + "follow_probability": 0.3, # Toxic + "explanation": "Disrespectful" + } + ] + } + + count = get_failed_instructions_count(result) + + # Should count all failures: 1 adherence + 1 groundedness + 2 toxicity = 4 + assert count == 4 + + +class TestEdgeCases: + """Test suite for edge cases and boundary conditions.""" + + def test_very_low_toxicity_score(self): + """Test with very low toxicity score (0.0).""" + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "No hate speech", + "follow_probability": 0.0, # Extremely toxic + "explanation": "Contains severe violations" + } + ] + } + + assert toxicity_check(result) is True + failed = get_failed_toxicity_instructions(result) + assert len(failed) == 1 + assert failed[0]["score"] == 0.0 + + def test_very_high_toxicity_score(self): + """Test with very high toxicity score (1.0).""" + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "No hate speech", + "follow_probability": 1.0, # Perfectly clean + "explanation": "No issues found" + } + ] + } + + assert toxicity_check(result) is False + failed = get_failed_toxicity_instructions(result) + assert len(failed) == 0 + + def test_missing_toxicity_field(self): + """Test behavior when toxicity field is missing.""" + result = MagicMock() + result.detect_response.toxicity = {} + + # Should handle missing instructions_list gracefully + assert toxicity_check(result) is False + assert _count_toxicity_failures(result) == 0 + assert get_failed_toxicity_instructions(result) == [] + + def test_missing_follow_probability(self): + """Test behavior when follow_probability is missing.""" + result = MagicMock() + result.detect_response.toxicity = { + "instructions_list": [ + { + "instruction": "No hate speech", + # Missing follow_probability + "explanation": "Test" + } + ] + } + + # Should default to 0.0, which is below threshold (toxic) + assert toxicity_check(result) is True + assert _count_toxicity_failures(result) == 1 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) + From b77ab54163f62a0784989450809d99fb2e3c11da Mon Sep 17 00:00:00 2001 From: Preetam Joshi Date: Mon, 13 Oct 2025 15:35:15 -0700 Subject: [PATCH 5/6] Updated tests and fixed a few bugs --- aimon/reprompting_api/utils.py | 15 +++++++++------ tests/test_reprompting_utils.py | 28 +++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/aimon/reprompting_api/utils.py b/aimon/reprompting_api/utils.py index 2294941..504b642 100644 --- a/aimon/reprompting_api/utils.py +++ b/aimon/reprompting_api/utils.py @@ -72,7 +72,7 @@ def f_retry(*args, **kwargs): return deco_retry # toxicity threshold for AIMon detection; Follow probabilities below this are considered failures (lower score = more toxic) -TOXICITY_THRESHOLD = 0.4 +TOXICITY_THRESHOLD = 0.5 def _count_toxicity_failures(result) -> int: """ @@ -191,13 +191,16 @@ def get_residual_error_score(result): Compute a normalized residual error score (0–1) based on: - Groundedness follow probabilities - Instruction adherence follow probabilities - - Toxicity (inverted: 1 - follow_probability, since lower toxicity scores indicate higher toxicity) + - Toxicity follow probabilities (lower scores indicate higher toxicity) Logic: - 1. Collect follow probabilities for groundedness & adherence. - 2. 
For toxicity, use 1 - follow_probability (since lower scores = higher toxicity, inverting gives higher error). + 1. Collect follow probabilities for groundedness, adherence, and toxicity. + 2. For toxicity, use follow_probability directly (since lower scores = higher toxicity = higher error). 3. Compute a penalized average using the helper. 4. Clamp the final score to [0,1]. + + Note: Unlike groundedness/adherence where high scores are good, toxicity scores are already + in the "error" direction (low score = toxic = bad), so no inversion is needed. """ combined_probs = [] @@ -207,9 +210,9 @@ def get_residual_error_score(result): for item in getattr(result.detect_response, source, {}).get("instructions_list", []) ]) - # For toxicity, invert the follow probability + # For toxicity, use the follow probability directly (lower = more toxic = higher error) combined_probs.extend([ - 1 - item["follow_probability"] + item["follow_probability"] for item in getattr(result.detect_response, "toxicity", {}).get("instructions_list", []) ]) diff --git a/tests/test_reprompting_utils.py b/tests/test_reprompting_utils.py index 85a386c..14713c4 100644 --- a/tests/test_reprompting_utils.py +++ b/tests/test_reprompting_utils.py @@ -212,8 +212,8 @@ def test_residual_error_with_non_toxic_content(self): # Should have minimal error assert score < 0.3 - def test_residual_error_toxicity_inversion(self): - """Test that toxicity scores are correctly inverted in residual error calculation.""" + def test_residual_error_toxicity_no_inversion(self): + """Test that toxicity scores are used directly (not inverted) in residual error calculation.""" result = MagicMock() # Empty groundedness and instruction_adherence @@ -229,7 +229,29 @@ def test_residual_error_toxicity_inversion(self): score = get_residual_error_score(result) - # Inverted score should be: 1 - 0.1 = 0.9 + # Low toxicity score (0.1) means high toxicity, which should give high error + # penalized_average with p=0.1 (< 0.5) gives penalty=(1-0.1)*2=1.8 + # Result should be 1.8 + assert score == 1.8 + + def test_residual_error_high_toxicity_score(self): + """Test that high toxicity scores (non-toxic content) give low error.""" + result = MagicMock() + + # Empty groundedness and instruction_adherence + result.detect_response.groundedness = {"instructions_list": []} + result.detect_response.instruction_adherence = {"instructions_list": []} + + # Only toxicity score + result.detect_response.toxicity = { + "instructions_list": [ + {"follow_probability": 0.9} # Not toxic (high score) + ] + } + + score = get_residual_error_score(result) + + # High toxicity score (0.9) means NOT toxic, which should give low/no error # penalized_average with p=0.9 (>= 0.5) gives penalty=0 # Result should be 0 assert score == 0.0 From 1938353398dec81223c97f59c7384cedfef6f8ad Mon Sep 17 00:00:00 2001 From: Preetam Joshi Date: Mon, 13 Oct 2025 16:46:51 -0700 Subject: [PATCH 6/6] Fixing tests --- tests/test_detect.py | 86 ++++++++++++++++----------------- tests/test_reprompting_utils.py | 4 +- 2 files changed, 43 insertions(+), 47 deletions(-) diff --git a/tests/test_detect.py b/tests/test_detect.py index 8fc8dfc..06d932e 100644 --- a/tests/test_detect.py +++ b/tests/test_detect.py @@ -39,8 +39,8 @@ def log_info(self, title, data): def test_basic_detect_functionality(self, caplog): """Test that the Detect decorator works with basic functionality without raising exceptions.""" - # Create the decorator - config = {'hallucination': {'detector_name': 'default'}} + # Create the decorator (using 
groundedness instead of deprecated hallucination) + config = {'groundedness': {'detector_name': 'default'}} values_returned = ["context", "generated_text", "user_query"] self.log_info("TEST", "Basic detect functionality") @@ -71,11 +71,10 @@ def generate_summary(context, query): self.log_info("OUTPUT_GENERATED_TEXT", generated_text) self.log_info("OUTPUT_STATUS", result.status) - if hasattr(result.detect_response, 'hallucination'): - self.log_info("OUTPUT_HALLUCINATION", { - "is_hallucinated": result.detect_response.hallucination.get("is_hallucinated", ""), - "score": result.detect_response.hallucination.get("score", ""), - "sentences_count": len(result.detect_response.hallucination.get("sentences", [])) + if hasattr(result.detect_response, 'groundedness'): + self.log_info("OUTPUT_GROUNDEDNESS", { + "score": result.detect_response.groundedness.get("score", ""), + "instructions_list": result.detect_response.groundedness.get("instructions_list", []) }) # Verify return values @@ -86,16 +85,14 @@ def generate_summary(context, query): # Verify response structure assert isinstance(result, DetectResult) assert result.status == 200 - assert hasattr(result.detect_response, 'hallucination') - assert "is_hallucinated" in result.detect_response.hallucination - assert "score" in result.detect_response.hallucination - assert "sentences" in result.detect_response.hallucination + assert hasattr(result.detect_response, 'groundedness') + assert "score" in result.detect_response.groundedness def test_detect_with_multiple_detectors(self): """Test the Detect decorator with multiple detectors without raising exceptions.""" - # Create the decorator with multiple detectors + # Create the decorator with multiple detectors (using groundedness instead of deprecated hallucination) config = { - 'hallucination': {'detector_name': 'default'}, + 'groundedness': {'detector_name': 'default'}, 'instruction_adherence': {'detector_name': 'default'}, 'toxicity': {'detector_name': 'default'} } @@ -131,25 +128,25 @@ def generate_response(context, query, instructions): self.log_info("Output - Generated Text", generated_text) self.log_info("Output - Status", result.status) - for detector in ['hallucination', 'instruction_adherence', 'toxicity']: + for detector in ['groundedness', 'instruction_adherence', 'toxicity']: if hasattr(result.detect_response, detector): self.log_info(f"Output - {detector.capitalize()} Response", getattr(result.detect_response, detector)) # Verify response structure - assert hasattr(result.detect_response, 'hallucination') + assert hasattr(result.detect_response, 'groundedness') assert hasattr(result.detect_response, 'instruction_adherence') assert hasattr(result.detect_response, 'toxicity') # Check key fields without verifying values - assert "score" in result.detect_response.hallucination + assert "score" in result.detect_response.groundedness assert "instructions_list" in result.detect_response.instruction_adherence assert "score" in result.detect_response.toxicity def test_detect_with_different_iterables(self): """Test the Detect decorator with different iterable types for values_returned.""" # Create the decorator with a tuple for values_returned - config = {'hallucination': {'detector_name': 'default'}} + config = {'groundedness': {'detector_name': 'default'}} values_returned = ("context", "generated_text") self.log_info("Test", "Detect with different iterables (tuple)") @@ -176,16 +173,16 @@ def simple_function(): self.log_info("Output - Generated Text", generated_text) self.log_info("Output - 
Status", result.status) - if hasattr(result.detect_response, 'hallucination'): - self.log_info("Output - Hallucination Response", - result.detect_response.hallucination) + if hasattr(result.detect_response, 'groundedness'): + self.log_info("Output - Groundedness Response", + result.detect_response.groundedness) # Verify return values and structure assert "Python" in context assert "data science" in generated_text assert isinstance(result, DetectResult) - assert hasattr(result.detect_response, 'hallucination') - assert "score" in result.detect_response.hallucination + assert hasattr(result.detect_response, 'groundedness') + assert "score" in result.detect_response.groundedness def test_detect_with_non_tuple_return(self): """Test the Detect decorator when the wrapped function returns a single value.""" @@ -235,7 +232,7 @@ def test_validate_iterable_values_returned(self): detect_with_list = Detect( values_returned=list_values, api_key=self.api_key, - config={'hallucination': {'detector_name': 'default'}} + config={'groundedness': {'detector_name': 'default'}} ) # Test with a tuple @@ -245,7 +242,7 @@ def test_validate_iterable_values_returned(self): detect_with_tuple = Detect( values_returned=tuple_values, api_key=self.api_key, - config={'hallucination': {'detector_name': 'default'}} + config={'groundedness': {'detector_name': 'default'}} ) # Test with a custom iterable @@ -266,7 +263,7 @@ def __len__(self): detect_with_custom = Detect( values_returned=custom_iterable, api_key=self.api_key, - config={'hallucination': {'detector_name': 'default'}} + config={'groundedness': {'detector_name': 'default'}} ) # If we got here without exceptions, the test passes @@ -380,7 +377,7 @@ def test_missing_required_fields(self): values_returned=["context", "generated_text"], api_key=self.api_key, publish=True, # publish requires application_name and model_name - config={'hallucination': {'detector_name': 'default'}} + config={'groundedness': {'detector_name': 'default'}} ) self.log_info("Error message (publish)", str(exc_info1.value)) @@ -391,7 +388,7 @@ def test_missing_required_fields(self): values_returned=["context", "generated_text"], api_key=self.api_key, async_mode=True, # async_mode requires application_name and model_name - config={'hallucination': {'detector_name': 'default'}} + config={'groundedness': {'detector_name': 'default'}} ) self.log_info("Error message (async_mode)", str(exc_info2.value)) @@ -434,15 +431,15 @@ def generate_text(): assert hasattr(result.detect_response, 'toxicity') assert "score" in result.detect_response.toxicity - def test_hallucination_context_relevance_combination(self): - """Test the Detect decorator with a combination of hallucination and retrieval relevance detectors.""" + def test_groundedness_context_relevance_combination(self): + """Test the Detect decorator with a combination of groundedness and retrieval relevance detectors.""" config = { - 'hallucination': {'detector_name': 'default'}, + 'groundedness': {'detector_name': 'default'}, 'retrieval_relevance': {'detector_name': 'default'} } values_returned = ["context", "generated_text", "user_query", "task_definition"] - self.log_info("Test", "Hallucination and Retrieval Relevance combination") + self.log_info("Test", "Groundedness and Retrieval Relevance combination") self.log_info("Configuration", config) self.log_info("Values returned", values_returned) @@ -469,7 +466,7 @@ def generate_summary(context, query): self.log_info("Output - Generated Text", generated_text) self.log_info("Output - Status", 
result.status) - for detector in ['hallucination', 'retrieval_relevance']: + for detector in ['groundedness', 'retrieval_relevance']: if hasattr(result.detect_response, detector): self.log_info(f"Output - {detector.capitalize()} Response", getattr(result.detect_response, detector)) @@ -477,7 +474,7 @@ def generate_summary(context, query): # Verify response structure assert isinstance(result, DetectResult) assert result.status == 200 - assert hasattr(result.detect_response, 'hallucination') + assert hasattr(result.detect_response, 'groundedness') assert hasattr(result.detect_response, 'retrieval_relevance') def test_instruction_adherence_v1(self): @@ -593,7 +590,7 @@ def generate_with_instructions(context, instructions, query): def test_all_detectors_combination(self): """Test the Detect decorator with all available detectors.""" config = { - 'hallucination': {'detector_name': 'default'}, + 'groundedness': {'detector_name': 'default'}, 'toxicity': {'detector_name': 'default'}, 'instruction_adherence': {'detector_name': 'default'}, 'retrieval_relevance': {'detector_name': 'default'}, @@ -637,7 +634,7 @@ def comprehensive_response(context, query, instructions): self.log_info("Output - Status", result.status) # Log all detector responses - for detector in ['hallucination', 'toxicity', 'instruction_adherence', + for detector in ['groundedness', 'toxicity', 'instruction_adherence', 'retrieval_relevance', 'conciseness', 'completeness']: if hasattr(result.detect_response, detector): self.log_info(f"Output - {detector.capitalize()} Response", @@ -648,7 +645,7 @@ def comprehensive_response(context, query, instructions): assert result.status == 200 # Verify all detectors are present in the response - assert hasattr(result.detect_response, 'hallucination') + assert hasattr(result.detect_response, 'groundedness') assert hasattr(result.detect_response, 'toxicity') assert hasattr(result.detect_response, 'instruction_adherence') assert hasattr(result.detect_response, 'retrieval_relevance') @@ -772,7 +769,7 @@ def test_evaluate_with_new_model(self): # Configure evaluation eval_config = { - 'hallucination': {'detector_name': 'default'}, + 'groundedness': {'detector_name': 'default'}, 'toxicity': {'detector_name': 'default'} } @@ -829,9 +826,9 @@ def test_must_compute_validation(self): """Test that the must_compute parameter is properly validated.""" print("\n=== Testing must_compute validation ===") - # Test config with both hallucination and completeness + # Test config with both groundedness and completeness test_config = { - "hallucination": { + "groundedness": { "detector_name": "default" }, "completeness": { @@ -903,9 +900,9 @@ def test_must_compute_with_actual_service(self): """Test must_compute functionality with actual service calls.""" print("\n=== Testing must_compute with actual service ===") - # Test config with both hallucination and completeness + # Test config with both groundedness and completeness test_config = { - "hallucination": { + "groundedness": { "detector_name": "default" }, "completeness": { @@ -947,10 +944,9 @@ def generate_summary(context, query): print(f"Generated Text: {generated_text}") # Display response details - if hasattr(result.detect_response, 'hallucination'): - hallucination = result.detect_response.hallucination - print(f"Hallucination Score: {hallucination.get('score', 'N/A')}") - print(f"Is Hallucinated: {hallucination.get('is_hallucinated', 'N/A')}") + if hasattr(result.detect_response, 'groundedness'): + groundedness = result.detect_response.groundedness + 
print(f"Groundedness Score: {groundedness.get('score', 'N/A')}") if hasattr(result.detect_response, 'completeness'): completeness = result.detect_response.completeness diff --git a/tests/test_reprompting_utils.py b/tests/test_reprompting_utils.py index 14713c4..bca1582 100644 --- a/tests/test_reprompting_utils.py +++ b/tests/test_reprompting_utils.py @@ -231,8 +231,8 @@ def test_residual_error_toxicity_no_inversion(self): # Low toxicity score (0.1) means high toxicity, which should give high error # penalized_average with p=0.1 (< 0.5) gives penalty=(1-0.1)*2=1.8 - # Result should be 1.8 - assert score == 1.8 + # But get_residual_error_score clamps to max 1.0, so result is 1.0 + assert score == 1.0 def test_residual_error_high_toxicity_score(self): """Test that high toxicity scores (non-toxic content) give low error."""