80 changes: 77 additions & 3 deletions lib/crewai/src/crewai/utilities/evaluators/task_evaluator.py
@@ -1,8 +1,8 @@
from __future__ import annotations

-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Any, cast

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator

from crewai.events.event_bus import crewai_event_bus
from crewai.events.types.task_events import TaskEvaluationEvent
@@ -25,16 +25,90 @@ class Entity(BaseModel):


class TaskEvaluation(BaseModel):
model_config = ConfigDict(extra="ignore")

suggestions: list[str] = Field(
default_factory=list,
description="Suggestions to improve future similar tasks."
)
-    quality: float = Field(
+    quality: float | None = Field(
+        default=None,
description="A score from 0 to 10 evaluating on completion, quality, and overall performance, all taking into account the task description, expected output, and the result of the task."
)
entities: list[Entity] = Field(
default_factory=list,
description="Entities extracted from the task output."
)

@model_validator(mode="before")
@classmethod
def map_score_to_quality(cls, data: Any) -> Any:
"""Map 'score' field to 'quality' if quality is missing."""
if isinstance(data, dict):
if "quality" not in data and "score" in data:
data["quality"] = data["score"]
return data

@field_validator("suggestions", mode="before")
@classmethod
def normalize_suggestions(cls, v: Any) -> list[str]:
"""Normalize suggestions from various formats to list[str].

Handles:
- None → []
- str → [str]
- dict with "point" → [point]
- dict without "point" → [str(dict)]
- list → flatten using same rules per item
"""
if v is None:
return []

if isinstance(v, str):
return [v]

if isinstance(v, dict):
if "point" in v:
return [str(v["point"])]
return [str(v)]

if isinstance(v, list):
result = []
for item in v:
if isinstance(item, str):
result.append(item)
elif isinstance(item, dict):
if "point" in item:
result.append(str(item["point"]))
else:
result.append(str(item))
else:
result.append(str(item))
return result

return [str(v)]

@field_validator("quality", mode="before")
@classmethod
def coerce_quality(cls, v: Any) -> float | None:
"""Coerce quality to float, accepting int and numeric strings.

Returns None if value is None, empty string, or cannot be parsed.
"""
if v is None or v == "":
return None

if isinstance(v, (int, float)):
return float(v)

if isinstance(v, str):
try:
return float(v)
except ValueError:
return None

return None


class TrainingTaskEvaluation(BaseModel):
suggestions: list[str] = Field(
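Taken together, extra="ignore", the score-to-quality mapping, and the two before-mode validators let TaskEvaluation absorb the malformed payload shapes from issue #3915 instead of raising ValidationError. A minimal sketch of the resulting behavior (the payload values below are illustrative, not taken from a real run):

    from crewai.utilities.evaluators.task_evaluator import TaskEvaluation

    raw = {
        "score": "7",  # numeric string under the wrong key
        "suggestions": {"point": "Cite sources", "priority": "high"},
        "unexpected": True,  # extra key, dropped by extra="ignore"
    }

    evaluation = TaskEvaluation.model_validate(raw)
    assert evaluation.quality == 7.0  # 'score' mapped to 'quality', then coerced to float
    assert evaluation.suggestions == ["Cite sources"]  # single dict collapsed to its 'point'
    assert evaluation.entities == []  # default when omitted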
268 changes: 268 additions & 0 deletions lib/crewai/tests/utilities/evaluators/test_task_evaluation_model.py
@@ -0,0 +1,268 @@
"""Tests for TaskEvaluation model validation and normalization.

These tests verify that TaskEvaluation correctly handles malformed LLM output
as reported in issue #3915, including:
- Missing quality field
- Suggestions as list of dicts with 'point' and 'priority' keys
- Score field instead of quality field
- Extra fields that should be ignored
"""

import pytest
from pydantic import ValidationError

from crewai.utilities.evaluators.task_evaluator import Entity, TaskEvaluation


def test_missing_quality_and_dict_suggestions_normalize():
"""Test that missing quality and dict suggestions are normalized correctly.

This replicates the exact error from issue #3915 where:
- quality field is missing
- suggestions is a list of dicts with 'point' and 'priority' keys
"""
payload = {
"suggestions": [
{"point": "Proceed immediately with the task", "priority": "high"},
{"point": "When asking for information, be specific", "priority": "medium"},
{"point": "Use markdown formatting", "priority": "medium"},
],
"entities": [],
}

result = TaskEvaluation.model_validate(payload)

assert result.quality is None
assert result.suggestions == [
"Proceed immediately with the task",
"When asking for information, be specific",
"Use markdown formatting",
]
assert result.entities == []


def test_single_dict_suggestion():
"""Test that a single dict suggestion is normalized to a list with extracted point."""
payload = {
"suggestions": {"point": "Improve response quality", "priority": "high"},
"quality": 8.0,
}

result = TaskEvaluation.model_validate(payload)

assert result.suggestions == ["Improve response quality"]
assert result.quality == 8.0


def test_single_string_suggestion():
"""Test that a single string suggestion is normalized to a list."""
payload = {
"suggestions": "This is a single suggestion",
"quality": 7.5,
}

result = TaskEvaluation.model_validate(payload)

assert result.suggestions == ["This is a single suggestion"]
assert result.quality == 7.5


def test_mixed_suggestions():
"""Test that mixed suggestion types are normalized correctly."""
payload = {
"suggestions": [
"String suggestion",
{"point": "Dict with point", "priority": "high"},
{"other": "Dict without point"},
123,
],
"quality": 6.0,
}

result = TaskEvaluation.model_validate(payload)

assert result.suggestions == [
"String suggestion",
"Dict with point",
"{'other': 'Dict without point'}",
"123",
]


def test_quality_from_score():
"""Test that 'score' field is mapped to 'quality' when quality is missing."""
payload = {
"score": 3,
"suggestions": ["Improve performance"],
}

result = TaskEvaluation.model_validate(payload)

assert result.quality == 3.0
assert result.suggestions == ["Improve performance"]


def test_quality_str_number():
"""Test that quality as a string number is coerced to float."""
payload = {
"quality": "7.5",
"suggestions": ["Good work"],
}

result = TaskEvaluation.model_validate(payload)

assert result.quality == 7.5


def test_quality_int():
"""Test that quality as an int is coerced to float."""
payload = {
"quality": 8,
"suggestions": ["Excellent"],
}

result = TaskEvaluation.model_validate(payload)

assert result.quality == 8.0


def test_quality_invalid_string():
"""Test that invalid quality string returns None."""
payload = {
"quality": "not a number",
"suggestions": ["Test"],
}

result = TaskEvaluation.model_validate(payload)

assert result.quality is None


def test_quality_empty_string():
"""Test that empty quality string returns None."""
payload = {
"quality": "",
"suggestions": ["Test"],
}

result = TaskEvaluation.model_validate(payload)

assert result.quality is None


def test_extra_fields_ignored():
"""Test that extra fields in the payload are ignored."""
payload = {
"suggestions": ["Test suggestion"],
"quality": 5.0,
"relationships": [],
"extra_field": "should be ignored",
"another_extra": 123,
}

result = TaskEvaluation.model_validate(payload)

assert result.suggestions == ["Test suggestion"]
assert result.quality == 5.0
assert result.entities == []


def test_none_suggestions():
"""Test that None suggestions are normalized to empty list."""
payload = {
"suggestions": None,
"quality": 5.0,
}

result = TaskEvaluation.model_validate(payload)

assert result.suggestions == []


def test_missing_all_optional_fields():
"""Test that all optional fields can be missing."""
payload = {}

result = TaskEvaluation.model_validate(payload)

assert result.suggestions == []
assert result.quality is None
assert result.entities == []


def test_entities_with_valid_data():
"""Test that entities are parsed correctly when provided."""
payload = {
"suggestions": ["Test"],
"quality": 8.0,
"entities": [
{
"name": "John Doe",
"type": "Person",
"description": "A test person",
"relationships": ["knows Jane"],
}
],
}

result = TaskEvaluation.model_validate(payload)

assert len(result.entities) == 1
assert result.entities[0].name == "John Doe"
assert result.entities[0].type == "Person"


def test_score_and_quality_both_present():
"""Test that quality takes precedence when both score and quality are present."""
payload = {
"score": 3,
"quality": 9.0,
"suggestions": ["Test"],
}

result = TaskEvaluation.model_validate(payload)

assert result.quality == 9.0


def test_issue_3915_exact_payload():
"""Test the exact payload structure from issue #3915.

This is the actual error case reported in the issue where:
    - quality field is missing, but a 'score' field is present
- suggestions contains dicts with 'point' and 'priority'
- relationships field is present (should be ignored)
"""
payload = {
"score": 3,
"suggestions": [
{"point": "Complete the task immediately", "priority": "high"},
{"point": "Provide detailed explanations", "priority": "medium"},
{"point": "Use proper formatting", "priority": "medium"},
],
"relationships": ["suggested"],
}

result = TaskEvaluation.model_validate(payload)

assert result.quality == 3.0
assert result.suggestions == [
"Complete the task immediately",
"Provide detailed explanations",
"Use proper formatting",
]
assert result.entities == []


def test_empty_suggestions_list():
"""Test that empty suggestions list is handled correctly."""
payload = {
"suggestions": [],
"quality": 5.0,
}

result = TaskEvaluation.model_validate(payload)

assert result.suggestions == []
assert result.quality == 5.0
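
For contrast, a simplified stand-in for the pre-fix model shows why payloads like the one in issue #3915 used to fail: with a required float quality and a strict list[str] suggestions, validation raises. (OldTaskEvaluation below is a hypothetical reduction for illustration, not the actual previous class definition; it assumes pydantic v2.)

    from pydantic import BaseModel, Field, ValidationError

    class OldTaskEvaluation(BaseModel):  # hypothetical stand-in for the pre-fix model
        suggestions: list[str] = Field(default_factory=list)
        quality: float  # required, no default

    try:
        OldTaskEvaluation.model_validate(
            {"suggestions": [{"point": "Be specific", "priority": "high"}]}
        )
    except ValidationError as exc:
        # two errors: the dict is not a valid string, and 'quality' is missing
        print(exc.error_count())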
