Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,7 @@ uv run mypy langfuse --no-error-summary

### Update openapi spec

1. Generate Fern Python SDK in [langfuse](https://github.com/langfuse/langfuse) and copy the files generated in `generated/python` into the `langfuse/api` folder in this repo.
2. Execute the linter by running `uv run ruff format .`
3. Rebuild and deploy the package to PyPi.
A PR with the generated changes is created automatically whenever the spec changes in the langfuse repo.

### Publish release

Expand Down
22 changes: 11 additions & 11 deletions langfuse/_client/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -1747,7 +1747,7 @@ def create_score(
trace_id: Optional[str] = None,
score_id: Optional[str] = None,
observation_id: Optional[str] = None,
data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
comment: Optional[str] = None,
config_id: Optional[str] = None,
metadata: Optional[Any] = None,
Expand Down Expand Up @@ -1777,13 +1777,13 @@ def create_score(

Args:
name: Name of the score (e.g., "relevance", "accuracy")
value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
session_id: ID of the Langfuse session to associate the score with
dataset_run_id: ID of the Langfuse dataset run to associate the score with
trace_id: ID of the Langfuse trace to associate the score with
observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
score_id: Optional custom ID for the score (auto-generated if not provided)
data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
comment: Optional comment or explanation for the score
config_id: Optional ID of a score config defined in Langfuse
metadata: Optional metadata to be attached to the score
Expand Down Expand Up @@ -1907,7 +1907,7 @@ def score_current_span(
name: str,
value: str,
score_id: Optional[str] = None,
data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
comment: Optional[str] = None,
config_id: Optional[str] = None,
metadata: Optional[Any] = None,
Expand All @@ -1931,9 +1931,9 @@ def score_current_span(

Args:
name: Name of the score (e.g., "relevance", "accuracy")
value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
score_id: Optional custom ID for the score (auto-generated if not provided)
data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
comment: Optional comment or explanation for the score
config_id: Optional ID of a score config defined in Langfuse
metadata: Optional metadata to be attached to the score
Expand Down Expand Up @@ -1971,7 +1971,7 @@ def score_current_span(
name=name,
value=cast(str, value),
score_id=score_id,
data_type=cast(Literal["CATEGORICAL"], data_type),
data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
comment=comment,
config_id=config_id,
metadata=metadata,
Expand All @@ -1997,7 +1997,7 @@ def score_current_trace(
name: str,
value: str,
score_id: Optional[str] = None,
data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
comment: Optional[str] = None,
config_id: Optional[str] = None,
metadata: Optional[Any] = None,
Expand All @@ -2022,9 +2022,9 @@ def score_current_trace(

Args:
name: Name of the score (e.g., "user_satisfaction", "overall_quality")
value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
score_id: Optional custom ID for the score (auto-generated if not provided)
data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
comment: Optional comment or explanation for the score
config_id: Optional ID of a score config defined in Langfuse
metadata: Optional metadata to be attached to the score
Expand Down Expand Up @@ -2060,7 +2060,7 @@ def score_current_trace(
name=name,
value=cast(str, value),
score_id=score_id,
data_type=cast(Literal["CATEGORICAL"], data_type),
data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
comment=comment,
config_id=config_id,
metadata=metadata,
Expand Down
16 changes: 8 additions & 8 deletions langfuse/_client/span.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ def score(
value: str,
score_id: Optional[str] = None,
data_type: Optional[
Literal[ScoreDataType.CATEGORICAL]
Literal[ScoreDataType.CATEGORICAL, ScoreDataType.TEXT]
] = ScoreDataType.CATEGORICAL,
comment: Optional[str] = None,
config_id: Optional[str] = None,
Expand All @@ -335,9 +335,9 @@ def score(

Args:
name: Name of the score (e.g., "relevance", "accuracy")
value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL/TEXT)
score_id: Optional custom ID for the score (auto-generated if not provided)
data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
comment: Optional comment or explanation for the score
config_id: Optional ID of a score config defined in Langfuse
timestamp: Optional timestamp for the score (defaults to current UTC time)
Expand All @@ -364,7 +364,7 @@ def score(
trace_id=self.trace_id,
observation_id=self.id,
score_id=score_id,
data_type=cast(Literal["CATEGORICAL"], data_type),
data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
comment=comment,
config_id=config_id,
timestamp=timestamp,
Expand Down Expand Up @@ -395,7 +395,7 @@ def score_trace(
value: str,
score_id: Optional[str] = None,
data_type: Optional[
Literal[ScoreDataType.CATEGORICAL]
Literal[ScoreDataType.CATEGORICAL, ScoreDataType.TEXT]
] = ScoreDataType.CATEGORICAL,
comment: Optional[str] = None,
config_id: Optional[str] = None,
Expand Down Expand Up @@ -423,9 +423,9 @@ def score_trace(

Args:
name: Name of the score (e.g., "user_satisfaction", "overall_quality")
value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL/TEXT)
score_id: Optional custom ID for the score (auto-generated if not provided)
data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
comment: Optional comment or explanation for the score
config_id: Optional ID of a score config defined in Langfuse
timestamp: Optional timestamp for the score (defaults to current UTC time)
Expand All @@ -451,7 +451,7 @@ def score_trace(
value=cast(str, value),
trace_id=self.trace_id,
score_id=score_id,
data_type=cast(Literal["CATEGORICAL"], data_type),
data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
comment=comment,
config_id=config_id,
timestamp=timestamp,
Expand Down
5 changes: 3 additions & 2 deletions langfuse/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@
Union,
)

from langfuse.api import DatasetItem, ScoreDataType
from langfuse.api import DatasetItem
from langfuse.logger import langfuse_logger as logger
from langfuse.types import ExperimentScoreType


class LocalExperimentItem(TypedDict, total=False):
Expand Down Expand Up @@ -184,7 +185,7 @@ def __init__(
value: Union[int, float, str, bool],
comment: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
data_type: Optional[ScoreDataType] = None,
data_type: Optional[ExperimentScoreType] = None,
config_id: Optional[str] = None,
):
"""Initialize an Evaluation with the provided data.
Expand Down
6 changes: 5 additions & 1 deletion langfuse/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@ def my_evaluator(*, output: str, **kwargs) -> Evaluation:

SpanLevel = Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"]

ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"]
ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN", "TEXT"]

# Text scores are not supported for evals and experiments
ExperimentScoreType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"]


class MaskFunction(Protocol):
Expand Down Expand Up @@ -73,6 +76,7 @@ class TraceContext(TypedDict):
__all__ = [
"SpanLevel",
"ScoreDataType",
"ExperimentScoreType",
"MaskFunction",
"ParsedMediaReference",
"TraceContext",
Expand Down
58 changes: 58 additions & 0 deletions tests/test_core_sdk.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,64 @@ def test_create_categorical_score():
assert created_score["stringValue"] == "high score"


def test_create_text_score():
    """Verify that a TEXT-type score round-trips through the Langfuse API.

    Creates a trace via a span, attaches a score with ``data_type="TEXT"``,
    then fetches the trace and checks that the score came back with the
    string stored in ``stringValue`` and no numeric ``value``.
    """
    client = Langfuse()
    api = LangfuseAPI()

    # Open a span to obtain a trace, propagating trace-level attributes.
    with client.start_as_current_observation(name="test-span") as span:
        with propagate_attributes(
            trace_name="this-is-so-great-new",
            user_id="test",
            metadata={"test": "test"},
        ):
            # Remember the trace ID so we can score and query it below.
            trace_id = span.trace_id

    # Make sure the span/trace has been ingested before scoring it.
    client.flush()
    sleep(2)

    # Attach a TEXT score: the value is a free-form string.
    score_id = create_uuid()
    client.create_score(
        score_id=score_id,
        trace_id=trace_id,
        name="this-is-a-score",
        value="This is a detailed text evaluation of the output quality.",
        data_type="TEXT",
    )

    # Add a generation observation to the same trace.
    generation = client.start_observation(
        as_type="generation",
        name="yet another child",
        metadata="test",
        trace_context={"trace_id": trace_id},
    )
    generation.end()

    # Make sure the score and generation have been ingested.
    client.flush()
    sleep(2)

    # Fetch the trace and locate the score we just created by its name.
    trace = api.get_trace(trace_id)
    created_score = None
    for candidate in trace["scores"]:
        if candidate["name"] == "this-is-a-score":
            created_score = candidate
            break

    assert created_score is not None, "Score not found in trace"
    assert created_score["id"] == score_id
    assert created_score["dataType"] == "TEXT"
    assert created_score["value"] is None
    assert (
        created_score["stringValue"]
        == "This is a detailed text evaluation of the output quality."
    )


def test_create_score_with_custom_timestamp():
langfuse = Langfuse()
api_wrapper = LangfuseAPI()
Expand Down
Loading