From 28795451a0f01c47dd80a62bb7255fa002a30f1e Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Sun, 10 Aug 2025 11:25:53 +0300 Subject: [PATCH 1/9] feat: Add LLM-based commentary scoring system - Add sefaria_llm_interface/commentary_scoring package with input/output dataclasses - Add commentary_scoring app with OpenAI-powered scoring functionality - Implement CommentaryScorer class for evaluating how well commentaries explain cited texts - Add Celery task integration for async commentary processing - Include text processing utilities for HTML stripping and content flattening - Update Celery autodiscovery to include commentary_scoring tasks --- app/celery_setup/app.py | 5 +- app/commentary_scoring/commentary_scoring.py | 34 ++ .../openai_commentary_scorer.py | 346 ++++++++++++++++++ app/commentary_scoring/tasks.py | 13 + app/commentary_scoring/text_utils.py | 23 ++ .../commentary_scoring/__init__.py | 4 + .../commentary_scoring/sheet_scoring_input.py | 8 + .../sheet_scoring_output.py | 19 + 8 files changed, 451 insertions(+), 1 deletion(-) create mode 100644 app/commentary_scoring/commentary_scoring.py create mode 100644 app/commentary_scoring/openai_commentary_scorer.py create mode 100644 app/commentary_scoring/tasks.py create mode 100644 app/commentary_scoring/text_utils.py create mode 100644 app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py create mode 100644 app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py create mode 100644 app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_output.py diff --git a/app/celery_setup/app.py b/app/celery_setup/app.py index 0b50834..d1f1f69 100644 --- a/app/celery_setup/app.py +++ b/app/celery_setup/app.py @@ -3,4 +3,7 @@ app = Celery('llm') app.conf.update(**generate_config_from_env()) -app.autodiscover_tasks(packages=['topic_prompt']) +app.autodiscover_tasks(packages=['topic_prompt', + 'commentary_scoring'] + ) + diff --git a/app/commentary_scoring/commentary_scoring.py b/app/commentary_scoring/commentary_scoring.py new file mode 100644 index 0000000..61cb196 --- /dev/null +++ b/app/commentary_scoring/commentary_scoring.py @@ -0,0 +1,34 @@ +from .openai_commentary_scorer import CommentaryScorer +import os +from pathlib import Path +from sefaria_llm_interface.commentary_scoring import ( + CommentaryScoringInput, + CommentaryScoringOutput, +) + + +def score_one_commentary(inp: CommentaryScoringInput) -> ( + CommentaryScoringOutput): + scorer = CommentaryScorer( + api_key=os.getenv("OPENAI_API_KEY") + ) + result = (scorer. + process_commentary_by_content( + commentary_ref=inp.commentary_ref, + cited_refs=inp.cited_refs, + commentary_text=inp.commentary_text + ) + ) + if not result: + return CommentaryScoringOutput( + commentary_ref=inp.commentary_ref, + ref_scores={}, + scores_explanation="", + processed_datetime=None, + ) + return CommentaryScoringOutput( + commentary_ref=inp.commentary_ref, + ref_scores=result.get(scorer.REF_SCORE_FIELD), + scores_explanation=result.get(scorer.EXPLANATION_FIELD), + processed_datetime=result.get(scorer.PROCESSED_AT_FIELD), + ) diff --git a/app/commentary_scoring/openai_commentary_scorer.py b/app/commentary_scoring/openai_commentary_scorer.py new file mode 100644 index 0000000..bd7bebc --- /dev/null +++ b/app/commentary_scoring/openai_commentary_scorer.py @@ -0,0 +1,346 @@ +"""Commentary Scorer for evaluating Jewish text Commentaries. 
+ +This module provides functionality to score how well commentaries explain +their cited texts using OpenAI's language models. +""" + +import json +import logging +from datetime import datetime, timezone +from enum import IntEnum +from typing import Any, Dict, List, Optional, Set, Union + +import tiktoken +from langchain.chat_models import ChatOpenAI +from langchain.schema import HumanMessage +from commentary_scoring.text_utils import to_plain_text + +logger = logging.getLogger(__name__) + + +class ExplanationLevel(IntEnum): + """Levels of explanation quality for commentary scoring.""" + + NO_EXPLANATION = 0 + MINIMAL = 1 + MODERATE = 2 + SIGNIFICANT = 3 + COMPREHENSIVE = 4 + + +class LanguageCode: + """ISO 639-1 language codes.""" + + ENGLISH = "en" + HEBREW = "he" + DEFAULT = ENGLISH + + +class CommentaryScorer: + """Scores how well commentaries explain their cited texts. + + This class uses OpenAI's language models to evaluate the quality of + explanations provided by Jewish commentaries for their cited texts. + + Attributes: + model: The OpenAI model to use for scoring + max_prompt_tokens: Maximum tokens allowed in prompt + token_margin: Reserved tokens for model response + """ + + # Configuration constants + DEFAULT_MAX_OUTPUT_TOKENS = 4096 + DEFAULT_MAX_INPUT_OUTPUT_TOKENS = 32000 + DEFAULT_TOKEN_CHAR_RATIO = 3 + + # Response field names + REF_SCORE_FIELD = "ref_scores" + EXPLANATION_FIELD = "explanation" + LANGUAGE_FIELD = "language" + CITED_REF_FIELD = "cited_ref" + PROCESSED_AT_FIELD = "processed_datetime" + + # Valid explanation levels + VALID_LEVELS: Set[int] = {level.value for level in ExplanationLevel} + + def __init__( + self, + api_key: Optional[str] = None, + model: str = "gpt-4o-mini", + max_prompt_tokens: int = DEFAULT_MAX_INPUT_OUTPUT_TOKENS, + token_margin: int = DEFAULT_MAX_OUTPUT_TOKENS, + ) -> None: + """Initialize the commentary scorer. + Args: + api_key: OpenAI API key. If None, will use environment variable + model: OpenAI model name to use + max_prompt_tokens: Maximum tokens for input prompt + token_margin: Reserved tokens for model response + Raises: + ValueError: If model is not supported or parameters are invalid + """ + + self.model = model + self.max_prompt_tokens = max_prompt_tokens + self.token_margin = token_margin + + try: + self.llm = ChatOpenAI( + model_name=model, + temperature=0, #Model temperature (0.0 for deterministic grading) + openai_api_key=api_key, + model_kwargs={ + "top_p": 0, + "frequency_penalty": 0, + "presence_penalty": 0, + "seed": 42, + }, + ) + except Exception as e: + logger.error(f"Failed to initialize ChatOpenAI: {e}") + raise + + logger.info(f"Initialized CommentaryScorer with model {model}") + + def _count_tokens(self, text: str) -> int: + """Count tokens in text using the model's tokenizer. + """ + try: + encoding = tiktoken.encoding_for_model(self.model) + return len(encoding.encode(text)) + except Exception as e: + logger.warning(f"Failed to get exact token count: {e}") + # Fallback to character-based estimation + return max(1, len(text) // self.DEFAULT_TOKEN_CHAR_RATIO) + + def _validate_level(self,score: Any) -> int: + """Validate and normalize explanation level score. 
+ """ + try: + score = int(score) + except (ValueError, TypeError): + logger.warning( + f"Invalid reference score '{score}', defaulting to 0" + ) + return ExplanationLevel.NO_EXPLANATION + + if score not in self.VALID_LEVELS: + clamped = max( + ExplanationLevel.NO_EXPLANATION, + min(score,ExplanationLevel.COMPREHENSIVE) + ) + logger.warning( + f"Reference score {score} out of range, clamping to {clamped}" + ) + return clamped + + return score + + def _invoke_llm( + self, + prompt: str, + function_schema: Dict[str, Any] + ) -> Dict[str, Any]: + """Invoke the language model with function calling. + """ + try: + response = self.llm.invoke( + [HumanMessage(content=prompt)], + functions=[function_schema], + function_call={"name": function_schema["name"]} + ) + function_call = getattr(response, "additional_kwargs", {}).get( + "function_call" + ) + if not function_call: + raise ValueError("No function call found in LLM response") + + arguments = function_call.get("arguments", "{}") + return json.loads(arguments) + + except json.JSONDecodeError as e: + logger.error(f"Failed to parse LLM response JSON: {e}") + raise ValueError(f"Invalid JSON in LLM response: {e}") + except Exception as e: + logger.error(f"LLM invocation failed: {e}") + raise + + def _build_function_schema(self, cited_keys: List[str]) -> Dict[str,Any]: + """Build JSON schema for function calling. + + Args: + cited_keys: List of citation keys to score + + Returns: + JSON schema for the scoring function + """ + if not cited_keys: + raise ValueError("cited_keys cannot be empty") + + return { + "name": "score_multiple_explanations", + "description": "Score how well a commentary explains each cited text", + "parameters": { + "type": "object", + "properties": { + self.REF_SCORE_FIELD: { + "type": "object", + "properties": { + key: { + "type": "integer", + "minimum": ExplanationLevel.NO_EXPLANATION, + "maximum": ExplanationLevel.COMPREHENSIVE + } + for key in cited_keys + }, + "required": cited_keys, + "additionalProperties": False + }, + self.EXPLANATION_FIELD: { + "type": "object", + "properties": { + key: { + "type": "string", + "maxLength": 200, + "description": f"Explanation for {key} score (1-2 sentences)" + } + for key in cited_keys + }, + "required": cited_keys, + "additionalProperties": False + } + }, + "required": [ + self.REF_SCORE_FIELD, + self.EXPLANATION_FIELD + ], + "additionalProperties": False + } + } + + def _build_scoring_prompt( + self, + cited_refs: Dict[str,str], + commentary_text: str + ) -> str: + """Build the prompt for scoring commentary explanations. + + Args: + cited_refs: Mapping of reference keys to cited texts + commentary_text: The commentary text to evaluate + + Returns: + Formatted prompt string + """ + refs_section = "\n".join( + f"- {key}: {text}" for key,text in cited_refs.items() + ) + + return f"""You are an expert evaluator of Jewish commentary quality. 
+ +COMMENTARY TEXT: +{commentary_text} + +CITED TEXTS TO EVALUATE: +{refs_section} + +TASK: For each cited text, score (0-4) how well the commentary explains it: + +SCORING SCALE: +{ExplanationLevel.NO_EXPLANATION}: NO EXPLANATION - Citation used for unrelated point +{ExplanationLevel.MINIMAL}: MINIMAL - Text merely paraphrased or mentioned +{ExplanationLevel.MODERATE}: MODERATE - Commentary shares theme but doesn't explain text +{ExplanationLevel.SIGNIFICANT}: SIGNIFICANT - Citation is main focus with meaningful explanation +{ExplanationLevel.COMPREHENSIVE}: COMPREHENSIVE - Deep, thorough explanation that fully illuminates the text + +RETURN JSON WITH: +1. {self.REF_SCORE_FIELD}: Object mapping each citation key to score (0-4) +2. {self.EXPLANATION_FIELD}: Object mapping each key to brief explanation (1-2 sentences) + +Be precise and consistent in your scoring.""" + + def process_commentary_by_content( + self, + commentary_text: Union[List[str],str], + cited_refs: Dict[str,str], + commentary_ref: str = "" + ) -> Optional[Dict[str, Any]]: + """Score how well a commentary explains its cited texts. + """ + if not cited_refs: + logger.info( + f"Commentary {commentary_ref} doesn't cite anything. " + f"Defaulting to None" + ) + return None + + if not commentary_text: + logger.info( + f"Commentary {commentary_ref} is empty. " + f"Defaulting to None" + ) + return None + + # Convert commentary text to string format + if isinstance(commentary_text,list): + commentary_text_str = to_plain_text(commentary_text) + else: + commentary_text_str = str(commentary_text) + + if not commentary_text_str.strip(): + logger.warning(f"Commentary's {commentary_ref} text is empty " + f"after processing") + + return None + + token_count = self._count_tokens(commentary_text_str) + max_allowed_tokens = self.max_prompt_tokens - self.token_margin + + if token_count > max_allowed_tokens: + # TODO: add long commentary support + logger.warning( + f"{commentary_ref}'s input too long " + f"({token_count} tokens > {max_allowed_tokens} limit). " + "Skipping scoring." + ) + return None + + logger.info( + f"Processing commentary with {token_count} tokens, " + f"{len(cited_refs)} citations" + ) + + try: + # Build prompt and schema + prompt = self._build_scoring_prompt(cited_refs, commentary_text_str) + schema = self._build_function_schema(list(cited_refs.keys())) + + # Get LLM response + raw_response = self._invoke_llm(prompt, schema) + + # Validate and normalize scores + raw_scores = raw_response.get(self.REF_SCORE_FIELD, {}) + validated_scores = { + key: self._validate_level(score) + for key, score in raw_scores.items() + } + + # Create structured result + result = { + self.REF_SCORE_FIELD: validated_scores, + self.EXPLANATION_FIELD: raw_response.get( + self.EXPLANATION_FIELD, {} + ), + self.PROCESSED_AT_FIELD: datetime.now(timezone.utc), + } + + logger.info( + f"Successfully scored commentary {commentary_ref}. 
" + f"Average score: {sum(validated_scores.values()) / len(validated_scores):.2f}" + ) + + return result + + except Exception as e: + logger.error(f"Commentary {commentary_ref} scoring failed: {e}") + return None diff --git a/app/commentary_scoring/tasks.py b/app/commentary_scoring/tasks.py new file mode 100644 index 0000000..4215694 --- /dev/null +++ b/app/commentary_scoring/tasks.py @@ -0,0 +1,13 @@ +from celery import shared_task +from commentary_scoring.commentary_scoring import score_one_commentary +from sefaria_llm_interface.commentary_scoring import ( + CommentaryScoringInput +) +from dataclasses import asdict + + +@shared_task(name='llm.score_commentary') +def score_sheet_task(raw_input: dict) -> dict: + inp = CommentaryScoringInput(**raw_input) + out = score_one_commentary(inp) + return asdict(out) \ No newline at end of file diff --git a/app/commentary_scoring/text_utils.py b/app/commentary_scoring/text_utils.py new file mode 100644 index 0000000..75b82a2 --- /dev/null +++ b/app/commentary_scoring/text_utils.py @@ -0,0 +1,23 @@ +import re +from typing import Union, List + +TAG_RE = re.compile(r"<[^>]+>") + + +def strip_html(s: str) -> str: + return TAG_RE.sub("", s) + + +def flatten_text(x: Union[str, List, tuple]) -> str: + if isinstance(x, str): + return x + if isinstance(x, (list, tuple)): + return " ".join(flatten_text(el) for el in x) + return str(x) + + +def to_plain_text(raw: Union[str, List, tuple]) -> str: + """Recursively flatten + remove HTML → clean unicode.""" + flat = flatten_text(raw) + clean = strip_html(flat) + return re.sub(r"\s+", " ", clean).strip() \ No newline at end of file diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py new file mode 100644 index 0000000..5758e1d --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py @@ -0,0 +1,4 @@ +from .sheet_scoring_input import SheetScoringInput +from .sheet_scoring_output import SheetScoringOutput + +__all__ = ["SheetScoringInput", "SheetScoringOutput"] \ No newline at end of file diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py new file mode 100644 index 0000000..acff7e8 --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass +from typing import Any + + +@dataclass +class SheetScoringInput: + sheet_content: dict[str, Any] + diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_output.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_output.py new file mode 100644 index 0000000..14bccd1 --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_output.py @@ -0,0 +1,19 @@ +from dataclasses import dataclass +from typing import Dict, Union +from datetime import datetime + + +@dataclass +class SheetScoringOutput: + sheet_id: str + processed_datetime: str + language: str + title_interest_level: int + title_interest_reason: str + creativity_score: float + ref_levels: Dict[str, int] + ref_scores: Dict[str, float] + + def __post_init__(self): + if isinstance(self.processed_datetime, datetime): + self.processed_datetime = self.processed_datetime.isoformat() \ No newline at end of file From e05f3d910f59e418397bec090a04dd9346af46e9 Mon Sep 17 00:00:00 2001 From: 
morganizzzm Date: Sun, 10 Aug 2025 11:26:54 +0300 Subject: [PATCH 2/9] feat: adding init to commentary_scoring package --- app/commentary_scoring/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 app/commentary_scoring/__init__.py diff --git a/app/commentary_scoring/__init__.py b/app/commentary_scoring/__init__.py new file mode 100644 index 0000000..e69de29 From 973f6a40adb3ee7cb8c907d1283fe0822b8a675b Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Sun, 10 Aug 2025 11:50:40 +0300 Subject: [PATCH 3/9] feat: added by mistake to previous commit sheet_scoring input/output classes instead of commit_scoring --- .../commentary_scoring/__init__.py | 6 +++--- .../commentary_scoring/commentary_scoring_input.py | 10 ++++++++++ ...coring_output.py => commentary_scoring_output.py} | 12 ++++-------- .../commentary_scoring/sheet_scoring_input.py | 8 -------- 4 files changed, 17 insertions(+), 19 deletions(-) create mode 100644 app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py rename app/llm_interface/sefaria_llm_interface/commentary_scoring/{sheet_scoring_output.py => commentary_scoring_output.py} (58%) delete mode 100644 app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py index 5758e1d..31e3d07 100644 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py @@ -1,4 +1,4 @@ -from .sheet_scoring_input import SheetScoringInput -from .sheet_scoring_output import SheetScoringOutput +from .commentary_scoring_input import CommentaryScoringInput +from .commentary_scoring_output import CommentaryScoringOutput -__all__ = ["SheetScoringInput", "SheetScoringOutput"] \ No newline at end of file +__all__ = ["CommentaryScoringInput", "CommentaryScoringOutput"] \ No newline at end of file diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py new file mode 100644 index 0000000..e6ed706 --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass +from typing import List, Dict + + +@dataclass +class CommentaryScoringInput: + commentary_text: List[str] + cited_refs: Dict[str, str] + commentary_ref: str + diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_output.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py similarity index 58% rename from app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_output.py rename to app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py index 14bccd1..4386fd9 100644 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_output.py +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py @@ -4,15 +4,11 @@ @dataclass -class SheetScoringOutput: - sheet_id: str +class CommentaryScoringOutput: + commentary_ref: str + ref_scores: Dict[str, int] + scores_explanation: Dict[str, str] processed_datetime: str - language: str - title_interest_level: int - title_interest_reason: str - creativity_score: float - ref_levels: Dict[str, int] - 
ref_scores: Dict[str, float] def __post_init__(self): if isinstance(self.processed_datetime, datetime): diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py deleted file mode 100644 index acff7e8..0000000 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py +++ /dev/null @@ -1,8 +0,0 @@ -from dataclasses import dataclass -from typing import Any - - -@dataclass -class SheetScoringInput: - sheet_content: dict[str, Any] - From a598a9185c130a644ec32760263cbde46de6b083 Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Mon, 11 Aug 2025 17:05:21 +0300 Subject: [PATCH 4/9] feat: - added to CommentaryScoringOutput debugging fields: request_status and request_status_message - updated CommentaryScorer, so it will return CommentaryScoringOutput instead of dictionary; this update also influences commentary_scoring.py NOTE: by now importing from sefaria-llm-interface are local and not package-style, since the version with necessary files was not yet released --- app/commentary_scoring/commentary_scoring.py | 23 +----- .../openai_commentary_scorer.py | 82 ++++++++++++------- app/commentary_scoring/tasks.py | 5 +- .../commentary_scoring_output.py | 2 + 4 files changed, 63 insertions(+), 49 deletions(-) diff --git a/app/commentary_scoring/commentary_scoring.py b/app/commentary_scoring/commentary_scoring.py index 61cb196..3f2530a 100644 --- a/app/commentary_scoring/commentary_scoring.py +++ b/app/commentary_scoring/commentary_scoring.py @@ -1,34 +1,19 @@ from .openai_commentary_scorer import CommentaryScorer import os from pathlib import Path -from sefaria_llm_interface.commentary_scoring import ( +# TODO: change the imports when compile package +from app.llm_interface.sefaria_llm_interface.commentary_scoring import ( CommentaryScoringInput, CommentaryScoringOutput, ) - def score_one_commentary(inp: CommentaryScoringInput) -> ( CommentaryScoringOutput): scorer = CommentaryScorer( api_key=os.getenv("OPENAI_API_KEY") ) - result = (scorer. 
- process_commentary_by_content( + return scorer.process_commentary_by_content( commentary_ref=inp.commentary_ref, cited_refs=inp.cited_refs, commentary_text=inp.commentary_text - ) - ) - if not result: - return CommentaryScoringOutput( - commentary_ref=inp.commentary_ref, - ref_scores={}, - scores_explanation="", - processed_datetime=None, - ) - return CommentaryScoringOutput( - commentary_ref=inp.commentary_ref, - ref_scores=result.get(scorer.REF_SCORE_FIELD), - scores_explanation=result.get(scorer.EXPLANATION_FIELD), - processed_datetime=result.get(scorer.PROCESSED_AT_FIELD), - ) + ) \ No newline at end of file diff --git a/app/commentary_scoring/openai_commentary_scorer.py b/app/commentary_scoring/openai_commentary_scorer.py index bd7bebc..1b13f60 100644 --- a/app/commentary_scoring/openai_commentary_scorer.py +++ b/app/commentary_scoring/openai_commentary_scorer.py @@ -14,7 +14,11 @@ from langchain.chat_models import ChatOpenAI from langchain.schema import HumanMessage from commentary_scoring.text_utils import to_plain_text - +# TODO: change the imports when compile package +from app.llm_interface.sefaria_llm_interface.commentary_scoring import ( + CommentaryScoringInput, + CommentaryScoringOutput, +) logger = logging.getLogger(__name__) @@ -28,6 +32,13 @@ class ExplanationLevel(IntEnum): COMPREHENSIVE = 4 +class RequestStatus(IntEnum): + """LLM's success/failure""" + + SUCCESS = 1 + FAILURE = 0 + + class LanguageCode: """ISO 639-1 language codes.""" @@ -218,6 +229,20 @@ def _build_function_schema(self, cited_keys: List[str]) -> Dict[str,Any]: } } + def _create_failure_scoring_output(self, commentary_ref, + processed_datetime: datetime, + request_status_message: str) -> ( + CommentaryScoringOutput): + logger.warning(request_status_message) + return CommentaryScoringOutput( + commentary_ref=commentary_ref, + ref_scores={}, + scores_explanation={}, + processed_datetime=str(processed_datetime), + request_status_message=request_status_message, + request_status=RequestStatus.FAILURE + ) + def _build_scoring_prompt( self, cited_refs: Dict[str,str], @@ -264,22 +289,18 @@ def process_commentary_by_content( commentary_text: Union[List[str],str], cited_refs: Dict[str,str], commentary_ref: str = "" - ) -> Optional[Dict[str, Any]]: + ) -> CommentaryScoringOutput: """Score how well a commentary explains its cited texts. """ if not cited_refs: - logger.info( - f"Commentary {commentary_ref} doesn't cite anything. " - f"Defaulting to None" - ) - return None + return self._create_failure_scoring_output(commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary {commentary_ref} doesn't cite anything. ") if not commentary_text: - logger.info( - f"Commentary {commentary_ref} is empty. " - f"Defaulting to None" - ) - return None + return self._create_failure_scoring_output(commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary {commentary_ref} is empty. 
") # Convert commentary text to string format if isinstance(commentary_text,list): @@ -288,22 +309,22 @@ def process_commentary_by_content( commentary_text_str = str(commentary_text) if not commentary_text_str.strip(): - logger.warning(f"Commentary's {commentary_ref} text is empty " - f"after processing") - return None + return self._create_failure_scoring_output( + commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary's {commentary_ref} text is empty " + ) token_count = self._count_tokens(commentary_text_str) max_allowed_tokens = self.max_prompt_tokens - self.token_margin if token_count > max_allowed_tokens: # TODO: add long commentary support - logger.warning( - f"{commentary_ref}'s input too long " - f"({token_count} tokens > {max_allowed_tokens} limit). " - "Skipping scoring." - ) - return None + return self._create_failure_scoring_output(commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=(f"{commentary_ref}'s input too long " + f"({token_count} tokens > {max_allowed_tokens} limit). ")) logger.info( f"Processing commentary with {token_count} tokens, " @@ -326,13 +347,15 @@ def process_commentary_by_content( } # Create structured result - result = { - self.REF_SCORE_FIELD: validated_scores, - self.EXPLANATION_FIELD: raw_response.get( + result = CommentaryScoringOutput( + commentary_ref=commentary_ref, + ref_scores=validated_scores, + scores_explanation=raw_response.get( self.EXPLANATION_FIELD, {} ), - self.PROCESSED_AT_FIELD: datetime.now(timezone.utc), - } + processed_datetime=str(datetime.now(timezone.utc)), + request_status_message="", + request_status=RequestStatus.SUCCESS) logger.info( f"Successfully scored commentary {commentary_ref}. 
" @@ -342,5 +365,8 @@ def process_commentary_by_content( return result except Exception as e: - logger.error(f"Commentary {commentary_ref} scoring failed: {e}") - return None + return self._create_failure_scoring_output( + commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary {commentary_ref} scoring failed: {e}" + ) \ No newline at end of file diff --git a/app/commentary_scoring/tasks.py b/app/commentary_scoring/tasks.py index 4215694..ff4cbd6 100644 --- a/app/commentary_scoring/tasks.py +++ b/app/commentary_scoring/tasks.py @@ -1,7 +1,8 @@ from celery import shared_task from commentary_scoring.commentary_scoring import score_one_commentary -from sefaria_llm_interface.commentary_scoring import ( - CommentaryScoringInput +# TODO: change the import +from app.llm_interface.sefaria_llm_interface.commentary_scoring import ( + CommentaryScoringInput, ) from dataclasses import asdict diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py index 4386fd9..9248d2d 100644 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py @@ -9,6 +9,8 @@ class CommentaryScoringOutput: ref_scores: Dict[str, int] scores_explanation: Dict[str, str] processed_datetime: str + request_status_message: str + request_status: int def __post_init__(self): if isinstance(self.processed_datetime, datetime): From 8a54f5834acc78a25e710476a92dad4a94d15eac Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Tue, 12 Aug 2025 12:39:13 +0300 Subject: [PATCH 5/9] =?UTF-8?q?feat(llm/commentary=5Fscoring):=20switch=20?= =?UTF-8?q?to=20binary=20=E2=80=9Cexplains=20or=20not=E2=80=9D=20labeling?= =?UTF-8?q?=20(0/1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace 0–4 ExplanationLevel with binary ExplainsFlag {0: NOT_EXPLAINED, 1: EXPLAINED} - Clamp/validate scores to 0/1 in _validate_level - Update function-calling JSON schema to minimum: 0, maximum: 1 per cited key - Rewrite prompt to policy: -- Return 1 if the commentary provides any substantive interpretation of any part of the citation (incl. methodological/kabbalistic reads) -- Return 0 if citation is decorative/prooftext/only paraphrased -- If A is cited only via B and C adds no new interpretation of A beyond B → 0 -- Partial coverage still counts as 1 - Explanations: ask model to begin each rationale with Explained spans: ''; ... then 1–2 sentence justification (no schema change) - Logging: report explained X/Y (Z%) instead of average 0–4 -- BREAKING BEHAVIOR: numeric scale semantics changed from graded (0–4) to binary (0/1). 
--- .../openai_commentary_scorer.py | 128 ++++++++---------- 1 file changed, 55 insertions(+), 73 deletions(-) diff --git a/app/commentary_scoring/openai_commentary_scorer.py b/app/commentary_scoring/openai_commentary_scorer.py index 1b13f60..cc133a1 100644 --- a/app/commentary_scoring/openai_commentary_scorer.py +++ b/app/commentary_scoring/openai_commentary_scorer.py @@ -22,14 +22,9 @@ logger = logging.getLogger(__name__) -class ExplanationLevel(IntEnum): - """Levels of explanation quality for commentary scoring.""" - - NO_EXPLANATION = 0 - MINIMAL = 1 - MODERATE = 2 - SIGNIFICANT = 3 - COMPREHENSIVE = 4 +class ExplainsFlag(IntEnum): + NOT_EXPLAINED = 0 + EXPLAINED = 1 class RequestStatus(IntEnum): @@ -72,7 +67,8 @@ class CommentaryScorer: PROCESSED_AT_FIELD = "processed_datetime" # Valid explanation levels - VALID_LEVELS: Set[int] = {level.value for level in ExplanationLevel} + VALID_LEVELS: Set[int] = \ + {ExplainsFlag.NOT_EXPLAINED, ExplainsFlag.EXPLAINED} def __init__( self, @@ -124,25 +120,20 @@ def _count_tokens(self, text: str) -> int: # Fallback to character-based estimation return max(1, len(text) // self.DEFAULT_TOKEN_CHAR_RATIO) - def _validate_level(self,score: Any) -> int: - """Validate and normalize explanation level score. - """ + def _validate_level(self, score: Any) -> int: try: score = int(score) except (ValueError, TypeError): logger.warning( f"Invalid reference score '{score}', defaulting to 0" ) - return ExplanationLevel.NO_EXPLANATION + return ExplainsFlag.NOT_EXPLAINED if score not in self.VALID_LEVELS: - clamped = max( - ExplanationLevel.NO_EXPLANATION, - min(score,ExplanationLevel.COMPREHENSIVE) - ) + clamped = ExplainsFlag.EXPLAINED if score >= 1 else ExplainsFlag.NOT_EXPLAINED logger.warning( f"Reference score {score} out of range, clamping to {clamped}" - ) + ) return clamped return score @@ -176,57 +167,46 @@ def _invoke_llm( logger.error(f"LLM invocation failed: {e}") raise - def _build_function_schema(self, cited_keys: List[str]) -> Dict[str,Any]: - """Build JSON schema for function calling. - - Args: - cited_keys: List of citation keys to score - - Returns: - JSON schema for the scoring function - """ + def _build_function_schema(self,cited_keys: List[str]) -> Dict[str,Any]: if not cited_keys: raise ValueError("cited_keys cannot be empty") return { "name": "score_multiple_explanations", - "description": "Score how well a commentary explains each cited text", + "description": "Binary labeling: does the commentary actually interpret/explain each cited text?", "parameters": { "type": "object", "properties": { self.REF_SCORE_FIELD: { "type": "object", "properties": { - key: { - "type": "integer", - "minimum": ExplanationLevel.NO_EXPLANATION, - "maximum": ExplanationLevel.COMPREHENSIVE - } + key: {"type": "integer","minimum": 0,"maximum": 1} for key in cited_keys }, "required": cited_keys, - "additionalProperties": False + "additionalProperties": False, }, self.EXPLANATION_FIELD: { "type": "object", "properties": { key: { "type": "string", - "maxLength": 200, - "description": f"Explanation for {key} score (1-2 sentences)" + "maxLength": 150, + "description": ( + "Brief rationale. Start with: " + "\"Explained spans: ''; ''\" " + "then 1–2 sentences why 0/1." 
+ ), } for key in cited_keys }, "required": cited_keys, - "additionalProperties": False - } + "additionalProperties": False, + }, }, - "required": [ - self.REF_SCORE_FIELD, - self.EXPLANATION_FIELD - ], - "additionalProperties": False - } + "required": [self.REF_SCORE_FIELD,self.EXPLANATION_FIELD], + "additionalProperties": False, + }, } def _create_failure_scoring_output(self, commentary_ref, @@ -249,45 +229,45 @@ def _build_scoring_prompt( commentary_text: str ) -> str: """Build the prompt for scoring commentary explanations. - - Args: - cited_refs: Mapping of reference keys to cited texts - commentary_text: The commentary text to evaluate - - Returns: - Formatted prompt string """ refs_section = "\n".join( f"- {key}: {text}" for key,text in cited_refs.items() - ) + ) - return f"""You are an expert evaluator of Jewish commentary quality. + return f"""You are labeling whether a commentary ACTUALLY EXPLAINS each cited text. -COMMENTARY TEXT: -{commentary_text} + COMMENTARY TEXT: + {commentary_text} -CITED TEXTS TO EVALUATE: -{refs_section} + CITED TEXTS: + {refs_section} -TASK: For each cited text, score (0-4) how well the commentary explains it: + TASK (binary per citation): + Return 1 if the commentary provides any substantive interpretation or explanation + of ANY PART of the cited text (including methodological interpretation, e.g., reading a word + as a symbol) — not just quoting or paraphrasing. -SCORING SCALE: -{ExplanationLevel.NO_EXPLANATION}: NO EXPLANATION - Citation used for unrelated point -{ExplanationLevel.MINIMAL}: MINIMAL - Text merely paraphrased or mentioned -{ExplanationLevel.MODERATE}: MODERATE - Commentary shares theme but doesn't explain text -{ExplanationLevel.SIGNIFICANT}: SIGNIFICANT - Citation is main focus with meaningful explanation -{ExplanationLevel.COMPREHENSIVE}: COMPREHENSIVE - Deep, thorough explanation that fully illuminates the text + Return 0 if: + • The citation is used for another goal (decorative, rhetorical, prooftext with no interpretation). + • The commentary cites Source A only via Source B, but adds NO new interpretation of A beyond B. + (Inherited interpretation does NOT count as explanation of A.) + • It merely references or paraphrases without interpreting. -RETURN JSON WITH: -1. {self.REF_SCORE_FIELD}: Object mapping each citation key to score (0-4) -2. {self.EXPLANATION_FIELD}: Object mapping each key to brief explanation (1-2 sentences) + Important: + • If the commentary explains only PARTS of the citation, still return 1. + • In your explanation, list the exact phrases from the cited text that ARE explained (if any), + then give a concise rationale for 0/1. -Be precise and consistent in your scoring.""" + RETURN JSON WITH: + 1. {self.REF_SCORE_FIELD}: object of 0/1 per citation key + 2. {self.EXPLANATION_FIELD}: object of brief rationales. Begin each value with: + Explained spans: ''; '' (or 'None'), then 1–2 sentences of rationale. + """ def process_commentary_by_content( self, commentary_text: Union[List[str],str], - cited_refs: Dict[str,str], + cited_refs: Dict[str, str], commentary_ref: str = "" ) -> CommentaryScoringOutput: """Score how well a commentary explains its cited texts. @@ -303,7 +283,7 @@ def process_commentary_by_content( request_status_message=f"Commentary {commentary_ref} is empty. 
") # Convert commentary text to string format - if isinstance(commentary_text,list): + if isinstance(commentary_text, list): commentary_text_str = to_plain_text(commentary_text) else: commentary_text_str = str(commentary_text) @@ -357,9 +337,11 @@ def process_commentary_by_content( request_status_message="", request_status=RequestStatus.SUCCESS) + explained = sum(validated_scores.values()) + total = len(validated_scores) logger.info( - f"Successfully scored commentary {commentary_ref}. " - f"Average score: {sum(validated_scores.values()) / len(validated_scores):.2f}" + f"Scored commentary {commentary_ref}: explained {explained}/{total} " + f"({(explained / total * 100 if total else 0):.0f}%)" ) return result @@ -369,4 +351,4 @@ def process_commentary_by_content( commentary_ref=commentary_ref, processed_datetime=datetime.now(timezone.utc), request_status_message=f"Commentary {commentary_ref} scoring failed: {e}" - ) \ No newline at end of file + ) \ No newline at end of file From 6e4f34d230fd29a679c137360d2c75eea95a549f Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Tue, 12 Aug 2025 17:36:19 +0300 Subject: [PATCH 6/9] feat: changed the commentary_text type from List to str --- app/commentary_scoring/openai_commentary_scorer.py | 14 +++++--------- .../commentary_scoring/commentary_scoring_input.py | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/app/commentary_scoring/openai_commentary_scorer.py b/app/commentary_scoring/openai_commentary_scorer.py index cc133a1..4536ada 100644 --- a/app/commentary_scoring/openai_commentary_scorer.py +++ b/app/commentary_scoring/openai_commentary_scorer.py @@ -266,7 +266,7 @@ def _build_scoring_prompt( def process_commentary_by_content( self, - commentary_text: Union[List[str],str], + commentary_text: str, cited_refs: Dict[str, str], commentary_ref: str = "" ) -> CommentaryScoringOutput: @@ -282,13 +282,9 @@ def process_commentary_by_content( processed_datetime=datetime.now(timezone.utc), request_status_message=f"Commentary {commentary_ref} is empty. 
") - # Convert commentary text to string format - if isinstance(commentary_text, list): - commentary_text_str = to_plain_text(commentary_text) - else: - commentary_text_str = str(commentary_text) - if not commentary_text_str.strip(): + + if not commentary_text.strip(): return self._create_failure_scoring_output( commentary_ref=commentary_ref, @@ -296,7 +292,7 @@ def process_commentary_by_content( request_status_message=f"Commentary's {commentary_ref} text is empty " ) - token_count = self._count_tokens(commentary_text_str) + token_count = self._count_tokens(commentary_text) max_allowed_tokens = self.max_prompt_tokens - self.token_margin if token_count > max_allowed_tokens: @@ -313,7 +309,7 @@ def process_commentary_by_content( try: # Build prompt and schema - prompt = self._build_scoring_prompt(cited_refs, commentary_text_str) + prompt = self._build_scoring_prompt(cited_refs, commentary_text) schema = self._build_function_schema(list(cited_refs.keys())) # Get LLM response diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py index e6ed706..cb48532 100644 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py @@ -4,7 +4,7 @@ @dataclass class CommentaryScoringInput: - commentary_text: List[str] + commentary_text: str cited_refs: Dict[str, str] commentary_ref: str From cc189d4e5352f28143f412e60903a772002a50bf Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Sun, 17 Aug 2025 13:49:43 +0300 Subject: [PATCH 7/9] feat: changed sefaria-llm-interface importing from folder importing to package importing --- app/commentary_scoring/commentary_scoring.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/app/commentary_scoring/commentary_scoring.py b/app/commentary_scoring/commentary_scoring.py index 3f2530a..291be6f 100644 --- a/app/commentary_scoring/commentary_scoring.py +++ b/app/commentary_scoring/commentary_scoring.py @@ -1,8 +1,6 @@ from .openai_commentary_scorer import CommentaryScorer import os -from pathlib import Path -# TODO: change the imports when compile package -from app.llm_interface.sefaria_llm_interface.commentary_scoring import ( +from sefaria_llm_interface.commentary_scoring import ( CommentaryScoringInput, CommentaryScoringOutput, ) From 652b9342c28099aad24917e88851b14d64776918 Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Tue, 19 Aug 2025 15:44:00 +0300 Subject: [PATCH 8/9] style: - added README with explanation of the code - removed unnecessary imports from commentary_scoring_input.py, commentary_scoring_output.py - in openai_commentary_scorer.py changed the sefaria-llm-inteface importing from local folder importing to package importing; added comments to some functions; removed unnecessary spaces in functions definitions and added spaces after commas. 
same for text_utils.py - added textwrap.dedent to prompt definition - in tasks.py changed the sefaria-llm-inteface importing from local folder importing to package importing; --- app/commentary_scoring/README.md | 215 ++++++++++++++++++ .../openai_commentary_scorer.py | 135 ++++++----- app/commentary_scoring/tasks.py | 3 +- app/commentary_scoring/text_utils.py | 19 +- .../commentary_scoring_input.py | 2 +- .../commentary_scoring_output.py | 2 +- 6 files changed, 297 insertions(+), 79 deletions(-) create mode 100644 app/commentary_scoring/README.md diff --git a/app/commentary_scoring/README.md b/app/commentary_scoring/README.md new file mode 100644 index 0000000..4e69aec --- /dev/null +++ b/app/commentary_scoring/README.md @@ -0,0 +1,215 @@ +# CommentaryScorer — Commentary–Citation Analysis Tool + +**CommentaryScorer** is a Python tool that uses **LLMs** to analyze a **commentary** and determine, for **each cited base text**, whether the commentary actually **explains/interprets** it. It returns a **binary score (0/1)** per citation together with a **short rationale**. + +--- + +## ⭐ Scores Extracted + +- **Per-Citation Explanation Score**: `0` = not explained, `1` = explained +- **Per-Citation Rationale**: short reason string that begins with + `Explained spans: ''; ''` (or `'None'`) + +--- + +## 🚀 Quick Start + +```python +from commentary_scoring.commentary_scoring import score_one_commentary +from sefaria_llm_interface.commentary_scoring import CommentaryScoringInput + +inp = CommentaryScoringInput( + commentary_ref="Rashi on Genesis 1:1", + cited_refs={ + "Genesis 1:1": "In the beginning God created the heavens and the earth.", + "Genesis 1:2": "Now the earth was formless and void..." + }, + commentary_text=""" + Rashi on 'בראשית' explains sequencing/purpose and interprets terms... + """ +) + +out = score_one_commentary(inp) + +print("Scores:", out.ref_scores) +print("Reasons:", out.scores_explanation) + +``` + +## 📦 Data Structures + +### **Input — `CommentaryScoringInput`** + +```python +{ + "commentary_ref": "Rashi on Genesis 1:1", # Optional string identifier + "cited_refs": { # Dict of citation → base text + "Genesis 1:1": "In the beginning ...", + "Genesis 1:2": "Now the earth ..." + }, + "commentary_text": "Full commentary text (plain or HTML)" +} +``` + +- **commentary_ref**: identifier for the commentary (helpful for logging) +- **cited_refs**: dictionary mapping citation keys (e.g., `"Genesis 1:1"`) to their textual content +- **commentary_text**: commentary body text (string, can contain HTML, nested lists, etc.) + +--- + +### **Output — `CommentaryScoringOutput`** + +```python +{ + "commentary_ref": "Rashi on Genesis 1:1", + "ref_scores": { "Genesis 1:1": 1, "Genesis 1:2": 0 }, + "scores_explanation": { + "Genesis 1:1": "Explained spans: 'בראשית'; 'אלוקים' — Adds interpretive content ...", + "Genesis 1:2": "Explained spans: None — Only a decorative quote, no interpretation ..." 
+ }, + "processed_datetime": "2025-08-19T10:30:00Z", + "request_status": 1, + "request_status_message": "" +} +``` + +- **ref_scores**: dictionary of binary scores per citation (0 = not explained, 1 = explained) +- **scores_explanation**: dictionary of rationales per citation, each beginning with **“Explained spans”** +- **processed_datetime**: UTC ISO8601 timestamp when scoring was done +- **request_status**: `1 = success`, `0 = failure` +- **request_status_message**: error description in case of failure + +--- + +## ⚙️ Scoring System + +### **Architecture** + +The `commentary_scoring` package consists of: + +- `commentary_scoring.py` — Main API with `score_one_commentary()` +- `openai_commentary_scorer.py` — Core LLM engine (`CommentaryScorer`) +- `tasks.py` — Celery task wrapper for async processing +- `text_utils.py` — Utilities for HTML stripping and flattening +- `README.md` — Documentation + + +--- + +### **Explanation Levels** + +| Level | Description | +|-------|-------------| +| **0 — Not Explained** | Commentary does not interpret the cited text (decorative prooftext, paraphrase only, inherited interpretation). | +| **1 — Explained** | Commentary provides interpretation or explanation of any part of the cited text. | + +--- + +## 🧠 Algorithm + +### **Input Validation** +- Fail if `cited_refs` is empty or `commentary_text` is missing +- Token counting via `tiktoken` (fallback = character length) +- If too long → fail fast with `"input too long"` + +### **Build Prompt** +- Commentary text + cited refs in structured sections +- Explicit instructions for binary labeling per citation +- Require **“Explained spans”** prefix in explanations + +### **Schema Enforcement** +- OpenAI function calling schema requires: + - `ref_scores`: dict of citation → 0/1 + - `explanation`: dict of citation → rationale string + +### **LLM Invocation** +- Config: `gpt-4o-mini`, `temperature=0`, `top_p=0`, `seed=42` +- Parse structured JSON output + +### **Post-Processing** +- Clamp invalid values to `0` or `1` +- Return `CommentaryScoringOutput` + +--- + +## 🔧 Configuration Options + +### **Initialization** + +```python +from commentary_scoring.openai_commentary_scorer import CommentaryScorer + +scorer = CommentaryScorer( + api_key=os.getenv("OPENAI_API_KEY"), + model="gpt-4o-mini", # default model + max_prompt_tokens=32000, # max tokens for input prompt + token_margin=4096 # reserved for model response +) +``` + +- **API Key**: via `OPENAI_API_KEY` environment variable or explicit parameter +- **Model**: defaults to `gpt-4o-mini`, override if needed +- **Token Guardrails**: ensures commentary fits within prompt budget + +--- + +## 📜 Celery Integration + +### **Task Wrapper** + +```python +@shared_task(name='llm.score_commentary') +def score_sheet_task(raw_input: dict) -> dict: + inp = CommentaryScoringInput(**raw_input) + out = score_one_commentary(inp) + return asdict(out) +``` + +### **Usage** + +```python +from celery import signature + +payload = { + "commentary_ref": "Rashi on Genesis 1:1", + "cited_refs": {"Genesis 1:1": "...", "Genesis 1:2": "..."}, + "commentary_text": "Rashi explains ..." 
+} +sig = signature("llm.score_commentary", args=[payload], queue="llm") +print(sig.apply_async().get()) +``` + +--- + +## 📊 Output Fields + +| Field | Description | +|------------------------|--------------------------------------------------| +| `ref_scores` | Binary 0/1 scores per citation | +| `scores_explanation` | Rationale strings beginning with `"Explained spans"` | +| `commentary_ref` | Commentary identifier | +| `processed_datetime` | UTC ISO8601 timestamp | +| `request_status` | `1 = success`, `0 = failure` | +| `request_status_message` | Error message if failure | + +--- + +## 📝 Logging + +- **Info**: token count, number of citations, success summary +- **Warning**: invalid scores clamped, tokenizer fallback +- **Error**: LLM or JSON parse failures + +```python +import logging +logging.getLogger("commentary_scoring").setLevel(logging.INFO) +``` + +--- + +## ✅ Extensibility + +- By now there is no support for very long commentaries, because during testing I didn't encounter any. The chances are high that we won't need this feature at all -- but the matter should be explored. + +--- + diff --git a/app/commentary_scoring/openai_commentary_scorer.py b/app/commentary_scoring/openai_commentary_scorer.py index 4536ada..37a1ef0 100644 --- a/app/commentary_scoring/openai_commentary_scorer.py +++ b/app/commentary_scoring/openai_commentary_scorer.py @@ -6,16 +6,15 @@ import json import logging +import textwrap from datetime import datetime, timezone from enum import IntEnum -from typing import Any, Dict, List, Optional, Set, Union +from typing import Any, Dict, List, Optional, Set import tiktoken from langchain.chat_models import ChatOpenAI from langchain.schema import HumanMessage -from commentary_scoring.text_utils import to_plain_text -# TODO: change the imports when compile package -from app.llm_interface.sefaria_llm_interface.commentary_scoring import ( +from sefaria_llm_interface.commentary_scoring import ( CommentaryScoringInput, CommentaryScoringOutput, ) @@ -23,23 +22,22 @@ class ExplainsFlag(IntEnum): - NOT_EXPLAINED = 0 - EXPLAINED = 1 + """Binary flags for whether a commentary explains a cited text.""" + NOT_EXPLAINED = 0 # Commentary doesn't interpret the cited text + EXPLAINED = 1 # Commentary provides interpretation/explanation class RequestStatus(IntEnum): - """LLM's success/failure""" - - SUCCESS = 1 - FAILURE = 0 + """Status codes for LLM processing requests.""" + SUCCESS = 1 + FAILURE = 0 class LanguageCode: - """ISO 639-1 language codes.""" - - ENGLISH = "en" - HEBREW = "he" - DEFAULT = ENGLISH + """ISO 639-1 language codes for supported languages.""" + ENGLISH = "en" + HEBREW = "he" + DEFAULT = ENGLISH class CommentaryScorer: @@ -49,24 +47,25 @@ class CommentaryScorer: explanations provided by Jewish commentaries for their cited texts. 
Attributes: - model: The OpenAI model to use for scoring - max_prompt_tokens: Maximum tokens allowed in prompt - token_margin: Reserved tokens for model response - """ - - # Configuration constants - DEFAULT_MAX_OUTPUT_TOKENS = 4096 - DEFAULT_MAX_INPUT_OUTPUT_TOKENS = 32000 - DEFAULT_TOKEN_CHAR_RATIO = 3 - - # Response field names - REF_SCORE_FIELD = "ref_scores" - EXPLANATION_FIELD = "explanation" - LANGUAGE_FIELD = "language" - CITED_REF_FIELD = "cited_ref" - PROCESSED_AT_FIELD = "processed_datetime" - - # Valid explanation levels + model (str): The OpenAI model to use for scoring + max_prompt_tokens (int): Maximum tokens allowed in prompt + token_margin (int): Reserved tokens for model response + llm (ChatOpenAI): Initialized language model client + """ + + # Configuration constants for token management + DEFAULT_MAX_OUTPUT_TOKENS = 4096 # Reserve for LLM response + DEFAULT_MAX_INPUT_OUTPUT_TOKENS = 32000 # Total token budget + DEFAULT_TOKEN_CHAR_RATIO = 3 # Fallback chars-per-token estimate + + # JSON response field names for structured output + REF_SCORE_FIELD = "ref_scores" # Binary scores per citation + EXPLANATION_FIELD = "explanation" # Rationale strings per citation + LANGUAGE_FIELD = "language" # Detected language code + CITED_REF_FIELD = "cited_ref" # Citation reference key + PROCESSED_AT_FIELD = "processed_datetime" # Processing timestamp + + # Valid explanation levels for score validation VALID_LEVELS: Set[int] = \ {ExplainsFlag.NOT_EXPLAINED, ExplainsFlag.EXPLAINED} @@ -77,14 +76,15 @@ def __init__( max_prompt_tokens: int = DEFAULT_MAX_INPUT_OUTPUT_TOKENS, token_margin: int = DEFAULT_MAX_OUTPUT_TOKENS, ) -> None: - """Initialize the commentary scorer. + """Initialize the commentary scorer with OpenAI client. Args: - api_key: OpenAI API key. If None, will use environment variable - model: OpenAI model name to use - max_prompt_tokens: Maximum tokens for input prompt - token_margin: Reserved tokens for model response + api_key: OpenAI API key. If None, uses OPENAI_API_KEY env var + model: OpenAI model name (default: gpt-4o-mini for cost efficiency) + max_prompt_tokens: Maximum tokens for input prompt (includes commentary + citations) + token_margin: Reserved tokens for model response (ensures budget compliance) Raises: ValueError: If model is not supported or parameters are invalid + Exception: If OpenAI client initialization fails """ self.model = model @@ -92,15 +92,16 @@ def __init__( self.token_margin = token_margin try: + # Initialize OpenAI client with deterministic settings for consistent scoring self.llm = ChatOpenAI( model_name=model, - temperature=0, #Model temperature (0.0 for deterministic grading) + temperature=0, # Deterministic output for consistent grading openai_api_key=api_key, model_kwargs={ - "top_p": 0, - "frequency_penalty": 0, - "presence_penalty": 0, - "seed": 42, + "top_p": 0, # No nucleus sampling + "frequency_penalty": 0, # No frequency penalties + "presence_penalty": 0, # No presence penalties + "seed": 42, # Fixed seed for reproducibility }, ) except Exception as e: @@ -138,11 +139,8 @@ def _validate_level(self, score: Any) -> int: return score - def _invoke_llm( - self, - prompt: str, - function_schema: Dict[str, Any] - ) -> Dict[str, Any]: + def _invoke_llm(self, prompt: str, function_schema: Dict[str, Any]) \ + -> Dict[str, Any]: """Invoke the language model with function calling. 
""" try: @@ -167,7 +165,7 @@ def _invoke_llm( logger.error(f"LLM invocation failed: {e}") raise - def _build_function_schema(self,cited_keys: List[str]) -> Dict[str,Any]: + def _build_function_schema(self, cited_keys: List[str]) -> Dict[str, Any]: if not cited_keys: raise ValueError("cited_keys cannot be empty") @@ -209,10 +207,15 @@ def _build_function_schema(self,cited_keys: List[str]) -> Dict[str,Any]: }, } - def _create_failure_scoring_output(self, commentary_ref, - processed_datetime: datetime, - request_status_message: str) -> ( - CommentaryScoringOutput): + def _create_failure_scoring_output( + self, + commentary_ref: str, + processed_datetime: datetime, + request_status_message: str + ) -> CommentaryScoringOutput: + """Create standardized failure output for error cases. + Returns: CommentaryScoringOutput: Failure result with error details + """ logger.warning(request_status_message) return CommentaryScoringOutput( commentary_ref=commentary_ref, @@ -223,18 +226,12 @@ def _create_failure_scoring_output(self, commentary_ref, request_status=RequestStatus.FAILURE ) - def _build_scoring_prompt( - self, - cited_refs: Dict[str,str], - commentary_text: str - ) -> str: + def _build_scoring_prompt(self, cited_refs: Dict[str, str], commentary_text: str) -> str: """Build the prompt for scoring commentary explanations. """ - refs_section = "\n".join( - f"- {key}: {text}" for key,text in cited_refs.items() - ) + refs_section = "\n".join(f"- {key}: {text}" for key, text in cited_refs.items()) - return f"""You are labeling whether a commentary ACTUALLY EXPLAINS each cited text. + return textwrap.dedent(f"""You are labeling whether a commentary ACTUALLY EXPLAINS each cited text. COMMENTARY TEXT: {commentary_text} @@ -243,11 +240,11 @@ def _build_scoring_prompt( {refs_section} TASK (binary per citation): - Return 1 if the commentary provides any substantive interpretation or explanation + Return {ExplainsFlag.EXPLAINED} if the commentary provides any substantive interpretation or explanation of ANY PART of the cited text (including methodological interpretation, e.g., reading a word as a symbol) — not just quoting or paraphrasing. - Return 0 if: + Return {ExplainsFlag.NOT_EXPLAINED} if: • The citation is used for another goal (decorative, rhetorical, prooftext with no interpretation). • The commentary cites Source A only via Source B, but adds NO new interpretation of A beyond B. (Inherited interpretation does NOT count as explanation of A.) @@ -256,13 +253,13 @@ def _build_scoring_prompt( Important: • If the commentary explains only PARTS of the citation, still return 1. • In your explanation, list the exact phrases from the cited text that ARE explained (if any), - then give a concise rationale for 0/1. + then give a concise rationale for {ExplainsFlag.NOT_EXPLAINED}/{ExplainsFlag.EXPLAINED}. RETURN JSON WITH: - 1. {self.REF_SCORE_FIELD}: object of 0/1 per citation key + 1. {self.REF_SCORE_FIELD}: object of {ExplainsFlag.NOT_EXPLAINED}/{ExplainsFlag.EXPLAINED} per citation key 2. {self.EXPLANATION_FIELD}: object of brief rationales. Begin each value with: Explained spans: ''; '' (or 'None'), then 1–2 sentences of rationale. - """ + """) def process_commentary_by_content( self, @@ -282,8 +279,6 @@ def process_commentary_by_content( processed_datetime=datetime.now(timezone.utc), request_status_message=f"Commentary {commentary_ref} is empty. 
") - - if not commentary_text.strip(): return self._create_failure_scoring_output( @@ -306,7 +301,6 @@ def process_commentary_by_content( f"Processing commentary with {token_count} tokens, " f"{len(cited_refs)} citations" ) - try: # Build prompt and schema prompt = self._build_scoring_prompt(cited_refs, commentary_text) @@ -321,7 +315,6 @@ def process_commentary_by_content( key: self._validate_level(score) for key, score in raw_scores.items() } - # Create structured result result = CommentaryScoringOutput( commentary_ref=commentary_ref, diff --git a/app/commentary_scoring/tasks.py b/app/commentary_scoring/tasks.py index ff4cbd6..c270128 100644 --- a/app/commentary_scoring/tasks.py +++ b/app/commentary_scoring/tasks.py @@ -1,7 +1,6 @@ from celery import shared_task from commentary_scoring.commentary_scoring import score_one_commentary -# TODO: change the import -from app.llm_interface.sefaria_llm_interface.commentary_scoring import ( +from sefaria_llm_interface.commentary_scoring import ( CommentaryScoringInput, ) from dataclasses import asdict diff --git a/app/commentary_scoring/text_utils.py b/app/commentary_scoring/text_utils.py index 75b82a2..46f4e4c 100644 --- a/app/commentary_scoring/text_utils.py +++ b/app/commentary_scoring/text_utils.py @@ -1,23 +1,34 @@ import re from typing import Union, List +# Regular expression to match HTML tags (e.g.,
<b>, <i>, <br>
, etc.) TAG_RE = re.compile(r"<[^>]+>") def strip_html(s: str) -> str: + """ + Remove all HTML tags from a given string. + """ return TAG_RE.sub("", s) def flatten_text(x: Union[str, List, tuple]) -> str: + """ + Recursively flatten a nested structure of strings, lists, or tuples into a single string. + """ if isinstance(x, str): return x if isinstance(x, (list, tuple)): + # Recursively flatten all elements and join with spaces return " ".join(flatten_text(el) for el in x) + # If it's not a string or list/tuple, convert it to string return str(x) def to_plain_text(raw: Union[str, List, tuple]) -> str: - """Recursively flatten + remove HTML → clean unicode.""" - flat = flatten_text(raw) - clean = strip_html(flat) - return re.sub(r"\s+", " ", clean).strip() \ No newline at end of file + """ + Convert raw input (possibly nested and HTML-formatted) to clean plain text. + """ + flat = flatten_text(raw) # Step 1: Flatten nested structure + clean = strip_html(flat) # Step 2: Remove HTML tags + return re.sub(r"\s+", " ", clean).strip() # Step 3: Normalize whitespace diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py index cb48532..e6b4c2a 100644 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import List, Dict +from typing import Dict @dataclass diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py index 9248d2d..27b86b4 100644 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Dict, Union +from typing import Dict from datetime import datetime From ab6d8f26827e4c300b2791deb99127b1795996b9 Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Tue, 19 Aug 2025 15:46:23 +0300 Subject: [PATCH 9/9] style: - updated commentary_scoring init from local to package imporing --- .../sefaria_llm_interface/commentary_scoring/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py index 31e3d07..a086340 100644 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py @@ -1,4 +1,3 @@ -from .commentary_scoring_input import CommentaryScoringInput -from .commentary_scoring_output import CommentaryScoringOutput +from sefaria_llm_interface.commentary_scoring.commentary_scoring_input import * +from sefaria_llm_interface.commentary_scoring.commentary_scoring_output import * -__all__ = ["CommentaryScoringInput", "CommentaryScoringOutput"] \ No newline at end of file
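
Since `score_sheet_task` returns its result as a plain dict (via `asdict`), a caller typically rehydrates that dict back into `CommentaryScoringOutput` and branches on `request_status`. Below is a minimal sketch of that consumer side, assuming a worker is running with the Celery app configured in `app/celery_setup/app.py`; the task name `llm.score_commentary` and the `llm` queue come from `tasks.py` and the README above, while the payload values themselves are purely illustrative.

```python
from celery import signature

from sefaria_llm_interface.commentary_scoring import CommentaryScoringOutput

# Illustrative payload; field names match CommentaryScoringInput.
payload = {
    "commentary_ref": "Rashi on Genesis 1:1",
    "cited_refs": {
        "Genesis 1:1": "In the beginning God created the heavens and the earth.",
    },
    "commentary_text": "Rashi explains that the verse teaches the order of creation ...",
}

# Enqueue the task registered in app/commentary_scoring/tasks.py and wait for the dict result.
raw = signature("llm.score_commentary", args=[payload], queue="llm").apply_async().get()

# Rehydrate the dataclass; request_status is 1 on success, 0 on failure.
out = CommentaryScoringOutput(**raw)
if out.request_status:
    for ref, score in out.ref_scores.items():
        label = "explained" if score else "not explained"
        print(f"{ref}: {label} - {out.scores_explanation.get(ref, '')}")
else:
    print(f"Scoring failed: {out.request_status_message}")
```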