diff --git a/app/celery_setup/app.py b/app/celery_setup/app.py index 0b50834..d1f1f69 100644 --- a/app/celery_setup/app.py +++ b/app/celery_setup/app.py @@ -3,4 +3,7 @@ app = Celery('llm') app.conf.update(**generate_config_from_env()) -app.autodiscover_tasks(packages=['topic_prompt']) +app.autodiscover_tasks(packages=['topic_prompt', + 'commentary_scoring'] + ) + diff --git a/app/commentary_scoring/README.md b/app/commentary_scoring/README.md new file mode 100644 index 0000000..4e69aec --- /dev/null +++ b/app/commentary_scoring/README.md @@ -0,0 +1,215 @@ +# CommentaryScorer — Commentary–Citation Analysis Tool + +**CommentaryScorer** is a Python tool that uses **LLMs** to analyze a **commentary** and determine, for **each cited base text**, whether the commentary actually **explains/interprets** it. It returns a **binary score (0/1)** per citation together with a **short rationale**. + +--- + +## ⭐ Scores Extracted + +- **Per-Citation Explanation Score**: `0` = not explained, `1` = explained +- **Per-Citation Rationale**: short reason string that begins with + `Explained spans: ''; ''` (or `'None'`) + +--- + +## 🚀 Quick Start + +```python +from commentary_scoring.commentary_scoring import score_one_commentary +from sefaria_llm_interface.commentary_scoring import CommentaryScoringInput + +inp = CommentaryScoringInput( + commentary_ref="Rashi on Genesis 1:1", + cited_refs={ + "Genesis 1:1": "In the beginning God created the heavens and the earth.", + "Genesis 1:2": "Now the earth was formless and void..." + }, + commentary_text=""" + Rashi on 'בראשית' explains sequencing/purpose and interprets terms... + """ +) + +out = score_one_commentary(inp) + +print("Scores:", out.ref_scores) +print("Reasons:", out.scores_explanation) + +``` + +## 📦 Data Structures + +### **Input — `CommentaryScoringInput`** + +```python +{ + "commentary_ref": "Rashi on Genesis 1:1", # Optional string identifier + "cited_refs": { # Dict of citation → base text + "Genesis 1:1": "In the beginning ...", + "Genesis 1:2": "Now the earth ..." + }, + "commentary_text": "Full commentary text (plain or HTML)" +} +``` + +- **commentary_ref**: identifier for the commentary (helpful for logging) +- **cited_refs**: dictionary mapping citation keys (e.g., `"Genesis 1:1"`) to their textual content +- **commentary_text**: commentary body text (string, can contain HTML, nested lists, etc.) + +--- + +### **Output — `CommentaryScoringOutput`** + +```python +{ + "commentary_ref": "Rashi on Genesis 1:1", + "ref_scores": { "Genesis 1:1": 1, "Genesis 1:2": 0 }, + "scores_explanation": { + "Genesis 1:1": "Explained spans: 'בראשית'; 'אלוקים' — Adds interpretive content ...", + "Genesis 1:2": "Explained spans: None — Only a decorative quote, no interpretation ..." 
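+    # each value follows the pattern "Explained spans: <spans or None> — <brief rationale>"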
+ }, + "processed_datetime": "2025-08-19T10:30:00Z", + "request_status": 1, + "request_status_message": "" +} +``` + +- **ref_scores**: dictionary of binary scores per citation (0 = not explained, 1 = explained) +- **scores_explanation**: dictionary of rationales per citation, each beginning with **“Explained spans”** +- **processed_datetime**: UTC ISO8601 timestamp when scoring was done +- **request_status**: `1 = success`, `0 = failure` +- **request_status_message**: error description in case of failure + +--- + +## ⚙️ Scoring System + +### **Architecture** + +The `commentary_scoring` package consists of: + +- `commentary_scoring.py` — Main API with `score_one_commentary()` +- `openai_commentary_scorer.py` — Core LLM engine (`CommentaryScorer`) +- `tasks.py` — Celery task wrapper for async processing +- `text_utils.py` — Utilities for HTML stripping and flattening +- `README.md` — Documentation + + +--- + +### **Explanation Levels** + +| Level | Description | +|-------|-------------| +| **0 — Not Explained** | Commentary does not interpret the cited text (decorative prooftext, paraphrase only, inherited interpretation). | +| **1 — Explained** | Commentary provides interpretation or explanation of any part of the cited text. | + +--- + +## 🧠 Algorithm + +### **Input Validation** +- Fail if `cited_refs` is empty or `commentary_text` is missing +- Token counting via `tiktoken` (fallback = character length) +- If too long → fail fast with `"input too long"` + +### **Build Prompt** +- Commentary text + cited refs in structured sections +- Explicit instructions for binary labeling per citation +- Require **“Explained spans”** prefix in explanations + +### **Schema Enforcement** +- OpenAI function calling schema requires: + - `ref_scores`: dict of citation → 0/1 + - `explanation`: dict of citation → rationale string + +### **LLM Invocation** +- Config: `gpt-4o-mini`, `temperature=0`, `top_p=0`, `seed=42` +- Parse structured JSON output + +### **Post-Processing** +- Clamp invalid values to `0` or `1` +- Return `CommentaryScoringOutput` + +--- + +## 🔧 Configuration Options + +### **Initialization** + +```python +from commentary_scoring.openai_commentary_scorer import CommentaryScorer + +scorer = CommentaryScorer( + api_key=os.getenv("OPENAI_API_KEY"), + model="gpt-4o-mini", # default model + max_prompt_tokens=32000, # max tokens for input prompt + token_margin=4096 # reserved for model response +) +``` + +- **API Key**: via `OPENAI_API_KEY` environment variable or explicit parameter +- **Model**: defaults to `gpt-4o-mini`, override if needed +- **Token Guardrails**: ensures commentary fits within prompt budget + +--- + +## 📜 Celery Integration + +### **Task Wrapper** + +```python +@shared_task(name='llm.score_commentary') +def score_sheet_task(raw_input: dict) -> dict: + inp = CommentaryScoringInput(**raw_input) + out = score_one_commentary(inp) + return asdict(out) +``` + +### **Usage** + +```python +from celery import signature + +payload = { + "commentary_ref": "Rashi on Genesis 1:1", + "cited_refs": {"Genesis 1:1": "...", "Genesis 1:2": "..."}, + "commentary_text": "Rashi explains ..." 
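+    # keys must match the CommentaryScoringInput fields: commentary_ref, cited_refs, commentary_text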
+} +sig = signature("llm.score_commentary", args=[payload], queue="llm") +print(sig.apply_async().get()) +``` + +--- + +## 📊 Output Fields + +| Field | Description | +|------------------------|--------------------------------------------------| +| `ref_scores` | Binary 0/1 scores per citation | +| `scores_explanation` | Rationale strings beginning with `"Explained spans"` | +| `commentary_ref` | Commentary identifier | +| `processed_datetime` | UTC ISO8601 timestamp | +| `request_status` | `1 = success`, `0 = failure` | +| `request_status_message` | Error message if failure | + +--- + +## 📝 Logging + +- **Info**: token count, number of citations, success summary +- **Warning**: invalid scores clamped, tokenizer fallback +- **Error**: LLM or JSON parse failures + +```python +import logging +logging.getLogger("commentary_scoring").setLevel(logging.INFO) +``` + +--- + +## ✅ Extensibility + +- By now there is no support for very long commentaries, because during testing I didn't encounter any. The chances are high that we won't need this feature at all -- but the matter should be explored. + +--- + diff --git a/app/commentary_scoring/__init__.py b/app/commentary_scoring/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/commentary_scoring/commentary_scoring.py b/app/commentary_scoring/commentary_scoring.py new file mode 100644 index 0000000..291be6f --- /dev/null +++ b/app/commentary_scoring/commentary_scoring.py @@ -0,0 +1,17 @@ +from .openai_commentary_scorer import CommentaryScorer +import os +from sefaria_llm_interface.commentary_scoring import ( + CommentaryScoringInput, + CommentaryScoringOutput, +) + +def score_one_commentary(inp: CommentaryScoringInput) -> ( + CommentaryScoringOutput): + scorer = CommentaryScorer( + api_key=os.getenv("OPENAI_API_KEY") + ) + return scorer.process_commentary_by_content( + commentary_ref=inp.commentary_ref, + cited_refs=inp.cited_refs, + commentary_text=inp.commentary_text + ) \ No newline at end of file diff --git a/app/commentary_scoring/openai_commentary_scorer.py b/app/commentary_scoring/openai_commentary_scorer.py new file mode 100644 index 0000000..37a1ef0 --- /dev/null +++ b/app/commentary_scoring/openai_commentary_scorer.py @@ -0,0 +1,343 @@ +"""Commentary Scorer for evaluating Jewish text Commentaries. + +This module provides functionality to score how well commentaries explain +their cited texts using OpenAI's language models. +""" + +import json +import logging +import textwrap +from datetime import datetime, timezone +from enum import IntEnum +from typing import Any, Dict, List, Optional, Set + +import tiktoken +from langchain.chat_models import ChatOpenAI +from langchain.schema import HumanMessage +from sefaria_llm_interface.commentary_scoring import ( + CommentaryScoringInput, + CommentaryScoringOutput, +) +logger = logging.getLogger(__name__) + + +class ExplainsFlag(IntEnum): + """Binary flags for whether a commentary explains a cited text.""" + NOT_EXPLAINED = 0 # Commentary doesn't interpret the cited text + EXPLAINED = 1 # Commentary provides interpretation/explanation + + +class RequestStatus(IntEnum): + """Status codes for LLM processing requests.""" + SUCCESS = 1 + FAILURE = 0 + + +class LanguageCode: + """ISO 639-1 language codes for supported languages.""" + ENGLISH = "en" + HEBREW = "he" + DEFAULT = ENGLISH + + +class CommentaryScorer: + """Scores how well commentaries explain their cited texts. 
+ + This class uses OpenAI's language models to evaluate the quality of + explanations provided by Jewish commentaries for their cited texts. + + Attributes: + model (str): The OpenAI model to use for scoring + max_prompt_tokens (int): Maximum tokens allowed in prompt + token_margin (int): Reserved tokens for model response + llm (ChatOpenAI): Initialized language model client + """ + + # Configuration constants for token management + DEFAULT_MAX_OUTPUT_TOKENS = 4096 # Reserve for LLM response + DEFAULT_MAX_INPUT_OUTPUT_TOKENS = 32000 # Total token budget + DEFAULT_TOKEN_CHAR_RATIO = 3 # Fallback chars-per-token estimate + + # JSON response field names for structured output + REF_SCORE_FIELD = "ref_scores" # Binary scores per citation + EXPLANATION_FIELD = "explanation" # Rationale strings per citation + LANGUAGE_FIELD = "language" # Detected language code + CITED_REF_FIELD = "cited_ref" # Citation reference key + PROCESSED_AT_FIELD = "processed_datetime" # Processing timestamp + + # Valid explanation levels for score validation + VALID_LEVELS: Set[int] = \ + {ExplainsFlag.NOT_EXPLAINED, ExplainsFlag.EXPLAINED} + + def __init__( + self, + api_key: Optional[str] = None, + model: str = "gpt-4o-mini", + max_prompt_tokens: int = DEFAULT_MAX_INPUT_OUTPUT_TOKENS, + token_margin: int = DEFAULT_MAX_OUTPUT_TOKENS, + ) -> None: + """Initialize the commentary scorer with OpenAI client. + Args: + api_key: OpenAI API key. If None, uses OPENAI_API_KEY env var + model: OpenAI model name (default: gpt-4o-mini for cost efficiency) + max_prompt_tokens: Maximum tokens for input prompt (includes commentary + citations) + token_margin: Reserved tokens for model response (ensures budget compliance) + Raises: + ValueError: If model is not supported or parameters are invalid + Exception: If OpenAI client initialization fails + """ + + self.model = model + self.max_prompt_tokens = max_prompt_tokens + self.token_margin = token_margin + + try: + # Initialize OpenAI client with deterministic settings for consistent scoring + self.llm = ChatOpenAI( + model_name=model, + temperature=0, # Deterministic output for consistent grading + openai_api_key=api_key, + model_kwargs={ + "top_p": 0, # No nucleus sampling + "frequency_penalty": 0, # No frequency penalties + "presence_penalty": 0, # No presence penalties + "seed": 42, # Fixed seed for reproducibility + }, + ) + except Exception as e: + logger.error(f"Failed to initialize ChatOpenAI: {e}") + raise + + logger.info(f"Initialized CommentaryScorer with model {model}") + + def _count_tokens(self, text: str) -> int: + """Count tokens in text using the model's tokenizer. + """ + try: + encoding = tiktoken.encoding_for_model(self.model) + return len(encoding.encode(text)) + except Exception as e: + logger.warning(f"Failed to get exact token count: {e}") + # Fallback to character-based estimation + return max(1, len(text) // self.DEFAULT_TOKEN_CHAR_RATIO) + + def _validate_level(self, score: Any) -> int: + try: + score = int(score) + except (ValueError, TypeError): + logger.warning( + f"Invalid reference score '{score}', defaulting to 0" + ) + return ExplainsFlag.NOT_EXPLAINED + + if score not in self.VALID_LEVELS: + clamped = ExplainsFlag.EXPLAINED if score >= 1 else ExplainsFlag.NOT_EXPLAINED + logger.warning( + f"Reference score {score} out of range, clamping to {clamped}" + ) + return clamped + + return score + + def _invoke_llm(self, prompt: str, function_schema: Dict[str, Any]) \ + -> Dict[str, Any]: + """Invoke the language model with function calling. 
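+
+        Args:
+            prompt: Fully rendered scoring prompt text.
+            function_schema: Function-calling schema that constrains the response.
+
+        Returns:
+            Dict parsed from the function-call arguments, containing the
+            ref_scores and explanation objects defined by the schema.
+
+        Raises:
+            ValueError: If the response contains no function call or its
+                arguments are not valid JSON.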
+ """ + try: + response = self.llm.invoke( + [HumanMessage(content=prompt)], + functions=[function_schema], + function_call={"name": function_schema["name"]} + ) + function_call = getattr(response, "additional_kwargs", {}).get( + "function_call" + ) + if not function_call: + raise ValueError("No function call found in LLM response") + + arguments = function_call.get("arguments", "{}") + return json.loads(arguments) + + except json.JSONDecodeError as e: + logger.error(f"Failed to parse LLM response JSON: {e}") + raise ValueError(f"Invalid JSON in LLM response: {e}") + except Exception as e: + logger.error(f"LLM invocation failed: {e}") + raise + + def _build_function_schema(self, cited_keys: List[str]) -> Dict[str, Any]: + if not cited_keys: + raise ValueError("cited_keys cannot be empty") + + return { + "name": "score_multiple_explanations", + "description": "Binary labeling: does the commentary actually interpret/explain each cited text?", + "parameters": { + "type": "object", + "properties": { + self.REF_SCORE_FIELD: { + "type": "object", + "properties": { + key: {"type": "integer","minimum": 0,"maximum": 1} + for key in cited_keys + }, + "required": cited_keys, + "additionalProperties": False, + }, + self.EXPLANATION_FIELD: { + "type": "object", + "properties": { + key: { + "type": "string", + "maxLength": 150, + "description": ( + "Brief rationale. Start with: " + "\"Explained spans: ''; ''\" " + "then 1–2 sentences why 0/1." + ), + } + for key in cited_keys + }, + "required": cited_keys, + "additionalProperties": False, + }, + }, + "required": [self.REF_SCORE_FIELD,self.EXPLANATION_FIELD], + "additionalProperties": False, + }, + } + + def _create_failure_scoring_output( + self, + commentary_ref: str, + processed_datetime: datetime, + request_status_message: str + ) -> CommentaryScoringOutput: + """Create standardized failure output for error cases. + Returns: CommentaryScoringOutput: Failure result with error details + """ + logger.warning(request_status_message) + return CommentaryScoringOutput( + commentary_ref=commentary_ref, + ref_scores={}, + scores_explanation={}, + processed_datetime=str(processed_datetime), + request_status_message=request_status_message, + request_status=RequestStatus.FAILURE + ) + + def _build_scoring_prompt(self, cited_refs: Dict[str, str], commentary_text: str) -> str: + """Build the prompt for scoring commentary explanations. + """ + refs_section = "\n".join(f"- {key}: {text}" for key, text in cited_refs.items()) + + return textwrap.dedent(f"""You are labeling whether a commentary ACTUALLY EXPLAINS each cited text. + + COMMENTARY TEXT: + {commentary_text} + + CITED TEXTS: + {refs_section} + + TASK (binary per citation): + Return {ExplainsFlag.EXPLAINED} if the commentary provides any substantive interpretation or explanation + of ANY PART of the cited text (including methodological interpretation, e.g., reading a word + as a symbol) — not just quoting or paraphrasing. + + Return {ExplainsFlag.NOT_EXPLAINED} if: + • The citation is used for another goal (decorative, rhetorical, prooftext with no interpretation). + • The commentary cites Source A only via Source B, but adds NO new interpretation of A beyond B. + (Inherited interpretation does NOT count as explanation of A.) + • It merely references or paraphrases without interpreting. + + Important: + • If the commentary explains only PARTS of the citation, still return 1. 
+ • In your explanation, list the exact phrases from the cited text that ARE explained (if any), + then give a concise rationale for {ExplainsFlag.NOT_EXPLAINED}/{ExplainsFlag.EXPLAINED}. + + RETURN JSON WITH: + 1. {self.REF_SCORE_FIELD}: object of {ExplainsFlag.NOT_EXPLAINED}/{ExplainsFlag.EXPLAINED} per citation key + 2. {self.EXPLANATION_FIELD}: object of brief rationales. Begin each value with: + Explained spans: ''; '' (or 'None'), then 1–2 sentences of rationale. + """) + + def process_commentary_by_content( + self, + commentary_text: str, + cited_refs: Dict[str, str], + commentary_ref: str = "" + ) -> CommentaryScoringOutput: + """Score how well a commentary explains its cited texts. + """ + if not cited_refs: + return self._create_failure_scoring_output(commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary {commentary_ref} doesn't cite anything. ") + + if not commentary_text: + return self._create_failure_scoring_output(commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary {commentary_ref} is empty. ") + + if not commentary_text.strip(): + + return self._create_failure_scoring_output( + commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary's {commentary_ref} text is empty " + ) + + token_count = self._count_tokens(commentary_text) + max_allowed_tokens = self.max_prompt_tokens - self.token_margin + + if token_count > max_allowed_tokens: + # TODO: add long commentary support + return self._create_failure_scoring_output(commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=(f"{commentary_ref}'s input too long " + f"({token_count} tokens > {max_allowed_tokens} limit). 
")) + + logger.info( + f"Processing commentary with {token_count} tokens, " + f"{len(cited_refs)} citations" + ) + try: + # Build prompt and schema + prompt = self._build_scoring_prompt(cited_refs, commentary_text) + schema = self._build_function_schema(list(cited_refs.keys())) + + # Get LLM response + raw_response = self._invoke_llm(prompt, schema) + + # Validate and normalize scores + raw_scores = raw_response.get(self.REF_SCORE_FIELD, {}) + validated_scores = { + key: self._validate_level(score) + for key, score in raw_scores.items() + } + # Create structured result + result = CommentaryScoringOutput( + commentary_ref=commentary_ref, + ref_scores=validated_scores, + scores_explanation=raw_response.get( + self.EXPLANATION_FIELD, {} + ), + processed_datetime=str(datetime.now(timezone.utc)), + request_status_message="", + request_status=RequestStatus.SUCCESS) + + explained = sum(validated_scores.values()) + total = len(validated_scores) + logger.info( + f"Scored commentary {commentary_ref}: explained {explained}/{total} " + f"({(explained / total * 100 if total else 0):.0f}%)" + ) + + return result + + except Exception as e: + return self._create_failure_scoring_output( + commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary {commentary_ref} scoring failed: {e}" + ) \ No newline at end of file diff --git a/app/commentary_scoring/tasks.py b/app/commentary_scoring/tasks.py new file mode 100644 index 0000000..c270128 --- /dev/null +++ b/app/commentary_scoring/tasks.py @@ -0,0 +1,13 @@ +from celery import shared_task +from commentary_scoring.commentary_scoring import score_one_commentary +from sefaria_llm_interface.commentary_scoring import ( + CommentaryScoringInput, +) +from dataclasses import asdict + + +@shared_task(name='llm.score_commentary') +def score_sheet_task(raw_input: dict) -> dict: + inp = CommentaryScoringInput(**raw_input) + out = score_one_commentary(inp) + return asdict(out) \ No newline at end of file diff --git a/app/commentary_scoring/text_utils.py b/app/commentary_scoring/text_utils.py new file mode 100644 index 0000000..46f4e4c --- /dev/null +++ b/app/commentary_scoring/text_utils.py @@ -0,0 +1,34 @@ +import re +from typing import Union, List + +# Regular expression to match HTML tags (e.g.,
<br>, <b>, <i>
, etc.) +TAG_RE = re.compile(r"<[^>]+>") + + +def strip_html(s: str) -> str: + """ + Remove all HTML tags from a given string. + """ + return TAG_RE.sub("", s) + + +def flatten_text(x: Union[str, List, tuple]) -> str: + """ + Recursively flatten a nested structure of strings, lists, or tuples into a single string. + """ + if isinstance(x, str): + return x + if isinstance(x, (list, tuple)): + # Recursively flatten all elements and join with spaces + return " ".join(flatten_text(el) for el in x) + # If it's not a string or list/tuple, convert it to string + return str(x) + + +def to_plain_text(raw: Union[str, List, tuple]) -> str: + """ + Convert raw input (possibly nested and HTML-formatted) to clean plain text. + """ + flat = flatten_text(raw) # Step 1: Flatten nested structure + clean = strip_html(flat) # Step 2: Remove HTML tags + return re.sub(r"\s+", " ", clean).strip() # Step 3: Normalize whitespace diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py new file mode 100644 index 0000000..a086340 --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py @@ -0,0 +1,3 @@ +from sefaria_llm_interface.commentary_scoring.commentary_scoring_input import * +from sefaria_llm_interface.commentary_scoring.commentary_scoring_output import * + diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py new file mode 100644 index 0000000..e6b4c2a --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass +from typing import Dict + + +@dataclass +class CommentaryScoringInput: + commentary_text: str + cited_refs: Dict[str, str] + commentary_ref: str + diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py new file mode 100644 index 0000000..27b86b4 --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py @@ -0,0 +1,17 @@ +from dataclasses import dataclass +from typing import Dict +from datetime import datetime + + +@dataclass +class CommentaryScoringOutput: + commentary_ref: str + ref_scores: Dict[str, int] + scores_explanation: Dict[str, str] + processed_datetime: str + request_status_message: str + request_status: int + + def __post_init__(self): + if isinstance(self.processed_datetime, datetime): + self.processed_datetime = self.processed_datetime.isoformat() \ No newline at end of file