From 28795451a0f01c47dd80a62bb7255fa002a30f1e Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Sun, 10 Aug 2025 11:25:53 +0300 Subject: [PATCH 1/9] feat: Add LLM-based commentary scoring system - Add sefaria_llm_interface/commentary_scoring package with input/output dataclasses - Add commentary_scoring app with OpenAI-powered scoring functionality - Implement CommentaryScorer class for evaluating how well commentaries explain cited texts - Add Celery task integration for async commentary processing - Include text processing utilities for HTML stripping and content flattening - Update Celery autodiscovery to include commentary_scoring tasks --- app/celery_setup/app.py | 5 +- app/commentary_scoring/commentary_scoring.py | 34 ++ .../openai_commentary_scorer.py | 346 ++++++++++++++++++ app/commentary_scoring/tasks.py | 13 + app/commentary_scoring/text_utils.py | 23 ++ .../commentary_scoring/__init__.py | 4 + .../commentary_scoring/sheet_scoring_input.py | 8 + .../sheet_scoring_output.py | 19 + 8 files changed, 451 insertions(+), 1 deletion(-) create mode 100644 app/commentary_scoring/commentary_scoring.py create mode 100644 app/commentary_scoring/openai_commentary_scorer.py create mode 100644 app/commentary_scoring/tasks.py create mode 100644 app/commentary_scoring/text_utils.py create mode 100644 app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py create mode 100644 app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py create mode 100644 app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_output.py diff --git a/app/celery_setup/app.py b/app/celery_setup/app.py index 0b50834..d1f1f69 100644 --- a/app/celery_setup/app.py +++ b/app/celery_setup/app.py @@ -3,4 +3,7 @@ app = Celery('llm') app.conf.update(**generate_config_from_env()) -app.autodiscover_tasks(packages=['topic_prompt']) +app.autodiscover_tasks(packages=['topic_prompt', + 'commentary_scoring'] + ) + diff --git a/app/commentary_scoring/commentary_scoring.py b/app/commentary_scoring/commentary_scoring.py new file mode 100644 index 0000000..61cb196 --- /dev/null +++ b/app/commentary_scoring/commentary_scoring.py @@ -0,0 +1,34 @@ +from .openai_commentary_scorer import CommentaryScorer +import os +from pathlib import Path +from sefaria_llm_interface.commentary_scoring import ( + CommentaryScoringInput, + CommentaryScoringOutput, +) + + +def score_one_commentary(inp: CommentaryScoringInput) -> ( + CommentaryScoringOutput): + scorer = CommentaryScorer( + api_key=os.getenv("OPENAI_API_KEY") + ) + result = (scorer. + process_commentary_by_content( + commentary_ref=inp.commentary_ref, + cited_refs=inp.cited_refs, + commentary_text=inp.commentary_text + ) + ) + if not result: + return CommentaryScoringOutput( + commentary_ref=inp.commentary_ref, + ref_scores={}, + scores_explanation="", + processed_datetime=None, + ) + return CommentaryScoringOutput( + commentary_ref=inp.commentary_ref, + ref_scores=result.get(scorer.REF_SCORE_FIELD), + scores_explanation=result.get(scorer.EXPLANATION_FIELD), + processed_datetime=result.get(scorer.PROCESSED_AT_FIELD), + ) diff --git a/app/commentary_scoring/openai_commentary_scorer.py b/app/commentary_scoring/openai_commentary_scorer.py new file mode 100644 index 0000000..bd7bebc --- /dev/null +++ b/app/commentary_scoring/openai_commentary_scorer.py @@ -0,0 +1,346 @@ +"""Commentary Scorer for evaluating Jewish text Commentaries. 
+ +This module provides functionality to score how well commentaries explain +their cited texts using OpenAI's language models. +""" + +import json +import logging +from datetime import datetime, timezone +from enum import IntEnum +from typing import Any, Dict, List, Optional, Set, Union + +import tiktoken +from langchain.chat_models import ChatOpenAI +from langchain.schema import HumanMessage +from commentary_scoring.text_utils import to_plain_text + +logger = logging.getLogger(__name__) + + +class ExplanationLevel(IntEnum): + """Levels of explanation quality for commentary scoring.""" + + NO_EXPLANATION = 0 + MINIMAL = 1 + MODERATE = 2 + SIGNIFICANT = 3 + COMPREHENSIVE = 4 + + +class LanguageCode: + """ISO 639-1 language codes.""" + + ENGLISH = "en" + HEBREW = "he" + DEFAULT = ENGLISH + + +class CommentaryScorer: + """Scores how well commentaries explain their cited texts. + + This class uses OpenAI's language models to evaluate the quality of + explanations provided by Jewish commentaries for their cited texts. + + Attributes: + model: The OpenAI model to use for scoring + max_prompt_tokens: Maximum tokens allowed in prompt + token_margin: Reserved tokens for model response + """ + + # Configuration constants + DEFAULT_MAX_OUTPUT_TOKENS = 4096 + DEFAULT_MAX_INPUT_OUTPUT_TOKENS = 32000 + DEFAULT_TOKEN_CHAR_RATIO = 3 + + # Response field names + REF_SCORE_FIELD = "ref_scores" + EXPLANATION_FIELD = "explanation" + LANGUAGE_FIELD = "language" + CITED_REF_FIELD = "cited_ref" + PROCESSED_AT_FIELD = "processed_datetime" + + # Valid explanation levels + VALID_LEVELS: Set[int] = {level.value for level in ExplanationLevel} + + def __init__( + self, + api_key: Optional[str] = None, + model: str = "gpt-4o-mini", + max_prompt_tokens: int = DEFAULT_MAX_INPUT_OUTPUT_TOKENS, + token_margin: int = DEFAULT_MAX_OUTPUT_TOKENS, + ) -> None: + """Initialize the commentary scorer. + Args: + api_key: OpenAI API key. If None, will use environment variable + model: OpenAI model name to use + max_prompt_tokens: Maximum tokens for input prompt + token_margin: Reserved tokens for model response + Raises: + ValueError: If model is not supported or parameters are invalid + """ + + self.model = model + self.max_prompt_tokens = max_prompt_tokens + self.token_margin = token_margin + + try: + self.llm = ChatOpenAI( + model_name=model, + temperature=0, #Model temperature (0.0 for deterministic grading) + openai_api_key=api_key, + model_kwargs={ + "top_p": 0, + "frequency_penalty": 0, + "presence_penalty": 0, + "seed": 42, + }, + ) + except Exception as e: + logger.error(f"Failed to initialize ChatOpenAI: {e}") + raise + + logger.info(f"Initialized CommentaryScorer with model {model}") + + def _count_tokens(self, text: str) -> int: + """Count tokens in text using the model's tokenizer. + """ + try: + encoding = tiktoken.encoding_for_model(self.model) + return len(encoding.encode(text)) + except Exception as e: + logger.warning(f"Failed to get exact token count: {e}") + # Fallback to character-based estimation + return max(1, len(text) // self.DEFAULT_TOKEN_CHAR_RATIO) + + def _validate_level(self,score: Any) -> int: + """Validate and normalize explanation level score. 
+ """ + try: + score = int(score) + except (ValueError, TypeError): + logger.warning( + f"Invalid reference score '{score}', defaulting to 0" + ) + return ExplanationLevel.NO_EXPLANATION + + if score not in self.VALID_LEVELS: + clamped = max( + ExplanationLevel.NO_EXPLANATION, + min(score,ExplanationLevel.COMPREHENSIVE) + ) + logger.warning( + f"Reference score {score} out of range, clamping to {clamped}" + ) + return clamped + + return score + + def _invoke_llm( + self, + prompt: str, + function_schema: Dict[str, Any] + ) -> Dict[str, Any]: + """Invoke the language model with function calling. + """ + try: + response = self.llm.invoke( + [HumanMessage(content=prompt)], + functions=[function_schema], + function_call={"name": function_schema["name"]} + ) + function_call = getattr(response, "additional_kwargs", {}).get( + "function_call" + ) + if not function_call: + raise ValueError("No function call found in LLM response") + + arguments = function_call.get("arguments", "{}") + return json.loads(arguments) + + except json.JSONDecodeError as e: + logger.error(f"Failed to parse LLM response JSON: {e}") + raise ValueError(f"Invalid JSON in LLM response: {e}") + except Exception as e: + logger.error(f"LLM invocation failed: {e}") + raise + + def _build_function_schema(self, cited_keys: List[str]) -> Dict[str,Any]: + """Build JSON schema for function calling. + + Args: + cited_keys: List of citation keys to score + + Returns: + JSON schema for the scoring function + """ + if not cited_keys: + raise ValueError("cited_keys cannot be empty") + + return { + "name": "score_multiple_explanations", + "description": "Score how well a commentary explains each cited text", + "parameters": { + "type": "object", + "properties": { + self.REF_SCORE_FIELD: { + "type": "object", + "properties": { + key: { + "type": "integer", + "minimum": ExplanationLevel.NO_EXPLANATION, + "maximum": ExplanationLevel.COMPREHENSIVE + } + for key in cited_keys + }, + "required": cited_keys, + "additionalProperties": False + }, + self.EXPLANATION_FIELD: { + "type": "object", + "properties": { + key: { + "type": "string", + "maxLength": 200, + "description": f"Explanation for {key} score (1-2 sentences)" + } + for key in cited_keys + }, + "required": cited_keys, + "additionalProperties": False + } + }, + "required": [ + self.REF_SCORE_FIELD, + self.EXPLANATION_FIELD + ], + "additionalProperties": False + } + } + + def _build_scoring_prompt( + self, + cited_refs: Dict[str,str], + commentary_text: str + ) -> str: + """Build the prompt for scoring commentary explanations. + + Args: + cited_refs: Mapping of reference keys to cited texts + commentary_text: The commentary text to evaluate + + Returns: + Formatted prompt string + """ + refs_section = "\n".join( + f"- {key}: {text}" for key,text in cited_refs.items() + ) + + return f"""You are an expert evaluator of Jewish commentary quality. 
+ +COMMENTARY TEXT: +{commentary_text} + +CITED TEXTS TO EVALUATE: +{refs_section} + +TASK: For each cited text, score (0-4) how well the commentary explains it: + +SCORING SCALE: +{ExplanationLevel.NO_EXPLANATION}: NO EXPLANATION - Citation used for unrelated point +{ExplanationLevel.MINIMAL}: MINIMAL - Text merely paraphrased or mentioned +{ExplanationLevel.MODERATE}: MODERATE - Commentary shares theme but doesn't explain text +{ExplanationLevel.SIGNIFICANT}: SIGNIFICANT - Citation is main focus with meaningful explanation +{ExplanationLevel.COMPREHENSIVE}: COMPREHENSIVE - Deep, thorough explanation that fully illuminates the text + +RETURN JSON WITH: +1. {self.REF_SCORE_FIELD}: Object mapping each citation key to score (0-4) +2. {self.EXPLANATION_FIELD}: Object mapping each key to brief explanation (1-2 sentences) + +Be precise and consistent in your scoring.""" + + def process_commentary_by_content( + self, + commentary_text: Union[List[str],str], + cited_refs: Dict[str,str], + commentary_ref: str = "" + ) -> Optional[Dict[str, Any]]: + """Score how well a commentary explains its cited texts. + """ + if not cited_refs: + logger.info( + f"Commentary {commentary_ref} doesn't cite anything. " + f"Defaulting to None" + ) + return None + + if not commentary_text: + logger.info( + f"Commentary {commentary_ref} is empty. " + f"Defaulting to None" + ) + return None + + # Convert commentary text to string format + if isinstance(commentary_text,list): + commentary_text_str = to_plain_text(commentary_text) + else: + commentary_text_str = str(commentary_text) + + if not commentary_text_str.strip(): + logger.warning(f"Commentary's {commentary_ref} text is empty " + f"after processing") + + return None + + token_count = self._count_tokens(commentary_text_str) + max_allowed_tokens = self.max_prompt_tokens - self.token_margin + + if token_count > max_allowed_tokens: + # TODO: add long commentary support + logger.warning( + f"{commentary_ref}'s input too long " + f"({token_count} tokens > {max_allowed_tokens} limit). " + "Skipping scoring." + ) + return None + + logger.info( + f"Processing commentary with {token_count} tokens, " + f"{len(cited_refs)} citations" + ) + + try: + # Build prompt and schema + prompt = self._build_scoring_prompt(cited_refs, commentary_text_str) + schema = self._build_function_schema(list(cited_refs.keys())) + + # Get LLM response + raw_response = self._invoke_llm(prompt, schema) + + # Validate and normalize scores + raw_scores = raw_response.get(self.REF_SCORE_FIELD, {}) + validated_scores = { + key: self._validate_level(score) + for key, score in raw_scores.items() + } + + # Create structured result + result = { + self.REF_SCORE_FIELD: validated_scores, + self.EXPLANATION_FIELD: raw_response.get( + self.EXPLANATION_FIELD, {} + ), + self.PROCESSED_AT_FIELD: datetime.now(timezone.utc), + } + + logger.info( + f"Successfully scored commentary {commentary_ref}. 
" + f"Average score: {sum(validated_scores.values()) / len(validated_scores):.2f}" + ) + + return result + + except Exception as e: + logger.error(f"Commentary {commentary_ref} scoring failed: {e}") + return None diff --git a/app/commentary_scoring/tasks.py b/app/commentary_scoring/tasks.py new file mode 100644 index 0000000..4215694 --- /dev/null +++ b/app/commentary_scoring/tasks.py @@ -0,0 +1,13 @@ +from celery import shared_task +from commentary_scoring.commentary_scoring import score_one_commentary +from sefaria_llm_interface.commentary_scoring import ( + CommentaryScoringInput +) +from dataclasses import asdict + + +@shared_task(name='llm.score_commentary') +def score_sheet_task(raw_input: dict) -> dict: + inp = CommentaryScoringInput(**raw_input) + out = score_one_commentary(inp) + return asdict(out) \ No newline at end of file diff --git a/app/commentary_scoring/text_utils.py b/app/commentary_scoring/text_utils.py new file mode 100644 index 0000000..75b82a2 --- /dev/null +++ b/app/commentary_scoring/text_utils.py @@ -0,0 +1,23 @@ +import re +from typing import Union, List + +TAG_RE = re.compile(r"<[^>]+>") + + +def strip_html(s: str) -> str: + return TAG_RE.sub("", s) + + +def flatten_text(x: Union[str, List, tuple]) -> str: + if isinstance(x, str): + return x + if isinstance(x, (list, tuple)): + return " ".join(flatten_text(el) for el in x) + return str(x) + + +def to_plain_text(raw: Union[str, List, tuple]) -> str: + """Recursively flatten + remove HTML → clean unicode.""" + flat = flatten_text(raw) + clean = strip_html(flat) + return re.sub(r"\s+", " ", clean).strip() \ No newline at end of file diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py new file mode 100644 index 0000000..5758e1d --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py @@ -0,0 +1,4 @@ +from .sheet_scoring_input import SheetScoringInput +from .sheet_scoring_output import SheetScoringOutput + +__all__ = ["SheetScoringInput", "SheetScoringOutput"] \ No newline at end of file diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py new file mode 100644 index 0000000..acff7e8 --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass +from typing import Any + + +@dataclass +class SheetScoringInput: + sheet_content: dict[str, Any] + diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_output.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_output.py new file mode 100644 index 0000000..14bccd1 --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_output.py @@ -0,0 +1,19 @@ +from dataclasses import dataclass +from typing import Dict, Union +from datetime import datetime + + +@dataclass +class SheetScoringOutput: + sheet_id: str + processed_datetime: str + language: str + title_interest_level: int + title_interest_reason: str + creativity_score: float + ref_levels: Dict[str, int] + ref_scores: Dict[str, float] + + def __post_init__(self): + if isinstance(self.processed_datetime, datetime): + self.processed_datetime = self.processed_datetime.isoformat() \ No newline at end of file From e05f3d910f59e418397bec090a04dd9346af46e9 Mon Sep 17 00:00:00 2001 From: 
morganizzzm Date: Sun, 10 Aug 2025 11:26:54 +0300 Subject: [PATCH 2/9] feat: adding init to commentary_scoring package --- app/commentary_scoring/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 app/commentary_scoring/__init__.py diff --git a/app/commentary_scoring/__init__.py b/app/commentary_scoring/__init__.py new file mode 100644 index 0000000..e69de29 From 973f6a40adb3ee7cb8c907d1283fe0822b8a675b Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Sun, 10 Aug 2025 11:50:40 +0300 Subject: [PATCH 3/9] feat: added by mistake to previous commit sheet_scoring input/output classes instead of commit_scoring --- .../commentary_scoring/__init__.py | 6 +++--- .../commentary_scoring/commentary_scoring_input.py | 10 ++++++++++ ...coring_output.py => commentary_scoring_output.py} | 12 ++++-------- .../commentary_scoring/sheet_scoring_input.py | 8 -------- 4 files changed, 17 insertions(+), 19 deletions(-) create mode 100644 app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py rename app/llm_interface/sefaria_llm_interface/commentary_scoring/{sheet_scoring_output.py => commentary_scoring_output.py} (58%) delete mode 100644 app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py index 5758e1d..31e3d07 100644 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py @@ -1,4 +1,4 @@ -from .sheet_scoring_input import SheetScoringInput -from .sheet_scoring_output import SheetScoringOutput +from .commentary_scoring_input import CommentaryScoringInput +from .commentary_scoring_output import CommentaryScoringOutput -__all__ = ["SheetScoringInput", "SheetScoringOutput"] \ No newline at end of file +__all__ = ["CommentaryScoringInput", "CommentaryScoringOutput"] \ No newline at end of file diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py new file mode 100644 index 0000000..e6ed706 --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass +from typing import List, Dict + + +@dataclass +class CommentaryScoringInput: + commentary_text: List[str] + cited_refs: Dict[str, str] + commentary_ref: str + diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_output.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py similarity index 58% rename from app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_output.py rename to app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py index 14bccd1..4386fd9 100644 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_output.py +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py @@ -4,15 +4,11 @@ @dataclass -class SheetScoringOutput: - sheet_id: str +class CommentaryScoringOutput: + commentary_ref: str + ref_scores: Dict[str, int] + scores_explanation: Dict[str, str] processed_datetime: str - language: str - title_interest_level: int - title_interest_reason: str - creativity_score: float - ref_levels: Dict[str, int] - 
ref_scores: Dict[str, float] def __post_init__(self): if isinstance(self.processed_datetime, datetime): diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py deleted file mode 100644 index acff7e8..0000000 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/sheet_scoring_input.py +++ /dev/null @@ -1,8 +0,0 @@ -from dataclasses import dataclass -from typing import Any - - -@dataclass -class SheetScoringInput: - sheet_content: dict[str, Any] - From a598a9185c130a644ec32760263cbde46de6b083 Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Mon, 11 Aug 2025 17:05:21 +0300 Subject: [PATCH 4/9] feat: - added to CommentaryScoringOutput debugging fields: request_status and request_status_message - updated CommentaryScorer, so it will return CommentaryScoringOutput instead of dictionary; this update also influences commentary_scoring.py NOTE: by now importing from sefaria-llm-interface are local and not package-style, since the version with necessary files was not yet released --- app/commentary_scoring/commentary_scoring.py | 23 +----- .../openai_commentary_scorer.py | 82 ++++++++++++------- app/commentary_scoring/tasks.py | 5 +- .../commentary_scoring_output.py | 2 + 4 files changed, 63 insertions(+), 49 deletions(-) diff --git a/app/commentary_scoring/commentary_scoring.py b/app/commentary_scoring/commentary_scoring.py index 61cb196..3f2530a 100644 --- a/app/commentary_scoring/commentary_scoring.py +++ b/app/commentary_scoring/commentary_scoring.py @@ -1,34 +1,19 @@ from .openai_commentary_scorer import CommentaryScorer import os from pathlib import Path -from sefaria_llm_interface.commentary_scoring import ( +# TODO: change the imports when compile package +from app.llm_interface.sefaria_llm_interface.commentary_scoring import ( CommentaryScoringInput, CommentaryScoringOutput, ) - def score_one_commentary(inp: CommentaryScoringInput) -> ( CommentaryScoringOutput): scorer = CommentaryScorer( api_key=os.getenv("OPENAI_API_KEY") ) - result = (scorer. 
- process_commentary_by_content( + return scorer.process_commentary_by_content( commentary_ref=inp.commentary_ref, cited_refs=inp.cited_refs, commentary_text=inp.commentary_text - ) - ) - if not result: - return CommentaryScoringOutput( - commentary_ref=inp.commentary_ref, - ref_scores={}, - scores_explanation="", - processed_datetime=None, - ) - return CommentaryScoringOutput( - commentary_ref=inp.commentary_ref, - ref_scores=result.get(scorer.REF_SCORE_FIELD), - scores_explanation=result.get(scorer.EXPLANATION_FIELD), - processed_datetime=result.get(scorer.PROCESSED_AT_FIELD), - ) + ) \ No newline at end of file diff --git a/app/commentary_scoring/openai_commentary_scorer.py b/app/commentary_scoring/openai_commentary_scorer.py index bd7bebc..1b13f60 100644 --- a/app/commentary_scoring/openai_commentary_scorer.py +++ b/app/commentary_scoring/openai_commentary_scorer.py @@ -14,7 +14,11 @@ from langchain.chat_models import ChatOpenAI from langchain.schema import HumanMessage from commentary_scoring.text_utils import to_plain_text - +# TODO: change the imports when compile package +from app.llm_interface.sefaria_llm_interface.commentary_scoring import ( + CommentaryScoringInput, + CommentaryScoringOutput, +) logger = logging.getLogger(__name__) @@ -28,6 +32,13 @@ class ExplanationLevel(IntEnum): COMPREHENSIVE = 4 +class RequestStatus(IntEnum): + """LLM's success/failure""" + + SUCCESS = 1 + FAILURE = 0 + + class LanguageCode: """ISO 639-1 language codes.""" @@ -218,6 +229,20 @@ def _build_function_schema(self, cited_keys: List[str]) -> Dict[str,Any]: } } + def _create_failure_scoring_output(self, commentary_ref, + processed_datetime: datetime, + request_status_message: str) -> ( + CommentaryScoringOutput): + logger.warning(request_status_message) + return CommentaryScoringOutput( + commentary_ref=commentary_ref, + ref_scores={}, + scores_explanation={}, + processed_datetime=str(processed_datetime), + request_status_message=request_status_message, + request_status=RequestStatus.FAILURE + ) + def _build_scoring_prompt( self, cited_refs: Dict[str,str], @@ -264,22 +289,18 @@ def process_commentary_by_content( commentary_text: Union[List[str],str], cited_refs: Dict[str,str], commentary_ref: str = "" - ) -> Optional[Dict[str, Any]]: + ) -> CommentaryScoringOutput: """Score how well a commentary explains its cited texts. """ if not cited_refs: - logger.info( - f"Commentary {commentary_ref} doesn't cite anything. " - f"Defaulting to None" - ) - return None + return self._create_failure_scoring_output(commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary {commentary_ref} doesn't cite anything. ") if not commentary_text: - logger.info( - f"Commentary {commentary_ref} is empty. " - f"Defaulting to None" - ) - return None + return self._create_failure_scoring_output(commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary {commentary_ref} is empty. 
") # Convert commentary text to string format if isinstance(commentary_text,list): @@ -288,22 +309,22 @@ def process_commentary_by_content( commentary_text_str = str(commentary_text) if not commentary_text_str.strip(): - logger.warning(f"Commentary's {commentary_ref} text is empty " - f"after processing") - return None + return self._create_failure_scoring_output( + commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary's {commentary_ref} text is empty " + ) token_count = self._count_tokens(commentary_text_str) max_allowed_tokens = self.max_prompt_tokens - self.token_margin if token_count > max_allowed_tokens: # TODO: add long commentary support - logger.warning( - f"{commentary_ref}'s input too long " - f"({token_count} tokens > {max_allowed_tokens} limit). " - "Skipping scoring." - ) - return None + return self._create_failure_scoring_output(commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=(f"{commentary_ref}'s input too long " + f"({token_count} tokens > {max_allowed_tokens} limit). ")) logger.info( f"Processing commentary with {token_count} tokens, " @@ -326,13 +347,15 @@ def process_commentary_by_content( } # Create structured result - result = { - self.REF_SCORE_FIELD: validated_scores, - self.EXPLANATION_FIELD: raw_response.get( + result = CommentaryScoringOutput( + commentary_ref=commentary_ref, + ref_scores=validated_scores, + scores_explanation=raw_response.get( self.EXPLANATION_FIELD, {} ), - self.PROCESSED_AT_FIELD: datetime.now(timezone.utc), - } + processed_datetime=str(datetime.now(timezone.utc)), + request_status_message="", + request_status=RequestStatus.SUCCESS) logger.info( f"Successfully scored commentary {commentary_ref}. 
" @@ -342,5 +365,8 @@ def process_commentary_by_content( return result except Exception as e: - logger.error(f"Commentary {commentary_ref} scoring failed: {e}") - return None + return self._create_failure_scoring_output( + commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary {commentary_ref} scoring failed: {e}" + ) \ No newline at end of file diff --git a/app/commentary_scoring/tasks.py b/app/commentary_scoring/tasks.py index 4215694..ff4cbd6 100644 --- a/app/commentary_scoring/tasks.py +++ b/app/commentary_scoring/tasks.py @@ -1,7 +1,8 @@ from celery import shared_task from commentary_scoring.commentary_scoring import score_one_commentary -from sefaria_llm_interface.commentary_scoring import ( - CommentaryScoringInput +# TODO: change the import +from app.llm_interface.sefaria_llm_interface.commentary_scoring import ( + CommentaryScoringInput, ) from dataclasses import asdict diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py index 4386fd9..9248d2d 100644 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py @@ -9,6 +9,8 @@ class CommentaryScoringOutput: ref_scores: Dict[str, int] scores_explanation: Dict[str, str] processed_datetime: str + request_status_message: str + request_status: int def __post_init__(self): if isinstance(self.processed_datetime, datetime): From 8a54f5834acc78a25e710476a92dad4a94d15eac Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Tue, 12 Aug 2025 12:39:13 +0300 Subject: [PATCH 5/9] =?UTF-8?q?feat(llm/commentary=5Fscoring):=20switch=20?= =?UTF-8?q?to=20binary=20=E2=80=9Cexplains=20or=20not=E2=80=9D=20labeling?= =?UTF-8?q?=20(0/1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace 0–4 ExplanationLevel with binary ExplainsFlag {0: NOT_EXPLAINED, 1: EXPLAINED} - Clamp/validate scores to 0/1 in _validate_level - Update function-calling JSON schema to minimum: 0, maximum: 1 per cited key - Rewrite prompt to policy: -- Return 1 if the commentary provides any substantive interpretation of any part of the citation (incl. methodological/kabbalistic reads) -- Return 0 if citation is decorative/prooftext/only paraphrased -- If A is cited only via B and C adds no new interpretation of A beyond B → 0 -- Partial coverage still counts as 1 - Explanations: ask model to begin each rationale with Explained spans: ''; ... then 1–2 sentence justification (no schema change) - Logging: report explained X/Y (Z%) instead of average 0–4 -- BREAKING BEHAVIOR: numeric scale semantics changed from graded (0–4) to binary (0/1). 
--- .../openai_commentary_scorer.py | 128 ++++++++---------- 1 file changed, 55 insertions(+), 73 deletions(-) diff --git a/app/commentary_scoring/openai_commentary_scorer.py b/app/commentary_scoring/openai_commentary_scorer.py index 1b13f60..cc133a1 100644 --- a/app/commentary_scoring/openai_commentary_scorer.py +++ b/app/commentary_scoring/openai_commentary_scorer.py @@ -22,14 +22,9 @@ logger = logging.getLogger(__name__) -class ExplanationLevel(IntEnum): - """Levels of explanation quality for commentary scoring.""" - - NO_EXPLANATION = 0 - MINIMAL = 1 - MODERATE = 2 - SIGNIFICANT = 3 - COMPREHENSIVE = 4 +class ExplainsFlag(IntEnum): + NOT_EXPLAINED = 0 + EXPLAINED = 1 class RequestStatus(IntEnum): @@ -72,7 +67,8 @@ class CommentaryScorer: PROCESSED_AT_FIELD = "processed_datetime" # Valid explanation levels - VALID_LEVELS: Set[int] = {level.value for level in ExplanationLevel} + VALID_LEVELS: Set[int] = \ + {ExplainsFlag.NOT_EXPLAINED, ExplainsFlag.EXPLAINED} def __init__( self, @@ -124,25 +120,20 @@ def _count_tokens(self, text: str) -> int: # Fallback to character-based estimation return max(1, len(text) // self.DEFAULT_TOKEN_CHAR_RATIO) - def _validate_level(self,score: Any) -> int: - """Validate and normalize explanation level score. - """ + def _validate_level(self, score: Any) -> int: try: score = int(score) except (ValueError, TypeError): logger.warning( f"Invalid reference score '{score}', defaulting to 0" ) - return ExplanationLevel.NO_EXPLANATION + return ExplainsFlag.NOT_EXPLAINED if score not in self.VALID_LEVELS: - clamped = max( - ExplanationLevel.NO_EXPLANATION, - min(score,ExplanationLevel.COMPREHENSIVE) - ) + clamped = ExplainsFlag.EXPLAINED if score >= 1 else ExplainsFlag.NOT_EXPLAINED logger.warning( f"Reference score {score} out of range, clamping to {clamped}" - ) + ) return clamped return score @@ -176,57 +167,46 @@ def _invoke_llm( logger.error(f"LLM invocation failed: {e}") raise - def _build_function_schema(self, cited_keys: List[str]) -> Dict[str,Any]: - """Build JSON schema for function calling. - - Args: - cited_keys: List of citation keys to score - - Returns: - JSON schema for the scoring function - """ + def _build_function_schema(self,cited_keys: List[str]) -> Dict[str,Any]: if not cited_keys: raise ValueError("cited_keys cannot be empty") return { "name": "score_multiple_explanations", - "description": "Score how well a commentary explains each cited text", + "description": "Binary labeling: does the commentary actually interpret/explain each cited text?", "parameters": { "type": "object", "properties": { self.REF_SCORE_FIELD: { "type": "object", "properties": { - key: { - "type": "integer", - "minimum": ExplanationLevel.NO_EXPLANATION, - "maximum": ExplanationLevel.COMPREHENSIVE - } + key: {"type": "integer","minimum": 0,"maximum": 1} for key in cited_keys }, "required": cited_keys, - "additionalProperties": False + "additionalProperties": False, }, self.EXPLANATION_FIELD: { "type": "object", "properties": { key: { "type": "string", - "maxLength": 200, - "description": f"Explanation for {key} score (1-2 sentences)" + "maxLength": 150, + "description": ( + "Brief rationale. Start with: " + "\"Explained spans: ''; ''\" " + "then 1–2 sentences why 0/1." 
+ ), } for key in cited_keys }, "required": cited_keys, - "additionalProperties": False - } + "additionalProperties": False, + }, }, - "required": [ - self.REF_SCORE_FIELD, - self.EXPLANATION_FIELD - ], - "additionalProperties": False - } + "required": [self.REF_SCORE_FIELD,self.EXPLANATION_FIELD], + "additionalProperties": False, + }, } def _create_failure_scoring_output(self, commentary_ref, @@ -249,45 +229,45 @@ def _build_scoring_prompt( commentary_text: str ) -> str: """Build the prompt for scoring commentary explanations. - - Args: - cited_refs: Mapping of reference keys to cited texts - commentary_text: The commentary text to evaluate - - Returns: - Formatted prompt string """ refs_section = "\n".join( f"- {key}: {text}" for key,text in cited_refs.items() - ) + ) - return f"""You are an expert evaluator of Jewish commentary quality. + return f"""You are labeling whether a commentary ACTUALLY EXPLAINS each cited text. -COMMENTARY TEXT: -{commentary_text} + COMMENTARY TEXT: + {commentary_text} -CITED TEXTS TO EVALUATE: -{refs_section} + CITED TEXTS: + {refs_section} -TASK: For each cited text, score (0-4) how well the commentary explains it: + TASK (binary per citation): + Return 1 if the commentary provides any substantive interpretation or explanation + of ANY PART of the cited text (including methodological interpretation, e.g., reading a word + as a symbol) — not just quoting or paraphrasing. -SCORING SCALE: -{ExplanationLevel.NO_EXPLANATION}: NO EXPLANATION - Citation used for unrelated point -{ExplanationLevel.MINIMAL}: MINIMAL - Text merely paraphrased or mentioned -{ExplanationLevel.MODERATE}: MODERATE - Commentary shares theme but doesn't explain text -{ExplanationLevel.SIGNIFICANT}: SIGNIFICANT - Citation is main focus with meaningful explanation -{ExplanationLevel.COMPREHENSIVE}: COMPREHENSIVE - Deep, thorough explanation that fully illuminates the text + Return 0 if: + • The citation is used for another goal (decorative, rhetorical, prooftext with no interpretation). + • The commentary cites Source A only via Source B, but adds NO new interpretation of A beyond B. + (Inherited interpretation does NOT count as explanation of A.) + • It merely references or paraphrases without interpreting. -RETURN JSON WITH: -1. {self.REF_SCORE_FIELD}: Object mapping each citation key to score (0-4) -2. {self.EXPLANATION_FIELD}: Object mapping each key to brief explanation (1-2 sentences) + Important: + • If the commentary explains only PARTS of the citation, still return 1. + • In your explanation, list the exact phrases from the cited text that ARE explained (if any), + then give a concise rationale for 0/1. -Be precise and consistent in your scoring.""" + RETURN JSON WITH: + 1. {self.REF_SCORE_FIELD}: object of 0/1 per citation key + 2. {self.EXPLANATION_FIELD}: object of brief rationales. Begin each value with: + Explained spans: ''; '' (or 'None'), then 1–2 sentences of rationale. + """ def process_commentary_by_content( self, commentary_text: Union[List[str],str], - cited_refs: Dict[str,str], + cited_refs: Dict[str, str], commentary_ref: str = "" ) -> CommentaryScoringOutput: """Score how well a commentary explains its cited texts. @@ -303,7 +283,7 @@ def process_commentary_by_content( request_status_message=f"Commentary {commentary_ref} is empty. 
") # Convert commentary text to string format - if isinstance(commentary_text,list): + if isinstance(commentary_text, list): commentary_text_str = to_plain_text(commentary_text) else: commentary_text_str = str(commentary_text) @@ -357,9 +337,11 @@ def process_commentary_by_content( request_status_message="", request_status=RequestStatus.SUCCESS) + explained = sum(validated_scores.values()) + total = len(validated_scores) logger.info( - f"Successfully scored commentary {commentary_ref}. " - f"Average score: {sum(validated_scores.values()) / len(validated_scores):.2f}" + f"Scored commentary {commentary_ref}: explained {explained}/{total} " + f"({(explained / total * 100 if total else 0):.0f}%)" ) return result @@ -369,4 +351,4 @@ def process_commentary_by_content( commentary_ref=commentary_ref, processed_datetime=datetime.now(timezone.utc), request_status_message=f"Commentary {commentary_ref} scoring failed: {e}" - ) \ No newline at end of file + ) \ No newline at end of file From 6e4f34d230fd29a679c137360d2c75eea95a549f Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Tue, 12 Aug 2025 17:36:19 +0300 Subject: [PATCH 6/9] feat: changed the commentary_text type from List to str --- app/commentary_scoring/openai_commentary_scorer.py | 14 +++++--------- .../commentary_scoring/commentary_scoring_input.py | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/app/commentary_scoring/openai_commentary_scorer.py b/app/commentary_scoring/openai_commentary_scorer.py index cc133a1..4536ada 100644 --- a/app/commentary_scoring/openai_commentary_scorer.py +++ b/app/commentary_scoring/openai_commentary_scorer.py @@ -266,7 +266,7 @@ def _build_scoring_prompt( def process_commentary_by_content( self, - commentary_text: Union[List[str],str], + commentary_text: str, cited_refs: Dict[str, str], commentary_ref: str = "" ) -> CommentaryScoringOutput: @@ -282,13 +282,9 @@ def process_commentary_by_content( processed_datetime=datetime.now(timezone.utc), request_status_message=f"Commentary {commentary_ref} is empty. 
") - # Convert commentary text to string format - if isinstance(commentary_text, list): - commentary_text_str = to_plain_text(commentary_text) - else: - commentary_text_str = str(commentary_text) - if not commentary_text_str.strip(): + + if not commentary_text.strip(): return self._create_failure_scoring_output( commentary_ref=commentary_ref, @@ -296,7 +292,7 @@ def process_commentary_by_content( request_status_message=f"Commentary's {commentary_ref} text is empty " ) - token_count = self._count_tokens(commentary_text_str) + token_count = self._count_tokens(commentary_text) max_allowed_tokens = self.max_prompt_tokens - self.token_margin if token_count > max_allowed_tokens: @@ -313,7 +309,7 @@ def process_commentary_by_content( try: # Build prompt and schema - prompt = self._build_scoring_prompt(cited_refs, commentary_text_str) + prompt = self._build_scoring_prompt(cited_refs, commentary_text) schema = self._build_function_schema(list(cited_refs.keys())) # Get LLM response diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py index e6ed706..cb48532 100644 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py @@ -4,7 +4,7 @@ @dataclass class CommentaryScoringInput: - commentary_text: List[str] + commentary_text: str cited_refs: Dict[str, str] commentary_ref: str From cc189d4e5352f28143f412e60903a772002a50bf Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Sun, 17 Aug 2025 13:49:43 +0300 Subject: [PATCH 7/9] feat: changed sefaria-llm-interface importing from folder importing to package importing --- app/commentary_scoring/commentary_scoring.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/app/commentary_scoring/commentary_scoring.py b/app/commentary_scoring/commentary_scoring.py index 3f2530a..291be6f 100644 --- a/app/commentary_scoring/commentary_scoring.py +++ b/app/commentary_scoring/commentary_scoring.py @@ -1,8 +1,6 @@ from .openai_commentary_scorer import CommentaryScorer import os -from pathlib import Path -# TODO: change the imports when compile package -from app.llm_interface.sefaria_llm_interface.commentary_scoring import ( +from sefaria_llm_interface.commentary_scoring import ( CommentaryScoringInput, CommentaryScoringOutput, ) From 652b9342c28099aad24917e88851b14d64776918 Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Tue, 19 Aug 2025 15:44:00 +0300 Subject: [PATCH 8/9] style: - added README with explanation of the code - removed unnecessary imports from commentary_scoring_input.py, commentary_scoring_output.py - in openai_commentary_scorer.py changed the sefaria-llm-inteface importing from local folder importing to package importing; added comments to some functions; removed unnecessary spaces in functions definitions and added spaces after commas. 
same for text_utils.py - added textwrap.dedent to prompt definition - in tasks.py changed the sefaria-llm-inteface importing from local folder importing to package importing; --- app/commentary_scoring/README.md | 215 ++++++++++++++++++ .../openai_commentary_scorer.py | 135 ++++++----- app/commentary_scoring/tasks.py | 3 +- app/commentary_scoring/text_utils.py | 19 +- .../commentary_scoring_input.py | 2 +- .../commentary_scoring_output.py | 2 +- 6 files changed, 297 insertions(+), 79 deletions(-) create mode 100644 app/commentary_scoring/README.md diff --git a/app/commentary_scoring/README.md b/app/commentary_scoring/README.md new file mode 100644 index 0000000..4e69aec --- /dev/null +++ b/app/commentary_scoring/README.md @@ -0,0 +1,215 @@ +# CommentaryScorer — Commentary–Citation Analysis Tool + +**CommentaryScorer** is a Python tool that uses **LLMs** to analyze a **commentary** and determine, for **each cited base text**, whether the commentary actually **explains/interprets** it. It returns a **binary score (0/1)** per citation together with a **short rationale**. + +--- + +## ⭐ Scores Extracted + +- **Per-Citation Explanation Score**: `0` = not explained, `1` = explained +- **Per-Citation Rationale**: short reason string that begins with + `Explained spans: ''; ''` (or `'None'`) + +--- + +## 🚀 Quick Start + +```python +from commentary_scoring.commentary_scoring import score_one_commentary +from sefaria_llm_interface.commentary_scoring import CommentaryScoringInput + +inp = CommentaryScoringInput( + commentary_ref="Rashi on Genesis 1:1", + cited_refs={ + "Genesis 1:1": "In the beginning God created the heavens and the earth.", + "Genesis 1:2": "Now the earth was formless and void..." + }, + commentary_text=""" + Rashi on 'בראשית' explains sequencing/purpose and interprets terms... + """ +) + +out = score_one_commentary(inp) + +print("Scores:", out.ref_scores) +print("Reasons:", out.scores_explanation) + +``` + +## 📦 Data Structures + +### **Input — `CommentaryScoringInput`** + +```python +{ + "commentary_ref": "Rashi on Genesis 1:1", # Optional string identifier + "cited_refs": { # Dict of citation → base text + "Genesis 1:1": "In the beginning ...", + "Genesis 1:2": "Now the earth ..." + }, + "commentary_text": "Full commentary text (plain or HTML)" +} +``` + +- **commentary_ref**: identifier for the commentary (helpful for logging) +- **cited_refs**: dictionary mapping citation keys (e.g., `"Genesis 1:1"`) to their textual content +- **commentary_text**: commentary body text (string, can contain HTML, nested lists, etc.) + +--- + +### **Output — `CommentaryScoringOutput`** + +```python +{ + "commentary_ref": "Rashi on Genesis 1:1", + "ref_scores": { "Genesis 1:1": 1, "Genesis 1:2": 0 }, + "scores_explanation": { + "Genesis 1:1": "Explained spans: 'בראשית'; 'אלוקים' — Adds interpretive content ...", + "Genesis 1:2": "Explained spans: None — Only a decorative quote, no interpretation ..." 
+ }, + "processed_datetime": "2025-08-19T10:30:00Z", + "request_status": 1, + "request_status_message": "" +} +``` + +- **ref_scores**: dictionary of binary scores per citation (0 = not explained, 1 = explained) +- **scores_explanation**: dictionary of rationales per citation, each beginning with **“Explained spans”** +- **processed_datetime**: UTC ISO8601 timestamp when scoring was done +- **request_status**: `1 = success`, `0 = failure` +- **request_status_message**: error description in case of failure + +--- + +## ⚙️ Scoring System + +### **Architecture** + +The `commentary_scoring` package consists of: + +- `commentary_scoring.py` — Main API with `score_one_commentary()` +- `openai_commentary_scorer.py` — Core LLM engine (`CommentaryScorer`) +- `tasks.py` — Celery task wrapper for async processing +- `text_utils.py` — Utilities for HTML stripping and flattening +- `README.md` — Documentation + + +--- + +### **Explanation Levels** + +| Level | Description | +|-------|-------------| +| **0 — Not Explained** | Commentary does not interpret the cited text (decorative prooftext, paraphrase only, inherited interpretation). | +| **1 — Explained** | Commentary provides interpretation or explanation of any part of the cited text. | + +--- + +## 🧠 Algorithm + +### **Input Validation** +- Fail if `cited_refs` is empty or `commentary_text` is missing +- Token counting via `tiktoken` (fallback = character length) +- If too long → fail fast with `"input too long"` + +### **Build Prompt** +- Commentary text + cited refs in structured sections +- Explicit instructions for binary labeling per citation +- Require **“Explained spans”** prefix in explanations + +### **Schema Enforcement** +- OpenAI function calling schema requires: + - `ref_scores`: dict of citation → 0/1 + - `explanation`: dict of citation → rationale string + +### **LLM Invocation** +- Config: `gpt-4o-mini`, `temperature=0`, `top_p=0`, `seed=42` +- Parse structured JSON output + +### **Post-Processing** +- Clamp invalid values to `0` or `1` +- Return `CommentaryScoringOutput` + +--- + +## 🔧 Configuration Options + +### **Initialization** + +```python +from commentary_scoring.openai_commentary_scorer import CommentaryScorer + +scorer = CommentaryScorer( + api_key=os.getenv("OPENAI_API_KEY"), + model="gpt-4o-mini", # default model + max_prompt_tokens=32000, # max tokens for input prompt + token_margin=4096 # reserved for model response +) +``` + +- **API Key**: via `OPENAI_API_KEY` environment variable or explicit parameter +- **Model**: defaults to `gpt-4o-mini`, override if needed +- **Token Guardrails**: ensures commentary fits within prompt budget + +--- + +## 📜 Celery Integration + +### **Task Wrapper** + +```python +@shared_task(name='llm.score_commentary') +def score_sheet_task(raw_input: dict) -> dict: + inp = CommentaryScoringInput(**raw_input) + out = score_one_commentary(inp) + return asdict(out) +``` + +### **Usage** + +```python +from celery import signature + +payload = { + "commentary_ref": "Rashi on Genesis 1:1", + "cited_refs": {"Genesis 1:1": "...", "Genesis 1:2": "..."}, + "commentary_text": "Rashi explains ..." 
+} +sig = signature("llm.score_commentary", args=[payload], queue="llm") +print(sig.apply_async().get()) +``` + +--- + +## 📊 Output Fields + +| Field | Description | +|------------------------|--------------------------------------------------| +| `ref_scores` | Binary 0/1 scores per citation | +| `scores_explanation` | Rationale strings beginning with `"Explained spans"` | +| `commentary_ref` | Commentary identifier | +| `processed_datetime` | UTC ISO8601 timestamp | +| `request_status` | `1 = success`, `0 = failure` | +| `request_status_message` | Error message if failure | + +--- + +## 📝 Logging + +- **Info**: token count, number of citations, success summary +- **Warning**: invalid scores clamped, tokenizer fallback +- **Error**: LLM or JSON parse failures + +```python +import logging +logging.getLogger("commentary_scoring").setLevel(logging.INFO) +``` + +--- + +## ✅ Extensibility + +- By now there is no support for very long commentaries, because during testing I didn't encounter any. The chances are high that we won't need this feature at all -- but the matter should be explored. + +--- + diff --git a/app/commentary_scoring/openai_commentary_scorer.py b/app/commentary_scoring/openai_commentary_scorer.py index 4536ada..37a1ef0 100644 --- a/app/commentary_scoring/openai_commentary_scorer.py +++ b/app/commentary_scoring/openai_commentary_scorer.py @@ -6,16 +6,15 @@ import json import logging +import textwrap from datetime import datetime, timezone from enum import IntEnum -from typing import Any, Dict, List, Optional, Set, Union +from typing import Any, Dict, List, Optional, Set import tiktoken from langchain.chat_models import ChatOpenAI from langchain.schema import HumanMessage -from commentary_scoring.text_utils import to_plain_text -# TODO: change the imports when compile package -from app.llm_interface.sefaria_llm_interface.commentary_scoring import ( +from sefaria_llm_interface.commentary_scoring import ( CommentaryScoringInput, CommentaryScoringOutput, ) @@ -23,23 +22,22 @@ class ExplainsFlag(IntEnum): - NOT_EXPLAINED = 0 - EXPLAINED = 1 + """Binary flags for whether a commentary explains a cited text.""" + NOT_EXPLAINED = 0 # Commentary doesn't interpret the cited text + EXPLAINED = 1 # Commentary provides interpretation/explanation class RequestStatus(IntEnum): - """LLM's success/failure""" - - SUCCESS = 1 - FAILURE = 0 + """Status codes for LLM processing requests.""" + SUCCESS = 1 + FAILURE = 0 class LanguageCode: - """ISO 639-1 language codes.""" - - ENGLISH = "en" - HEBREW = "he" - DEFAULT = ENGLISH + """ISO 639-1 language codes for supported languages.""" + ENGLISH = "en" + HEBREW = "he" + DEFAULT = ENGLISH class CommentaryScorer: @@ -49,24 +47,25 @@ class CommentaryScorer: explanations provided by Jewish commentaries for their cited texts. 
Attributes: - model: The OpenAI model to use for scoring - max_prompt_tokens: Maximum tokens allowed in prompt - token_margin: Reserved tokens for model response - """ - - # Configuration constants - DEFAULT_MAX_OUTPUT_TOKENS = 4096 - DEFAULT_MAX_INPUT_OUTPUT_TOKENS = 32000 - DEFAULT_TOKEN_CHAR_RATIO = 3 - - # Response field names - REF_SCORE_FIELD = "ref_scores" - EXPLANATION_FIELD = "explanation" - LANGUAGE_FIELD = "language" - CITED_REF_FIELD = "cited_ref" - PROCESSED_AT_FIELD = "processed_datetime" - - # Valid explanation levels + model (str): The OpenAI model to use for scoring + max_prompt_tokens (int): Maximum tokens allowed in prompt + token_margin (int): Reserved tokens for model response + llm (ChatOpenAI): Initialized language model client + """ + + # Configuration constants for token management + DEFAULT_MAX_OUTPUT_TOKENS = 4096 # Reserve for LLM response + DEFAULT_MAX_INPUT_OUTPUT_TOKENS = 32000 # Total token budget + DEFAULT_TOKEN_CHAR_RATIO = 3 # Fallback chars-per-token estimate + + # JSON response field names for structured output + REF_SCORE_FIELD = "ref_scores" # Binary scores per citation + EXPLANATION_FIELD = "explanation" # Rationale strings per citation + LANGUAGE_FIELD = "language" # Detected language code + CITED_REF_FIELD = "cited_ref" # Citation reference key + PROCESSED_AT_FIELD = "processed_datetime" # Processing timestamp + + # Valid explanation levels for score validation VALID_LEVELS: Set[int] = \ {ExplainsFlag.NOT_EXPLAINED, ExplainsFlag.EXPLAINED} @@ -77,14 +76,15 @@ def __init__( max_prompt_tokens: int = DEFAULT_MAX_INPUT_OUTPUT_TOKENS, token_margin: int = DEFAULT_MAX_OUTPUT_TOKENS, ) -> None: - """Initialize the commentary scorer. + """Initialize the commentary scorer with OpenAI client. Args: - api_key: OpenAI API key. If None, will use environment variable - model: OpenAI model name to use - max_prompt_tokens: Maximum tokens for input prompt - token_margin: Reserved tokens for model response + api_key: OpenAI API key. If None, uses OPENAI_API_KEY env var + model: OpenAI model name (default: gpt-4o-mini for cost efficiency) + max_prompt_tokens: Maximum tokens for input prompt (includes commentary + citations) + token_margin: Reserved tokens for model response (ensures budget compliance) Raises: ValueError: If model is not supported or parameters are invalid + Exception: If OpenAI client initialization fails """ self.model = model @@ -92,15 +92,16 @@ def __init__( self.token_margin = token_margin try: + # Initialize OpenAI client with deterministic settings for consistent scoring self.llm = ChatOpenAI( model_name=model, - temperature=0, #Model temperature (0.0 for deterministic grading) + temperature=0, # Deterministic output for consistent grading openai_api_key=api_key, model_kwargs={ - "top_p": 0, - "frequency_penalty": 0, - "presence_penalty": 0, - "seed": 42, + "top_p": 0, # No nucleus sampling + "frequency_penalty": 0, # No frequency penalties + "presence_penalty": 0, # No presence penalties + "seed": 42, # Fixed seed for reproducibility }, ) except Exception as e: @@ -138,11 +139,8 @@ def _validate_level(self, score: Any) -> int: return score - def _invoke_llm( - self, - prompt: str, - function_schema: Dict[str, Any] - ) -> Dict[str, Any]: + def _invoke_llm(self, prompt: str, function_schema: Dict[str, Any]) \ + -> Dict[str, Any]: """Invoke the language model with function calling. 
""" try: @@ -167,7 +165,7 @@ def _invoke_llm( logger.error(f"LLM invocation failed: {e}") raise - def _build_function_schema(self,cited_keys: List[str]) -> Dict[str,Any]: + def _build_function_schema(self, cited_keys: List[str]) -> Dict[str, Any]: if not cited_keys: raise ValueError("cited_keys cannot be empty") @@ -209,10 +207,15 @@ def _build_function_schema(self,cited_keys: List[str]) -> Dict[str,Any]: }, } - def _create_failure_scoring_output(self, commentary_ref, - processed_datetime: datetime, - request_status_message: str) -> ( - CommentaryScoringOutput): + def _create_failure_scoring_output( + self, + commentary_ref: str, + processed_datetime: datetime, + request_status_message: str + ) -> CommentaryScoringOutput: + """Create standardized failure output for error cases. + Returns: CommentaryScoringOutput: Failure result with error details + """ logger.warning(request_status_message) return CommentaryScoringOutput( commentary_ref=commentary_ref, @@ -223,18 +226,12 @@ def _create_failure_scoring_output(self, commentary_ref, request_status=RequestStatus.FAILURE ) - def _build_scoring_prompt( - self, - cited_refs: Dict[str,str], - commentary_text: str - ) -> str: + def _build_scoring_prompt(self, cited_refs: Dict[str, str], commentary_text: str) -> str: """Build the prompt for scoring commentary explanations. """ - refs_section = "\n".join( - f"- {key}: {text}" for key,text in cited_refs.items() - ) + refs_section = "\n".join(f"- {key}: {text}" for key, text in cited_refs.items()) - return f"""You are labeling whether a commentary ACTUALLY EXPLAINS each cited text. + return textwrap.dedent(f"""You are labeling whether a commentary ACTUALLY EXPLAINS each cited text. COMMENTARY TEXT: {commentary_text} @@ -243,11 +240,11 @@ def _build_scoring_prompt( {refs_section} TASK (binary per citation): - Return 1 if the commentary provides any substantive interpretation or explanation + Return {ExplainsFlag.EXPLAINED} if the commentary provides any substantive interpretation or explanation of ANY PART of the cited text (including methodological interpretation, e.g., reading a word as a symbol) — not just quoting or paraphrasing. - Return 0 if: + Return {ExplainsFlag.NOT_EXPLAINED} if: • The citation is used for another goal (decorative, rhetorical, prooftext with no interpretation). • The commentary cites Source A only via Source B, but adds NO new interpretation of A beyond B. (Inherited interpretation does NOT count as explanation of A.) @@ -256,13 +253,13 @@ def _build_scoring_prompt( Important: • If the commentary explains only PARTS of the citation, still return 1. • In your explanation, list the exact phrases from the cited text that ARE explained (if any), - then give a concise rationale for 0/1. + then give a concise rationale for {ExplainsFlag.NOT_EXPLAINED}/{ExplainsFlag.EXPLAINED}. RETURN JSON WITH: - 1. {self.REF_SCORE_FIELD}: object of 0/1 per citation key + 1. {self.REF_SCORE_FIELD}: object of {ExplainsFlag.NOT_EXPLAINED}/{ExplainsFlag.EXPLAINED} per citation key 2. {self.EXPLANATION_FIELD}: object of brief rationales. Begin each value with: Explained spans: ''; '' (or 'None'), then 1–2 sentences of rationale. - """ + """) def process_commentary_by_content( self, @@ -282,8 +279,6 @@ def process_commentary_by_content( processed_datetime=datetime.now(timezone.utc), request_status_message=f"Commentary {commentary_ref} is empty. 
") - - if not commentary_text.strip(): return self._create_failure_scoring_output( @@ -306,7 +301,6 @@ def process_commentary_by_content( f"Processing commentary with {token_count} tokens, " f"{len(cited_refs)} citations" ) - try: # Build prompt and schema prompt = self._build_scoring_prompt(cited_refs, commentary_text) @@ -321,7 +315,6 @@ def process_commentary_by_content( key: self._validate_level(score) for key, score in raw_scores.items() } - # Create structured result result = CommentaryScoringOutput( commentary_ref=commentary_ref, diff --git a/app/commentary_scoring/tasks.py b/app/commentary_scoring/tasks.py index ff4cbd6..c270128 100644 --- a/app/commentary_scoring/tasks.py +++ b/app/commentary_scoring/tasks.py @@ -1,7 +1,6 @@ from celery import shared_task from commentary_scoring.commentary_scoring import score_one_commentary -# TODO: change the import -from app.llm_interface.sefaria_llm_interface.commentary_scoring import ( +from sefaria_llm_interface.commentary_scoring import ( CommentaryScoringInput, ) from dataclasses import asdict diff --git a/app/commentary_scoring/text_utils.py b/app/commentary_scoring/text_utils.py index 75b82a2..46f4e4c 100644 --- a/app/commentary_scoring/text_utils.py +++ b/app/commentary_scoring/text_utils.py @@ -1,23 +1,34 @@ import re from typing import Union, List +# Regular expression to match HTML tags (e.g.,
<b>, <i>, <br>
, etc.) TAG_RE = re.compile(r"<[^>]+>") def strip_html(s: str) -> str: + """ + Remove all HTML tags from a given string. + """ return TAG_RE.sub("", s) def flatten_text(x: Union[str, List, tuple]) -> str: + """ + Recursively flatten a nested structure of strings, lists, or tuples into a single string. + """ if isinstance(x, str): return x if isinstance(x, (list, tuple)): + # Recursively flatten all elements and join with spaces return " ".join(flatten_text(el) for el in x) + # If it's not a string or list/tuple, convert it to string return str(x) def to_plain_text(raw: Union[str, List, tuple]) -> str: - """Recursively flatten + remove HTML → clean unicode.""" - flat = flatten_text(raw) - clean = strip_html(flat) - return re.sub(r"\s+", " ", clean).strip() \ No newline at end of file + """ + Convert raw input (possibly nested and HTML-formatted) to clean plain text. + """ + flat = flatten_text(raw) # Step 1: Flatten nested structure + clean = strip_html(flat) # Step 2: Remove HTML tags + return re.sub(r"\s+", " ", clean).strip() # Step 3: Normalize whitespace diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py index cb48532..e6b4c2a 100644 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import List, Dict +from typing import Dict @dataclass diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py index 9248d2d..27b86b4 100644 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Dict, Union +from typing import Dict from datetime import datetime From ab6d8f26827e4c300b2791deb99127b1795996b9 Mon Sep 17 00:00:00 2001 From: morganizzzm Date: Tue, 19 Aug 2025 15:46:23 +0300 Subject: [PATCH 9/9] style: - updated commentary_scoring init from local to package imporing --- .../sefaria_llm_interface/commentary_scoring/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py index 31e3d07..a086340 100644 --- a/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py @@ -1,4 +1,3 @@ -from .commentary_scoring_input import CommentaryScoringInput -from .commentary_scoring_output import CommentaryScoringOutput +from sefaria_llm_interface.commentary_scoring.commentary_scoring_input import * +from sefaria_llm_interface.commentary_scoring.commentary_scoring_output import * -__all__ = ["CommentaryScoringInput", "CommentaryScoringOutput"] \ No newline at end of file
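
Since `score_sheet_task` returns its result as a plain dict (via `asdict`), a caller typically rehydrates that dict back into `CommentaryScoringOutput` and branches on `request_status`. Below is a minimal sketch of that consumer side, assuming a worker is running with the Celery app configured in `app/celery_setup/app.py`; the task name `llm.score_commentary` and the `llm` queue come from `tasks.py` and the README above, while the payload values themselves are purely illustrative.

```python
from celery import signature

from sefaria_llm_interface.commentary_scoring import CommentaryScoringOutput

# Illustrative payload; field names match CommentaryScoringInput.
payload = {
    "commentary_ref": "Rashi on Genesis 1:1",
    "cited_refs": {
        "Genesis 1:1": "In the beginning God created the heavens and the earth.",
    },
    "commentary_text": "Rashi explains that the verse teaches the order of creation ...",
}

# Enqueue the task registered in app/commentary_scoring/tasks.py and wait for the dict result.
raw = signature("llm.score_commentary", args=[payload], queue="llm").apply_async().get()

# Rehydrate the dataclass; request_status is 1 on success, 0 on failure.
out = CommentaryScoringOutput(**raw)
if out.request_status:
    for ref, score in out.ref_scores.items():
        label = "explained" if score else "not explained"
        print(f"{ref}: {label} - {out.scores_explanation.get(ref, '')}")
else:
    print(f"Scoring failed: {out.request_status_message}")
```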