diff --git a/app/celery_setup/app.py b/app/celery_setup/app.py index 0b50834..d1f1f69 100644 --- a/app/celery_setup/app.py +++ b/app/celery_setup/app.py @@ -3,4 +3,7 @@ app = Celery('llm') app.conf.update(**generate_config_from_env()) -app.autodiscover_tasks(packages=['topic_prompt']) +app.autodiscover_tasks(packages=['topic_prompt', + 'commentary_scoring'] + ) + diff --git a/app/commentary_scoring/README.md b/app/commentary_scoring/README.md new file mode 100644 index 0000000..4e69aec --- /dev/null +++ b/app/commentary_scoring/README.md @@ -0,0 +1,215 @@ +# CommentaryScorer — Commentary–Citation Analysis Tool + +**CommentaryScorer** is a Python tool that uses **LLMs** to analyze a **commentary** and determine, for **each cited base text**, whether the commentary actually **explains/interprets** it. It returns a **binary score (0/1)** per citation together with a **short rationale**. + +--- + +## ⭐ Scores Extracted + +- **Per-Citation Explanation Score**: `0` = not explained, `1` = explained +- **Per-Citation Rationale**: short reason string that begins with + `Explained spans: ''; ''` (or `'None'`) + +--- + +## 🚀 Quick Start + +```python +from commentary_scoring.commentary_scoring import score_one_commentary +from sefaria_llm_interface.commentary_scoring import CommentaryScoringInput + +inp = CommentaryScoringInput( + commentary_ref="Rashi on Genesis 1:1", + cited_refs={ + "Genesis 1:1": "In the beginning God created the heavens and the earth.", + "Genesis 1:2": "Now the earth was formless and void..." + }, + commentary_text=""" + Rashi on 'בראשית' explains sequencing/purpose and interprets terms... + """ +) + +out = score_one_commentary(inp) + +print("Scores:", out.ref_scores) +print("Reasons:", out.scores_explanation) + +``` + +## 📦 Data Structures + +### **Input — `CommentaryScoringInput`** + +```python +{ + "commentary_ref": "Rashi on Genesis 1:1", # Optional string identifier + "cited_refs": { # Dict of citation → base text + "Genesis 1:1": "In the beginning ...", + "Genesis 1:2": "Now the earth ..." + }, + "commentary_text": "Full commentary text (plain or HTML)" +} +``` + +- **commentary_ref**: identifier for the commentary (helpful for logging) +- **cited_refs**: dictionary mapping citation keys (e.g., `"Genesis 1:1"`) to their textual content +- **commentary_text**: commentary body text (string, can contain HTML, nested lists, etc.) + +--- + +### **Output — `CommentaryScoringOutput`** + +```python +{ + "commentary_ref": "Rashi on Genesis 1:1", + "ref_scores": { "Genesis 1:1": 1, "Genesis 1:2": 0 }, + "scores_explanation": { + "Genesis 1:1": "Explained spans: 'בראשית'; 'אלוקים' — Adds interpretive content ...", + "Genesis 1:2": "Explained spans: None — Only a decorative quote, no interpretation ..." 
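+    # each value follows the pattern "Explained spans: <spans or None> — <brief rationale>"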
+ }, + "processed_datetime": "2025-08-19T10:30:00Z", + "request_status": 1, + "request_status_message": "" +} +``` + +- **ref_scores**: dictionary of binary scores per citation (0 = not explained, 1 = explained) +- **scores_explanation**: dictionary of rationales per citation, each beginning with **“Explained spans”** +- **processed_datetime**: UTC ISO8601 timestamp when scoring was done +- **request_status**: `1 = success`, `0 = failure` +- **request_status_message**: error description in case of failure + +--- + +## ⚙️ Scoring System + +### **Architecture** + +The `commentary_scoring` package consists of: + +- `commentary_scoring.py` — Main API with `score_one_commentary()` +- `openai_commentary_scorer.py` — Core LLM engine (`CommentaryScorer`) +- `tasks.py` — Celery task wrapper for async processing +- `text_utils.py` — Utilities for HTML stripping and flattening +- `README.md` — Documentation + + +--- + +### **Explanation Levels** + +| Level | Description | +|-------|-------------| +| **0 — Not Explained** | Commentary does not interpret the cited text (decorative prooftext, paraphrase only, inherited interpretation). | +| **1 — Explained** | Commentary provides interpretation or explanation of any part of the cited text. | + +--- + +## 🧠 Algorithm + +### **Input Validation** +- Fail if `cited_refs` is empty or `commentary_text` is missing +- Token counting via `tiktoken` (fallback = character length) +- If too long → fail fast with `"input too long"` + +### **Build Prompt** +- Commentary text + cited refs in structured sections +- Explicit instructions for binary labeling per citation +- Require **“Explained spans”** prefix in explanations + +### **Schema Enforcement** +- OpenAI function calling schema requires: + - `ref_scores`: dict of citation → 0/1 + - `explanation`: dict of citation → rationale string + +### **LLM Invocation** +- Config: `gpt-4o-mini`, `temperature=0`, `top_p=0`, `seed=42` +- Parse structured JSON output + +### **Post-Processing** +- Clamp invalid values to `0` or `1` +- Return `CommentaryScoringOutput` + +--- + +## 🔧 Configuration Options + +### **Initialization** + +```python +from commentary_scoring.openai_commentary_scorer import CommentaryScorer + +scorer = CommentaryScorer( + api_key=os.getenv("OPENAI_API_KEY"), + model="gpt-4o-mini", # default model + max_prompt_tokens=32000, # max tokens for input prompt + token_margin=4096 # reserved for model response +) +``` + +- **API Key**: via `OPENAI_API_KEY` environment variable or explicit parameter +- **Model**: defaults to `gpt-4o-mini`, override if needed +- **Token Guardrails**: ensures commentary fits within prompt budget + +--- + +## 📜 Celery Integration + +### **Task Wrapper** + +```python +@shared_task(name='llm.score_commentary') +def score_sheet_task(raw_input: dict) -> dict: + inp = CommentaryScoringInput(**raw_input) + out = score_one_commentary(inp) + return asdict(out) +``` + +### **Usage** + +```python +from celery import signature + +payload = { + "commentary_ref": "Rashi on Genesis 1:1", + "cited_refs": {"Genesis 1:1": "...", "Genesis 1:2": "..."}, + "commentary_text": "Rashi explains ..." 
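+    # keys must match the CommentaryScoringInput fields: commentary_ref, cited_refs, commentary_text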
+} +sig = signature("llm.score_commentary", args=[payload], queue="llm") +print(sig.apply_async().get()) +``` + +--- + +## 📊 Output Fields + +| Field | Description | +|------------------------|--------------------------------------------------| +| `ref_scores` | Binary 0/1 scores per citation | +| `scores_explanation` | Rationale strings beginning with `"Explained spans"` | +| `commentary_ref` | Commentary identifier | +| `processed_datetime` | UTC ISO8601 timestamp | +| `request_status` | `1 = success`, `0 = failure` | +| `request_status_message` | Error message if failure | + +--- + +## 📝 Logging + +- **Info**: token count, number of citations, success summary +- **Warning**: invalid scores clamped, tokenizer fallback +- **Error**: LLM or JSON parse failures + +```python +import logging +logging.getLogger("commentary_scoring").setLevel(logging.INFO) +``` + +--- + +## ✅ Extensibility + +- By now there is no support for very long commentaries, because during testing I didn't encounter any. The chances are high that we won't need this feature at all -- but the matter should be explored. + +--- + diff --git a/app/commentary_scoring/__init__.py b/app/commentary_scoring/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/commentary_scoring/commentary_scoring.py b/app/commentary_scoring/commentary_scoring.py new file mode 100644 index 0000000..291be6f --- /dev/null +++ b/app/commentary_scoring/commentary_scoring.py @@ -0,0 +1,17 @@ +from .openai_commentary_scorer import CommentaryScorer +import os +from sefaria_llm_interface.commentary_scoring import ( + CommentaryScoringInput, + CommentaryScoringOutput, +) + +def score_one_commentary(inp: CommentaryScoringInput) -> ( + CommentaryScoringOutput): + scorer = CommentaryScorer( + api_key=os.getenv("OPENAI_API_KEY") + ) + return scorer.process_commentary_by_content( + commentary_ref=inp.commentary_ref, + cited_refs=inp.cited_refs, + commentary_text=inp.commentary_text + ) \ No newline at end of file diff --git a/app/commentary_scoring/openai_commentary_scorer.py b/app/commentary_scoring/openai_commentary_scorer.py new file mode 100644 index 0000000..37a1ef0 --- /dev/null +++ b/app/commentary_scoring/openai_commentary_scorer.py @@ -0,0 +1,343 @@ +"""Commentary Scorer for evaluating Jewish text Commentaries. + +This module provides functionality to score how well commentaries explain +their cited texts using OpenAI's language models. +""" + +import json +import logging +import textwrap +from datetime import datetime, timezone +from enum import IntEnum +from typing import Any, Dict, List, Optional, Set + +import tiktoken +from langchain.chat_models import ChatOpenAI +from langchain.schema import HumanMessage +from sefaria_llm_interface.commentary_scoring import ( + CommentaryScoringInput, + CommentaryScoringOutput, +) +logger = logging.getLogger(__name__) + + +class ExplainsFlag(IntEnum): + """Binary flags for whether a commentary explains a cited text.""" + NOT_EXPLAINED = 0 # Commentary doesn't interpret the cited text + EXPLAINED = 1 # Commentary provides interpretation/explanation + + +class RequestStatus(IntEnum): + """Status codes for LLM processing requests.""" + SUCCESS = 1 + FAILURE = 0 + + +class LanguageCode: + """ISO 639-1 language codes for supported languages.""" + ENGLISH = "en" + HEBREW = "he" + DEFAULT = ENGLISH + + +class CommentaryScorer: + """Scores how well commentaries explain their cited texts. 
+ + This class uses OpenAI's language models to evaluate the quality of + explanations provided by Jewish commentaries for their cited texts. + + Attributes: + model (str): The OpenAI model to use for scoring + max_prompt_tokens (int): Maximum tokens allowed in prompt + token_margin (int): Reserved tokens for model response + llm (ChatOpenAI): Initialized language model client + """ + + # Configuration constants for token management + DEFAULT_MAX_OUTPUT_TOKENS = 4096 # Reserve for LLM response + DEFAULT_MAX_INPUT_OUTPUT_TOKENS = 32000 # Total token budget + DEFAULT_TOKEN_CHAR_RATIO = 3 # Fallback chars-per-token estimate + + # JSON response field names for structured output + REF_SCORE_FIELD = "ref_scores" # Binary scores per citation + EXPLANATION_FIELD = "explanation" # Rationale strings per citation + LANGUAGE_FIELD = "language" # Detected language code + CITED_REF_FIELD = "cited_ref" # Citation reference key + PROCESSED_AT_FIELD = "processed_datetime" # Processing timestamp + + # Valid explanation levels for score validation + VALID_LEVELS: Set[int] = \ + {ExplainsFlag.NOT_EXPLAINED, ExplainsFlag.EXPLAINED} + + def __init__( + self, + api_key: Optional[str] = None, + model: str = "gpt-4o-mini", + max_prompt_tokens: int = DEFAULT_MAX_INPUT_OUTPUT_TOKENS, + token_margin: int = DEFAULT_MAX_OUTPUT_TOKENS, + ) -> None: + """Initialize the commentary scorer with OpenAI client. + Args: + api_key: OpenAI API key. If None, uses OPENAI_API_KEY env var + model: OpenAI model name (default: gpt-4o-mini for cost efficiency) + max_prompt_tokens: Maximum tokens for input prompt (includes commentary + citations) + token_margin: Reserved tokens for model response (ensures budget compliance) + Raises: + ValueError: If model is not supported or parameters are invalid + Exception: If OpenAI client initialization fails + """ + + self.model = model + self.max_prompt_tokens = max_prompt_tokens + self.token_margin = token_margin + + try: + # Initialize OpenAI client with deterministic settings for consistent scoring + self.llm = ChatOpenAI( + model_name=model, + temperature=0, # Deterministic output for consistent grading + openai_api_key=api_key, + model_kwargs={ + "top_p": 0, # No nucleus sampling + "frequency_penalty": 0, # No frequency penalties + "presence_penalty": 0, # No presence penalties + "seed": 42, # Fixed seed for reproducibility + }, + ) + except Exception as e: + logger.error(f"Failed to initialize ChatOpenAI: {e}") + raise + + logger.info(f"Initialized CommentaryScorer with model {model}") + + def _count_tokens(self, text: str) -> int: + """Count tokens in text using the model's tokenizer. + """ + try: + encoding = tiktoken.encoding_for_model(self.model) + return len(encoding.encode(text)) + except Exception as e: + logger.warning(f"Failed to get exact token count: {e}") + # Fallback to character-based estimation + return max(1, len(text) // self.DEFAULT_TOKEN_CHAR_RATIO) + + def _validate_level(self, score: Any) -> int: + try: + score = int(score) + except (ValueError, TypeError): + logger.warning( + f"Invalid reference score '{score}', defaulting to 0" + ) + return ExplainsFlag.NOT_EXPLAINED + + if score not in self.VALID_LEVELS: + clamped = ExplainsFlag.EXPLAINED if score >= 1 else ExplainsFlag.NOT_EXPLAINED + logger.warning( + f"Reference score {score} out of range, clamping to {clamped}" + ) + return clamped + + return score + + def _invoke_llm(self, prompt: str, function_schema: Dict[str, Any]) \ + -> Dict[str, Any]: + """Invoke the language model with function calling. 
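+
+        Args:
+            prompt: Fully rendered scoring prompt text.
+            function_schema: Function-calling schema that constrains the response.
+
+        Returns:
+            Dict parsed from the function-call arguments, containing the
+            ref_scores and explanation objects defined by the schema.
+
+        Raises:
+            ValueError: If the response contains no function call or its
+                arguments are not valid JSON.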
+ """ + try: + response = self.llm.invoke( + [HumanMessage(content=prompt)], + functions=[function_schema], + function_call={"name": function_schema["name"]} + ) + function_call = getattr(response, "additional_kwargs", {}).get( + "function_call" + ) + if not function_call: + raise ValueError("No function call found in LLM response") + + arguments = function_call.get("arguments", "{}") + return json.loads(arguments) + + except json.JSONDecodeError as e: + logger.error(f"Failed to parse LLM response JSON: {e}") + raise ValueError(f"Invalid JSON in LLM response: {e}") + except Exception as e: + logger.error(f"LLM invocation failed: {e}") + raise + + def _build_function_schema(self, cited_keys: List[str]) -> Dict[str, Any]: + if not cited_keys: + raise ValueError("cited_keys cannot be empty") + + return { + "name": "score_multiple_explanations", + "description": "Binary labeling: does the commentary actually interpret/explain each cited text?", + "parameters": { + "type": "object", + "properties": { + self.REF_SCORE_FIELD: { + "type": "object", + "properties": { + key: {"type": "integer","minimum": 0,"maximum": 1} + for key in cited_keys + }, + "required": cited_keys, + "additionalProperties": False, + }, + self.EXPLANATION_FIELD: { + "type": "object", + "properties": { + key: { + "type": "string", + "maxLength": 150, + "description": ( + "Brief rationale. Start with: " + "\"Explained spans: ''; ''\" " + "then 1–2 sentences why 0/1." + ), + } + for key in cited_keys + }, + "required": cited_keys, + "additionalProperties": False, + }, + }, + "required": [self.REF_SCORE_FIELD,self.EXPLANATION_FIELD], + "additionalProperties": False, + }, + } + + def _create_failure_scoring_output( + self, + commentary_ref: str, + processed_datetime: datetime, + request_status_message: str + ) -> CommentaryScoringOutput: + """Create standardized failure output for error cases. + Returns: CommentaryScoringOutput: Failure result with error details + """ + logger.warning(request_status_message) + return CommentaryScoringOutput( + commentary_ref=commentary_ref, + ref_scores={}, + scores_explanation={}, + processed_datetime=str(processed_datetime), + request_status_message=request_status_message, + request_status=RequestStatus.FAILURE + ) + + def _build_scoring_prompt(self, cited_refs: Dict[str, str], commentary_text: str) -> str: + """Build the prompt for scoring commentary explanations. + """ + refs_section = "\n".join(f"- {key}: {text}" for key, text in cited_refs.items()) + + return textwrap.dedent(f"""You are labeling whether a commentary ACTUALLY EXPLAINS each cited text. + + COMMENTARY TEXT: + {commentary_text} + + CITED TEXTS: + {refs_section} + + TASK (binary per citation): + Return {ExplainsFlag.EXPLAINED} if the commentary provides any substantive interpretation or explanation + of ANY PART of the cited text (including methodological interpretation, e.g., reading a word + as a symbol) — not just quoting or paraphrasing. + + Return {ExplainsFlag.NOT_EXPLAINED} if: + • The citation is used for another goal (decorative, rhetorical, prooftext with no interpretation). + • The commentary cites Source A only via Source B, but adds NO new interpretation of A beyond B. + (Inherited interpretation does NOT count as explanation of A.) + • It merely references or paraphrases without interpreting. + + Important: + • If the commentary explains only PARTS of the citation, still return 1. 
+ • In your explanation, list the exact phrases from the cited text that ARE explained (if any), + then give a concise rationale for {ExplainsFlag.NOT_EXPLAINED}/{ExplainsFlag.EXPLAINED}. + + RETURN JSON WITH: + 1. {self.REF_SCORE_FIELD}: object of {ExplainsFlag.NOT_EXPLAINED}/{ExplainsFlag.EXPLAINED} per citation key + 2. {self.EXPLANATION_FIELD}: object of brief rationales. Begin each value with: + Explained spans: ''; '' (or 'None'), then 1–2 sentences of rationale. + """) + + def process_commentary_by_content( + self, + commentary_text: str, + cited_refs: Dict[str, str], + commentary_ref: str = "" + ) -> CommentaryScoringOutput: + """Score how well a commentary explains its cited texts. + """ + if not cited_refs: + return self._create_failure_scoring_output(commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary {commentary_ref} doesn't cite anything. ") + + if not commentary_text: + return self._create_failure_scoring_output(commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary {commentary_ref} is empty. ") + + if not commentary_text.strip(): + + return self._create_failure_scoring_output( + commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary's {commentary_ref} text is empty " + ) + + token_count = self._count_tokens(commentary_text) + max_allowed_tokens = self.max_prompt_tokens - self.token_margin + + if token_count > max_allowed_tokens: + # TODO: add long commentary support + return self._create_failure_scoring_output(commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=(f"{commentary_ref}'s input too long " + f"({token_count} tokens > {max_allowed_tokens} limit). 
")) + + logger.info( + f"Processing commentary with {token_count} tokens, " + f"{len(cited_refs)} citations" + ) + try: + # Build prompt and schema + prompt = self._build_scoring_prompt(cited_refs, commentary_text) + schema = self._build_function_schema(list(cited_refs.keys())) + + # Get LLM response + raw_response = self._invoke_llm(prompt, schema) + + # Validate and normalize scores + raw_scores = raw_response.get(self.REF_SCORE_FIELD, {}) + validated_scores = { + key: self._validate_level(score) + for key, score in raw_scores.items() + } + # Create structured result + result = CommentaryScoringOutput( + commentary_ref=commentary_ref, + ref_scores=validated_scores, + scores_explanation=raw_response.get( + self.EXPLANATION_FIELD, {} + ), + processed_datetime=str(datetime.now(timezone.utc)), + request_status_message="", + request_status=RequestStatus.SUCCESS) + + explained = sum(validated_scores.values()) + total = len(validated_scores) + logger.info( + f"Scored commentary {commentary_ref}: explained {explained}/{total} " + f"({(explained / total * 100 if total else 0):.0f}%)" + ) + + return result + + except Exception as e: + return self._create_failure_scoring_output( + commentary_ref=commentary_ref, + processed_datetime=datetime.now(timezone.utc), + request_status_message=f"Commentary {commentary_ref} scoring failed: {e}" + ) \ No newline at end of file diff --git a/app/commentary_scoring/tasks.py b/app/commentary_scoring/tasks.py new file mode 100644 index 0000000..c270128 --- /dev/null +++ b/app/commentary_scoring/tasks.py @@ -0,0 +1,13 @@ +from celery import shared_task +from commentary_scoring.commentary_scoring import score_one_commentary +from sefaria_llm_interface.commentary_scoring import ( + CommentaryScoringInput, +) +from dataclasses import asdict + + +@shared_task(name='llm.score_commentary') +def score_sheet_task(raw_input: dict) -> dict: + inp = CommentaryScoringInput(**raw_input) + out = score_one_commentary(inp) + return asdict(out) \ No newline at end of file diff --git a/app/commentary_scoring/text_utils.py b/app/commentary_scoring/text_utils.py new file mode 100644 index 0000000..46f4e4c --- /dev/null +++ b/app/commentary_scoring/text_utils.py @@ -0,0 +1,34 @@ +import re +from typing import Union, List + +# Regular expression to match HTML tags (e.g.,
<br>, <b>, <i>
, etc.) +TAG_RE = re.compile(r"<[^>]+>") + + +def strip_html(s: str) -> str: + """ + Remove all HTML tags from a given string. + """ + return TAG_RE.sub("", s) + + +def flatten_text(x: Union[str, List, tuple]) -> str: + """ + Recursively flatten a nested structure of strings, lists, or tuples into a single string. + """ + if isinstance(x, str): + return x + if isinstance(x, (list, tuple)): + # Recursively flatten all elements and join with spaces + return " ".join(flatten_text(el) for el in x) + # If it's not a string or list/tuple, convert it to string + return str(x) + + +def to_plain_text(raw: Union[str, List, tuple]) -> str: + """ + Convert raw input (possibly nested and HTML-formatted) to clean plain text. + """ + flat = flatten_text(raw) # Step 1: Flatten nested structure + clean = strip_html(flat) # Step 2: Remove HTML tags + return re.sub(r"\s+", " ", clean).strip() # Step 3: Normalize whitespace diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py new file mode 100644 index 0000000..a086340 --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/__init__.py @@ -0,0 +1,3 @@ +from sefaria_llm_interface.commentary_scoring.commentary_scoring_input import * +from sefaria_llm_interface.commentary_scoring.commentary_scoring_output import * + diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py new file mode 100644 index 0000000..e6b4c2a --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_input.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass +from typing import Dict + + +@dataclass +class CommentaryScoringInput: + commentary_text: str + cited_refs: Dict[str, str] + commentary_ref: str + diff --git a/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py new file mode 100644 index 0000000..27b86b4 --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/commentary_scoring/commentary_scoring_output.py @@ -0,0 +1,17 @@ +from dataclasses import dataclass +from typing import Dict +from datetime import datetime + + +@dataclass +class CommentaryScoringOutput: + commentary_ref: str + ref_scores: Dict[str, int] + scores_explanation: Dict[str, str] + processed_datetime: str + request_status_message: str + request_status: int + + def __post_init__(self): + if isinstance(self.processed_datetime, datetime): + self.processed_datetime = self.processed_datetime.isoformat() \ No newline at end of file