From 990102b6a59bdee2fdad834c43f7117567922b5f Mon Sep 17 00:00:00 2001 From: Andy Taylor Date: Tue, 10 Mar 2026 19:55:14 +0000 Subject: [PATCH] feat: add syllabus-driven chunked audio/video generation Add a syllabus workflow that automates generating NotebookLM audio/video overviews for an entire eBook, broken into logical chapter episodes. New modules: - models.py: shared dataclasses (extracted from notebooklm.py) - syllabus.py: pure-logic syllabus parsing, state management, chunk selection New CLI commands (rich_help_panel="Syllabus"): - syllabus: send structured prompt to NotebookLM chat API, parse response into numbered episode plan, save as syllabus_state.json - generate-next: fire generation for next pending episode, poll to completion, persist task IDs for Ctrl+C recovery, --no-wait for fire-and-forget mode - status: display progress table, --poll to check API, --tail for live-updating display Key design decisions: - Auto-syllabus via NotebookLM chat with fixed-size fallback on parse failure - Stateful next-chunk stepping (accommodates rate limits and session breaks) - Atomic state writes (mkstemp + fsync + os.replace) for crash safety - Priority-based chunk selection: GENERATING > FAILED > PENDING - Episode titles sanitized before filesystem/API use (LLM output is adversarial input) Includes brainstorm and plan documents, comprehensive tests (118 passing), and updated README with full workflow documentation. 
Co-Authored-By: Claude Opus 4.6 --- README.md | 120 +++- ...unked-audio-video-generation-brainstorm.md | 169 ++++++ docs/codemap.md | 50 +- docs/guide-generate-overviews.md | 47 ++ docs/guide-study-workflow.md | 18 + ...eat-chunked-audio-video-generation-plan.md | 571 ++++++++++++++++++ docs/troubleshooting.md | 46 ++ docs/use-cases.md | 47 ++ src/pdf_by_chapters/cli.py | 482 ++++++++++++++- src/pdf_by_chapters/models.py | 29 + src/pdf_by_chapters/notebooklm.py | 141 ++++- src/pdf_by_chapters/syllabus.py | 392 ++++++++++++ tests/conftest.py | 6 + tests/unit/test_cli.py | 198 ++++++ tests/unit/test_notebooklm.py | 25 + tests/unit/test_syllabus.py | 326 ++++++++++ 16 files changed, 2624 insertions(+), 43 deletions(-) create mode 100644 docs/brainstorms/2026-03-10-chunked-audio-video-generation-brainstorm.md create mode 100644 docs/plans/2026-03-10-feat-chunked-audio-video-generation-plan.md create mode 100644 src/pdf_by_chapters/models.py create mode 100644 src/pdf_by_chapters/syllabus.py create mode 100644 tests/unit/test_syllabus.py diff --git a/README.md b/README.md index e699bba..45b1e82 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,10 @@ Split ebook PDFs by chapter using PDF bookmarks, then upload chapters to Google - [generate — Create audio/video overviews](#generate--create-audiovideo-overviews) - [download — Fetch generated artifacts](#download--fetch-generated-artifacts) - [delete — Remove a notebook](#delete--remove-a-notebook) +- [Syllabus Workflow — Automated Chunked Generation](#syllabus-workflow--automated-chunked-generation) + - [syllabus — Generate a podcast plan](#syllabus--generate-a-podcast-plan) + - [generate-next — Generate the next episode](#generate-next--generate-the-next-episode) + - [status — Check progress](#status--check-progress) - [Typical Workflow](#typical-workflow) - [Options Reference](#options-reference) - [How Chapter Detection Works](#how-chapter-detection-works) @@ -179,40 +183,132 @@ pdf-by-chapters delete -n NOTEBOOK_ID 
You will be prompted for confirmation before deletion. +## Syllabus Workflow — Automated Chunked Generation + +Instead of manually choosing chapter ranges, let NotebookLM's AI create a podcast syllabus that groups chapters into logical episodes, then step through generation one episode at a time. + +### `syllabus` — Generate a podcast plan + +Ask NotebookLM to analyse all chapters and create an episode plan: + +```bash +pdf-by-chapters syllabus -n NOTEBOOK_ID -o ./chapters --no-video +``` + +This sends a structured prompt to NotebookLM's chat API, parses the response into a numbered syllabus, and saves it as a state file (`syllabus_state.json`). If parsing fails, it falls back to fixed-size chunks. + +Customise the maximum chapters per episode: + +```bash +pdf-by-chapters syllabus -n NOTEBOOK_ID --max-chapters 3 +``` + +> **Note:** The `syllabus` command uses NotebookLM's chat API, which may trigger Google's backend to auto-generate artifacts (audio overview, slide deck) as a side effect. These are created by NotebookLM's platform behaviour, not by this tool, and are separate from the artifacts created by `generate-next`. + +### `generate-next` — Generate the next episode + +Generate the next pending episode from the syllabus: + +```bash +pdf-by-chapters generate-next -o ./chapters +``` + +This reads the state file, picks the next pending episode, fires the generation request, and polls until complete. The notebook ID comes from the state file — no need to pass `-n`. + +For non-blocking mode (returns immediately, ideal for scripting or agent workflows): + +```bash +pdf-by-chapters generate-next -o ./chapters --no-wait +``` + +Target a specific episode: + +```bash +pdf-by-chapters generate-next -o ./chapters --episode 3 +``` + +If interrupted with Ctrl+C, task IDs are already saved to the state file. Resume with `status --poll`. 
+ +### `status` — Check progress + +View the syllabus and generation status: + +```bash +pdf-by-chapters status -o ./chapters +``` + +Poll the NotebookLM API to update in-progress artifacts: + +```bash +pdf-by-chapters status -o ./chapters --poll +``` + +Live-updating display that polls until all generating chunks complete: + +```bash +pdf-by-chapters status -o ./chapters --tail +``` + ## Typical Workflow +### Manual (per-range) + ```bash -# 1. Split and upload a book (or a whole directory of books) +# 1. Split and upload a book pdf-by-chapters process "Fundamentals of Data Engineering.pdf" # 2. Find the notebook ID pdf-by-chapters list -# 3. Check which chapters are in the notebook -pdf-by-chapters list -n NOTEBOOK_ID - -# 4. Generate audio/video for chapters 1-3 +# 3. Generate audio/video for chapters 1-3 pdf-by-chapters generate -n NOTEBOOK_ID -c 1-3 -# 5. Generate for the next batch +# 4. Generate for the next batch pdf-by-chapters generate -n NOTEBOOK_ID -c 4-6 -# 6. Download everything +# 5. Download everything pdf-by-chapters download -n NOTEBOOK_ID -o ./overviews ``` +### Automated (syllabus-driven) + +```bash +# 1. Split and upload +pdf-by-chapters process "Fundamentals of Data Engineering.pdf" +export NOTEBOOK_ID=<your-notebook-id> + +# 2. Generate a podcast syllabus (audio only) +pdf-by-chapters syllabus -n $NOTEBOOK_ID -o ./chapters --no-video + +# 3. Generate episodes one at a time +pdf-by-chapters generate-next -o ./chapters --no-wait +pdf-by-chapters status -o ./chapters --poll # check when ready +pdf-by-chapters generate-next -o ./chapters --no-wait +# ... repeat for each episode + +# 4. 
Download everything +pdf-by-chapters download -n $NOTEBOOK_ID -o ./overviews +``` + ## Options Reference | Option | Command | Description | Default | |---|---|---|---| | `source` | split, process | PDF file or directory of PDFs (positional arg) | — | -| `-o, --output-dir` | split, process, download | Output directory | `./chapters` (split/process), `./overviews` (download) | +| `-o, --output-dir` | split, process, download, syllabus, generate-next, status | Output directory | `./chapters` / `./overviews` | | `-l, --level` | split, process | TOC level to split on (1 = top-level chapters) | `1` | -| `-n, --notebook-id` | process, list, generate, download, delete | NotebookLM notebook ID | — | +| `-n, --notebook-id` | process, list, generate, download, delete, syllabus | NotebookLM notebook ID | — | | `-c, --chapters` | generate, download | Chapter range, e.g. `1-3` (1-indexed, inclusive) | — | -| `--no-audio` | generate | Skip audio overview generation | — | -| `--no-video` | generate | Skip video overview generation | — | -| `-t, --timeout` | generate | Timeout in seconds for generation polling | `900` (15 min) | +| `--no-audio` | generate, syllabus, generate-next | Skip audio overview generation | — | +| `--no-video` | generate, syllabus, generate-next | Skip video overview generation | — | +| `-t, --timeout` | generate, generate-next | Timeout in seconds for generation polling | `900` (15 min) | +| `-m, --max-chapters` | syllabus | Maximum chapters per episode | `2` | +| `-b, --book-name` | syllabus | Book name for state file | output dir name | +| `--force` | syllabus | Overwrite existing syllabus with in-progress chunks | — | +| `-e, --episode` | generate-next | Target a specific episode by number | — | +| `--no-wait` | generate-next | Start generation and return immediately | — | +| `--poll` | status | Check API for status of generating chunks | — | +| `--tail` | status | Live-updating display until generation completes | — | ## How Chapter Detection Works diff 
--git a/docs/brainstorms/2026-03-10-chunked-audio-video-generation-brainstorm.md b/docs/brainstorms/2026-03-10-chunked-audio-video-generation-brainstorm.md new file mode 100644 index 0000000..4d3cc6f --- /dev/null +++ b/docs/brainstorms/2026-03-10-chunked-audio-video-generation-brainstorm.md @@ -0,0 +1,169 @@ +# Brainstorm: Chunked Audio/Video Generation via Auto-Syllabus + +**Date:** 2026-03-10 +**Status:** Draft + +## What We're Building + +A workflow that automates generating NotebookLM audio/video overviews for an entire book, broken into logical chapter chunks. The tool: + +1. Sends a "syllabus generator" prompt to NotebookLM's chat API, scoped to all uploaded chapter sources +2. Parses the response into a numbered chunk plan (episode title + chapter numbers per chunk) +3. Persists the plan as a JSON state file alongside the chapter PDFs +4. Lets the user step through chunk generation one-at-a-time via a `generate-next` command +5. Renames each completed artifact in NotebookLM using the syllabus episode title +6. Tracks progress across sessions so the user can resume at any time + +### Why This Approach + +- **Auto-syllabus via chat**: NotebookLM's AI has read all the chapter sources and can group them by related concepts (1-2 chapters per chunk). This produces smarter groupings than fixed-size chunking (e.g. it won't split a two-part chapter across chunks). +- **Stateful stepping over batch**: NotebookLM has rate limits and quota constraints. Audio/video generation takes minutes per chunk. A fire-and-forget batch risks hitting rate limits mid-run with no easy recovery. Stateful stepping lets the user control pacing, walk away, and resume later. +- **JSON state file**: Simple, inspectable, portable. Saved alongside chapter PDFs so it gets cleaned up naturally with the output directory. +- **Episode title naming**: Each artifact gets renamed in NotebookLM's UI to the syllabus-generated episode title (e.g. "Foundations & The Data Engineering Lifecycle"). 
More meaningful than `-1`, `-2` numbering, and visible in the NotebookLM web UI. + +## Key Decisions + +### 1. Chunk Definition: Auto-Syllabus via NotebookLM Chat + +The tool sends a structured prompt to `client.chat.ask()` requesting a numbered syllabus with strict 1-2 chapter limits per chunk. The prompt requests a specific format to make parsing reliable: + +``` +Episode N: "<Episode Title>" +Chapters: X-Y +Summary: ... +``` + +**Fallback**: If parsing fails (regex can't extract structured chunks), fall back to fixed-size chunks of 2 chapters each and warn the user. The user can also manually edit the JSON state file to adjust groupings before running `generate-next`. + +### 2. Execution Model: Stateful Next-Chunk Stepping + +- `syllabus` command creates the plan and saves state +- `generate-next` reads state, generates the next pending chunk, updates state +- User controls when to invoke each step (accommodates rate limits, quotas, time) +- State file tracks: notebook_id, book_name, chunks[], per-chunk status + artifact_ids + +### 3. State Persistence: JSON File Alongside Chapters + +```json +{ + "notebook_id": "decbc8d1-...", + "book_name": "Fundamentals_of_Data_Engineering_Joe_Reis", + "created": "2026-03-10T14:52:00Z", + "chunks": [ + { + "episode": 1, + "title": "Foundations & The Data Engineering Lifecycle", + "chapters": [1, 2], + "source_ids": ["abc-123", "def-456"], + "status": "completed", + "artifacts": { + "audio": {"task_id": "...", "artifact_id": "...", "status": "completed"}, + "video": {"task_id": "...", "artifact_id": "...", "status": "completed"} + } + }, + { + "episode": 2, + "title": "Designing Good Data Architecture", + "chapters": [3, 4], + "source_ids": ["ghi-789", "jkl-012"], + "status": "pending", + "artifacts": {} + } + ] +} +``` + +File location: `<output_dir>/syllabus_state.json` + +### 4. Artifact Scope: Audio and Video Only + +Matches the existing `generate` command's capabilities. 
The upstream library supports many more artifact types (quizzes, flashcards, reports, etc.) but we're keeping scope tight. Can be extended later. + +### 5. Naming Convention: Episode Title from Syllabus + +After an artifact completes generation, the tool calls `client.artifacts.rename()` to set the title to the syllabus episode title. This makes artifacts identifiable in the NotebookLM web UI. + +For downloaded files, the naming convention is: `<book_name>_episode_<NN>_<type>.<ext>` +e.g. `fundamentals_of_data_engineering_episode_01_audio.mp3` + +### 6. CLI Commands: New Top-Level Commands + +Three new commands added to the flat command structure: + +| Command | Purpose | +|---------|---------| +| `syllabus` | Generate a chunk plan via NotebookLM chat, save to state file | +| `generate-next` | Generate audio/video for the next pending chunk | +| `status` | Show chunk plan with per-chunk generation progress | + +Existing commands (`split`, `process`, `generate`, `download`, `delete`, `list`) remain unchanged. + +## Workflow Example + +```bash +# Step 1: Split and upload (existing) +pdf-by-chapters process ./Fundamentals_of_Data_Engineering.pdf +export NOTEBOOK_ID=decbc8d1-... + +# Step 2: Generate syllabus +pdf-by-chapters syllabus -n $NOTEBOOK_ID -o ./chapters +# -> Sends prompt to NotebookLM chat +# -> Parses response into chunks +# -> Saves ./chapters/syllabus_state.json +# -> Displays the syllabus table + +# Step 3: Generate chunks one at a time +pdf-by-chapters generate-next -o ./chapters +# -> Reads state, finds chunk 1 +# -> Generates audio + video for chapters 1-2 +# -> Polls until complete +# -> Renames artifacts to episode title +# -> Updates state file +# -> Warns: "Rate limits apply. Wait before generating next chunk." 
+ +# Step 4: Check progress +pdf-by-chapters status -o ./chapters +# -> Shows table: Episode | Title | Chapters | Audio | Video + +# Step 5: Generate next chunk when ready +pdf-by-chapters generate-next -o ./chapters +# -> Picks up chunk 2 automatically + +# Step 6: Download all completed artifacts +pdf-by-chapters download -n $NOTEBOOK_ID -o ./overviews +``` + +## Technical Integration Points + +### notebooklm.py additions +- `generate_syllabus()` - sends chat prompt, parses response, returns chunk plan +- `generate_next_chunk()` - reads state, generates artifacts for next pending chunk, renames on completion +- Reuses existing `_request_chapter_artifact()` for the actual generation +- Reuses existing polling logic from `generate_for_chapters()` + +### cli.py additions +- `syllabus` command - orchestrates syllabus generation, displays plan table +- `generate-next` command - orchestrates next chunk generation, displays progress +- `status` command - reads state file, displays progress table + +### New module: syllabus.py (optional) +- Syllabus prompt template +- Response parsing (regex-based) +- State file read/write +- Fixed-size fallback logic + +Could also live in `notebooklm.py` if the module doesn't get too large. + +## Resolved Questions + +1. **Syllabus prompt design**: Include source titles in the prompt. The tool lists all uploaded sources by number and title, embedding them in the prompt so the LLM has an explicit chapter-to-number mapping. This makes the response easier to parse and reduces ambiguity. + +2. **Source ID mapping**: Parse chapter numbers from source titles using regex (e.g. `chapter_03` -> 3) rather than relying on alphabetical sort position. Falls back to positional sort if title parsing fails. More robust against out-of-order uploads or extra sources. + +3. **Regeneration**: Yes - `generate-next` supports a `--episode N` flag to target a specific chunk. Resets that chunk's status to pending and regenerates. 
Useful for retries after failures or getting a different take. + +4. **Chunk size override**: Yes - `syllabus` accepts `--max-chapters N` (default 2) to constrain the LLM's grouping. The prompt template interpolates this value. Useful for shorter books where 3-4 chapters per chunk makes more sense. + +## Open Questions + +None - all questions resolved during brainstorm. diff --git a/docs/codemap.md b/docs/codemap.md index 269aaf0..b989a46 100644 --- a/docs/codemap.md +++ b/docs/codemap.md @@ -16,6 +16,8 @@ graph LR CLI[cli.py<br/>Typer commands] SPL[splitter.py<br/>PDF splitting] NLM[notebooklm.py<br/>API integration] + MOD[models.py<br/>Shared dataclasses] + SYL[syllabus.py<br/>State & parsing] end subgraph "External" @@ -26,6 +28,7 @@ graph LR CHAPS[Chapter PDFs] AUDIO[Audio overviews] VIDEO[Video explainers] + STATE[syllabus_state.json] end PDF --> SPL @@ -33,7 +36,10 @@ graph LR CHAPS --> NLM NLM --> NLMAPI NLMAPI --> AUDIO & VIDEO - CLI --> SPL & NLM + CLI --> SPL & NLM & SYL + NLM --> MOD + SYL --> MOD + SYL --> STATE ``` ## Module Breakdown @@ -50,6 +56,9 @@ Entry point using [Typer](https://typer.tiangolo.com/). Routes to splitter and N | `download` | Download generated artifacts | `notebooklm` | | `list` | List notebooks or sources | `notebooklm` | | `delete` | Delete a notebook | `notebooklm` | +| `syllabus` | Generate podcast syllabus via chat | `notebooklm` → `syllabus` | +| `generate-next` | Generate next pending episode | `notebooklm` → `syllabus` | +| `status` | Show syllabus generation progress | `syllabus` (+ `notebooklm` with `--poll`) | ### splitter.py — PDF Chapter Splitting @@ -114,12 +123,44 @@ sequenceDiagram **Chapter-aware generation:** Unlike `repo-artefacts` which generates for the whole repo, this tool selects specific NotebookLM sources by chapter range, allowing focused overviews of specific sections. +### models.py — Shared Dataclasses + +Holds dataclasses shared between modules, preventing circular imports and keeping `syllabus.py` testable without `notebooklm-py` installed. + +- `UploadResult`, `NotebookInfo`, `SourceInfo` — API result types +- `ChunkResult` — per-artifact-type generation result + +### syllabus.py — Syllabus State & Parsing + +Pure logic module (no Rich, no Typer, no async). Follows the `splitter.py` pattern. 
+ +- `ChunkStatus(StrEnum)` — 4-state machine: `PENDING → GENERATING → COMPLETED | FAILED` +- `SyllabusState`, `SyllabusChunk`, `ChunkArtifact` — state file dataclasses +- `build_prompt()` — constructs the syllabus generation prompt with numbered source titles +- `parse_syllabus_response()` — regex parsing of LLM response into episodes (binary success/fallback) +- `build_fixed_size_chunks()` — deterministic fallback when LLM parsing fails +- `map_sources_to_chapters()` — maps chapter numbers to source IDs via title regex +- `read_state()` / `write_state()` — atomic JSON state persistence with `fsync` + `os.replace()` +- `get_next_chunk()` — priority selection: GENERATING (resume) > FAILED (retry) > PENDING (new) + +```mermaid +stateDiagram-v2 + [*] --> PENDING + PENDING --> GENERATING: generate-next + GENERATING --> COMPLETED: poll detects completion + GENERATING --> FAILED: poll detects failure / timeout + FAILED --> GENERATING: generate-next (retry) + COMPLETED --> PENDING: --episode N (regenerate) +``` + ## Interfaces | Module | Exports | Used By | |--------|---------|---------| -| `splitter` | `split_pdf_by_chapters()`, `sanitize_filename()` | `cli.split`, `cli.process` | -| `notebooklm` | `upload_chapters()`, `generate_for_chapters()`, `download_artifacts()`, `list_notebooks()`, `list_sources()`, `delete_notebook()` | `cli.*` | +| `models` | `UploadResult`, `NotebookInfo`, `SourceInfo`, `ChunkResult` | `notebooklm`, `cli` | +| `splitter` | `split_pdf_by_chapters()`, `sanitize_filename()` | `cli.split`, `cli.process`, `notebooklm` | +| `notebooklm` | `upload_chapters()`, `generate_for_chapters()`, `download_artifacts()`, `list_notebooks()`, `list_sources()`, `delete_notebook()`, `create_syllabus()`, `start_chunk_generation()`, `poll_chunk_status()`, `generate_chunk()` | `cli.*` | +| `syllabus` | `SyllabusState`, `SyllabusChunk`, `ChunkStatus`, `build_prompt()`, `parse_syllabus_response()`, `read_state()`, `write_state()`, `get_next_chunk()` | `cli.syllabus`, 
`cli.generate_next`, `cli.status` | ## Dependencies @@ -127,6 +168,9 @@ sequenceDiagram graph BT CLI[cli.py] --> SPL[splitter.py] CLI --> NLM[notebooklm.py] + CLI --> SYL[syllabus.py] + NLM --> MOD[models.py] + SYL --> MOD SPL -.-> PYMUPDF[pymupdf] NLM -.-> NLMPY[notebooklm-py] diff --git a/docs/guide-generate-overviews.md b/docs/guide-generate-overviews.md index f899314..b17717b 100644 --- a/docs/guide-generate-overviews.md +++ b/docs/guide-generate-overviews.md @@ -102,6 +102,51 @@ flowchart LR Smaller ranges = more detailed overviews. Start small. +## Automated: Syllabus Workflow + +Instead of manually choosing chapter ranges, let NotebookLM create a podcast syllabus that groups chapters into logical episodes. + +### Step 1: Generate a Syllabus + +```bash +pdf-by-chapters syllabus -n NOTEBOOK_ID -o ./chapters --no-video +``` + +This sends a prompt to NotebookLM's chat, asking it to group your chapters into 1-2 chapter episodes by topic. The result is saved as `syllabus_state.json`. + +### Step 2: Generate Episodes One at a Time + +```bash +# Non-blocking (returns immediately) +pdf-by-chapters generate-next -o ./chapters --no-wait + +# Or blocking (waits for completion, Ctrl+C safe) +pdf-by-chapters generate-next -o ./chapters +``` + +### Step 3: Check Progress + +```bash +pdf-by-chapters status -o ./chapters --poll +``` + +Use `--tail` for a live-updating display that polls every 30 seconds. + +### Step 4: Repeat + +Run `generate-next` again for the next episode. The tool automatically picks the next pending episode from the syllabus. + +```mermaid +flowchart LR + A[syllabus] --> B[generate-next] + B --> C[status --poll] + C --> D{All done?} + D -->|No| B + D -->|Yes| E[download] +``` + +> **Known behaviour:** The `syllabus` command uses NotebookLM's chat API (`chat.ask()`), which may trigger Google's backend to auto-generate artifacts (an audio overview and slide deck) as a platform side effect. 
These are separate from the scoped artifacts created by `generate-next` and can be safely ignored or deleted. + ## ❌ Something Went Wrong? See [Troubleshooting](troubleshooting.md) for: @@ -109,3 +154,5 @@ See [Troubleshooting](troubleshooting.md) for: - Generation timeout → try smaller ranges or audio-only - Auth errors → re-run `notebooklm login` - Download fails → artifact may not be ready yet +- Syllabus parsing failed → falls back to fixed-size chunks automatically +- Duplicate audio content → ensure `generate-next` is using scoped `source_ids` diff --git a/docs/guide-study-workflow.md b/docs/guide-study-workflow.md index b38251c..112059e 100644 --- a/docs/guide-study-workflow.md +++ b/docs/guide-study-workflow.md @@ -99,3 +99,21 @@ flowchart LR | Low focus day | 1 chapter | Deep, manageable | | Normal day | 2–3 chapters | Good balance | | Hyperfocus mode | 4–6 chapters | Ride the wave | + +### Syllabus Mode (Recommended) + +Instead of manually choosing batches, use the `syllabus` command to let NotebookLM group chapters by topic. This removes the decision fatigue of "which chapters should I batch together?" + +```bash +pdf-by-chapters syllabus -n $NOTEBOOK_ID -o ./chapters --no-video +``` + +Then generate episodes one at a time as you're ready for them: + +```bash +pdf-by-chapters generate-next -o ./chapters --no-wait +``` + +Check progress anytime: `pdf-by-chapters status -o ./chapters --poll` + +See [Generating Audio & Video Overviews](guide-generate-overviews.md) for the full syllabus workflow. 
diff --git a/docs/plans/2026-03-10-feat-chunked-audio-video-generation-plan.md b/docs/plans/2026-03-10-feat-chunked-audio-video-generation-plan.md new file mode 100644 index 0000000..e698d4d --- /dev/null +++ b/docs/plans/2026-03-10-feat-chunked-audio-video-generation-plan.md @@ -0,0 +1,571 @@ +--- +title: "feat: Chunked Audio/Video Generation via Auto-Syllabus" +type: feat +status: active +date: 2026-03-10 +origin: docs/brainstorms/2026-03-10-chunked-audio-video-generation-brainstorm.md +deepened: 2026-03-10 +--- + +# feat: Chunked Audio/Video Generation via Auto-Syllabus + +## Enhancement Summary + +**Deepened on:** 2026-03-10 +**Research agents used:** Python reviewer, Architecture strategist, Code simplicity reviewer, Pattern recognition specialist, Best practices researcher, Security sentinel, Python testing patterns + +### Key Improvements from Research +1. **Extract `models.py`** -- shared dataclasses move to a dedicated module, preserving the leaf-module independence pattern and keeping `syllabus.py` testable without `notebooklm-py` +2. **Dependency-inject the client** -- async functions accept the client as a parameter instead of opening their own context; caller owns the lifecycle +3. **Use `StrEnum` for chunk status** -- type safety via `ChunkStatus(StrEnum)` instead of bare strings +4. **Use `dict[int, SyllabusChunk]`** internally for O(1) episode lookup; serialize as list in JSON +5. **Sanitize all LLM-derived strings** -- episode titles MUST pass through `sanitize_filename()` before filesystem or API use + +### Scope Reductions (YAGNI) +1. **CUT Phase 4** (download --syllabus integration) -- over-coupling; add later if manual renaming proves painful +2. **CUT `validate_source_ids()`** -- the API already returns clear errors when source IDs are stale +3. **CUT state file version field** -- for a personal tool, re-run `syllabus --force` on schema changes +4. 
**CUT partial-parse-repair** -- binary success/fallback only; no "some chapters parsed, append rest as fixed-size" +5. **SIMPLIFY to 4 chunk states** -- derive "partial" from per-artifact data at display time, don't store it +6. **SIMPLIFY `book_name`** -- default to `output_dir.name`, accept `--book-name` CLI override; no extra API call + +--- + +## Overview + +Add a syllabus-driven workflow for generating NotebookLM audio/video overviews across an entire book, broken into logical chapter chunks. Three new CLI commands (`syllabus`, `generate-next`, `status`) and two new modules (`models.py`, `syllabus.py`) enable: auto-generating a chunk plan via NotebookLM's chat API, stepping through generation one chunk at a time with persistent state, and tracking progress across sessions. + +## Problem Statement / Motivation + +When a 19-chapter book is uploaded to NotebookLM, generating audio/video overviews requires: +1. Manually deciding which chapters to group into each episode +2. Running `pdf-by-chapters generate -c 1-2`, then `-c 3-4`, then `-c 5-6`... N times +3. Tracking which ranges are done, which failed, which need retrying +4. Waiting between each invocation due to rate limits and generation time + +This is tedious, error-prone, and breaks across sessions. The feature automates the chunking decision (via NotebookLM's AI), persists progress, and lets the user step through generation at their own pace. 
+ +## Proposed Solution + +### Architecture + +``` + +-----------------+ + | cli.py | 3 new commands + | syllabus | (presentation layer) + | generate-next | + | status | + +--------+--------+ + | + +--------------+-----------+ + | | | + +------+------+ +----+----+ +----+------+ + | syllabus.py | |models.py| |notebooklm | + | (new) | | (new) | |.py (ext) | + +-------------+ +---------+ +-----------+ + | prompt tmpl | UploadRes | create_ | + | parse resp | NbInfo | syllabus() | + | state r/w | SourceInfo| _poll_until | + | fixed fallbk | ChunkRes | _complete()| + | src mapping | | | + +------+------+ +----+----+ +-----+-----+ + | ^ | + +--------------+-------------+ + | + +---------+---------+ + | syllabus_state.json| + +-------------------+ +``` + +**Key architectural change from original plan:** Extract `models.py` to hold all shared dataclasses (`UploadResult`, `NotebookInfo`, `SourceInfo` moved from `notebooklm.py`, plus new `ChunkResult`). This preserves the leaf-module invariant -- both `syllabus.py` and `notebooklm.py` import from `models.py`, neither imports from each other. `syllabus.py` remains testable without `notebooklm-py` installed. (See Architecture strategist review.) + +### New Module: `src/pdf_by_chapters/models.py` + +Shared dataclasses extracted from `notebooklm.py`: +- `UploadResult` (moved) +- `NotebookInfo` (moved) +- `SourceInfo` (moved) +- `ChunkResult` (new -- replaces raw `dict[str, str]` return type) + +### New Module: `src/pdf_by_chapters/syllabus.py` + +Pure logic module (no Rich, no Typer, no async). 
Follows the `splitter.py` pattern: +- `ChunkStatus(StrEnum)` -- `PENDING`, `GENERATING`, `COMPLETED`, `FAILED` +- Dataclasses: `ChunkArtifact`, `SyllabusChunk`, `SyllabusState` +- `SyllabusState.chunks` is `dict[int, SyllabusChunk]` internally (O(1) lookup), serialized as list +- Prompt template, response parsing, fixed-size fallback, state I/O, source mapping +- Raises `ValueError` / custom exceptions for errors; CLI catches and translates to red output + `typer.Exit(1)` + +### Extended: `src/pdf_by_chapters/notebooklm.py` + +Two new async functions that **accept the client as a parameter** (dependency injection): +- `create_syllabus(client, notebook_id, prompt) -> str` -- sends chat prompt, returns raw AI response +- `generate_chunk(client, notebook_id, source_ids, ...) -> ChunkResult` -- generates audio/video, polls to completion, renames artifacts (best-effort) + +Shared polling logic extracted from existing `generate_for_chapters()`: +- `_poll_until_complete(client, notebook_id, tasks, timeout)` -- reusable poll loop, eliminates duplication + +### Extended: `src/pdf_by_chapters/cli.py` + +Three new `@app.command()` functions with `rich_help_panel="Syllabus"` for visual grouping in `--help`. + +## Technical Approach + +### Chunk Status State Machine + +``` +pending ──> generating ──> completed + │ + └──> failed ──> (user runs generate-next) ──> generating +``` + +4 states (not 5). "Partial completion" (audio done, video failed) is derived at display time from per-artifact statuses. `generate-next` inspects artifact-level data to retry only the failed type. + +```python +class ChunkStatus(StrEnum): + PENDING = "pending" + GENERATING = "generating" + COMPLETED = "completed" + FAILED = "failed" +``` + +`generate-next` auto-selects: first `GENERATING` chunk (resume interrupted), then first `FAILED` chunk (retry), then first `PENDING` chunk (new work). 
+ +### State File Schema (`syllabus_state.json`) + +```json +{ + "notebook_id": "decbc8d1-951e-4ab1-9275-23b4714a6b2b", + "book_name": "Fundamentals_of_Data_Engineering", + "created": "2026-03-10T14:52:00Z", + "max_chapters": 2, + "generate_audio": true, + "generate_video": true, + "chunks": [ + { + "episode": 1, + "title": "Foundations & The Data Engineering Lifecycle", + "chapters": [1, 2], + "source_ids": ["abc-123", "def-456"], + "status": "completed", + "artifacts": { + "audio": {"task_id": "xxx", "status": "completed"}, + "video": {"task_id": "yyy", "status": "completed"} + } + } + ] +} +``` + +Design decisions: +- **No version field** -- YAGNI for a personal tool. If schema changes, re-run `syllabus --force`. (See Simplicity review.) +- **`generate_audio`/`generate_video` flags** captured at syllabus time, overridable per-invocation on `generate-next` +- **Artifacts track `task_id`** (same as `artifact_id` in notebooklm-py) and per-type status +- **Atomic writes**: `tempfile.mkstemp(dir=state_path.parent)` + `os.fsync()` + `os.replace()`. Temp file in same directory ensures same-filesystem for POSIX atomicity. Cleanup temp on failure. (See Best practices research.) +- **Chunks stored as list in JSON** but loaded into `dict[int, SyllabusChunk]` keyed by episode for O(1) access + +### Deserialization Validation + +State file loaded via `SyllabusState.from_json()` classmethod with structural validation: + +```python +@classmethod +def from_json(cls, data: dict[str, Any]) -> SyllabusState: + """Load state from parsed JSON with structural validation.""" + try: + chunks_list = [SyllabusChunk.from_json(c) for c in data["chunks"]] + except (KeyError, TypeError) as exc: + raise SyllabusStateError(f"Corrupt state file: {exc}") from exc + return cls( + notebook_id=data["notebook_id"], + book_name=data["book_name"], + chunks={c.episode: c for c in chunks_list}, + ... + ) +``` + +Custom exceptions in `syllabus.py`: `SyllabusParseError`, `SyllabusStateError`. 
CLI catches these and translates to `console.print("[red]...") + typer.Exit(1)`. + +### Syllabus Prompt Design + +The prompt is sent to `client.chat.ask(notebook_id, question, source_ids=all_source_ids)` and includes the numbered source titles for reliable mapping: + +``` +I have uploaded several sources, each representing a sequential chapter +from a single technical eBook. Here are the chapters: + +1. fundamentals_of_data_engineering_chapter_01_preface.pdf +2. fundamentals_of_data_engineering_chapter_02_foundation_and_building_blocks.pdf +... + +Please divide these chapters into a "Podcast Syllabus" consisting of +logical chunks. Strictly limit each chunk to at most {max_chapters} +chapters. Group them by related technical concepts. + +Format your response EXACTLY as follows, one entry per chunk: + +Episode 1: "Episode Title Here" +Chapters: 1, 2 +Summary: One or two sentence summary. + +Episode 2: "Episode Title Here" +Chapters: 3 +Summary: One or two sentence summary. + +Use ONLY the chapter numbers listed above. Output ONLY the syllabus. +``` + +### Parsing Strategy + +Named regex constant with comment: + +```python +# Matches: Episode 1: "Title Here"\nChapters: 1, 2\nSummary: ... +_EPISODE_RE = re.compile( + r'Episode\s+(\d+):\s*"([^"]+)"\s*\n' + r'Chapters?:\s*([\d,\s]+)\s*\n' + r'Summary:\s*(.+)', + re.IGNORECASE, +) +``` + +**Binary success/fallback** (no partial-parse-repair): +- If regex extracts episodes AND every chapter number appears in at least one episode: **accept** +- Otherwise: fall back entirely to fixed-size chunks, log raw response at DEBUG level, warn user + +ReDoS risk: **none** -- `[\d,\s]+` and `[^"]+` are non-overlapping with their delimiters. Confirmed safe by security review. + +### Source ID Mapping + +Parse chapter number from source title: `r'chapter_(\d+)'` (case-insensitive). + +All-or-nothing: if ANY source title fails to parse, fall back to positional sort for ALL sources. 
+ +**Decoupling from notebooklm.py**: `map_sources_to_chapters()` accepts `list[tuple[str, str]]` (id, title pairs) not `list[SourceInfo]`. The CLI extracts the tuples before calling. This keeps `syllabus.py` independent of `notebooklm-py`. (See Pattern recognition review.) + +### Security: LLM Output Sanitization + +Episode titles from LLM output are **adversarial input** (could contain path traversal, shell metacharacters, or Unicode control characters via indirect prompt injection from uploaded PDFs). + +**Mandatory mitigations** (see Security sentinel Finding 6): +1. Pass ALL LLM-derived episode titles through `sanitize_filename()` before any filesystem or API use +2. Truncate to 100 characters before API rename calls +3. After constructing any file path, validate it resolves under `output_dir`: + ```python + full_path = (output_dir / filename).resolve() + if not str(full_path).startswith(str(output_dir.resolve())): + raise ValueError(f"Path traversal detected: {filename}") + ``` + +### Async Function Design + +**Client as parameter, not owned** (see Python reviewer): + +```python +# Caller owns the client lifecycle +async def create_syllabus( + client: NotebookLMClient, + notebook_id: str, + prompt: str, +) -> str: + """Send syllabus prompt to NotebookLM chat. Returns raw AI response.""" + +async def generate_chunk( + client: NotebookLMClient, + notebook_id: str, + source_ids: list[str], + episode_title: str, + generate_audio: bool = True, + generate_video: bool = True, + timeout: int = 900, +) -> ChunkResult: + """Generate audio/video for a chunk, poll to completion, rename artifacts.""" +``` + +CLI wraps with a single `asyncio.run()` call that owns the client context: + +```python +async def _run_generate_next(state: SyllabusState, chunk: SyllabusChunk, ...) -> ...: + async with await NotebookLMClient.from_storage() as client: + result = await generate_chunk(client, state.notebook_id, ...) 
+ return result + +asyncio.run(_run_generate_next(...)) +``` + +### Shared Polling Loop + +Extract from existing `generate_for_chapters()` to avoid duplicating ~40 lines: + +```python +async def _poll_until_complete( + client: NotebookLMClient, + notebook_id: str, + tasks: dict[str, str], # {label: task_id} + timeout: int = 900, + poll_interval: int = 30, +) -> dict[str, str]: + """Poll artifact generation tasks until complete or timeout. + + Returns {label: "completed"|"failed"}. + """ +``` + +Both `generate_for_chapters()` and `generate_chunk()` call this shared helper. + +### SpecFlow Gap Resolutions + +| Gap | Resolution | +|-----|-----------| +| **State file overwrite** | Refuse if any chunk status != "pending". Add `--force` flag to override. | +| **Partial completion** | Track audio/video independently at artifact level. Chunk-level status is `FAILED` if any artifact failed. `generate-next` retries only failed artifact types. | +| **Atomic writes** | `tempfile.mkstemp(dir=parent)` + `os.fsync()` + `os.replace()`. Cleanup temp on failure. | +| **Stale source_ids** | ~~Pre-validate~~ CUT. API returns clear errors. | +| **Rename failure** | Best-effort. Log warning, mark chunk as completed regardless. | +| **All completed** | Print "All N episodes completed. Use --episode N to regenerate a specific one." Exit 0. | +| **--no-audio/--no-video** | Supported on `generate-next` with `help=` text matching existing `generate` command. | +| **--timeout** | Supported on `generate-next`. Default 900s, matching existing `generate`. | +| **--episode N out of range** | Error: "Episode N not found. Syllabus has episodes 1-M." | +| **--episode N regeneration** | Resets chunk status to pending. Warns about orphaned artifacts. Does not delete. | +| **No state file** | Clear error: "No syllabus found. Run `pdf-by-chapters syllabus` first." | +| **Download integration** | ~~Phase 4~~ CUT. Add later if manual renaming proves painful. 
| +| **book_name** | Defaults to `output_dir.name`. Accept `--book-name` CLI override. No API call. | +| **Concurrency** | Document unsupported. No file locking. | + +## Implementation Phases + +### Phase 0: Extract shared models (`models.py`) + +**New file:** `src/pdf_by_chapters/models.py` + +Move from `notebooklm.py`: +- `UploadResult`, `NotebookInfo`, `SourceInfo` + +Add new: +- `ChunkResult` dataclass (replaces `dict[str, str]` return type) + +Update imports in `notebooklm.py` and `cli.py`. Update test imports. + +This is a pure refactoring commit -- no behavior change, all tests pass. + +### Phase 1: State Management Foundation (`syllabus.py`) + +**New file:** `src/pdf_by_chapters/syllabus.py` + +Deliverables: +- `ChunkStatus(StrEnum)` -- 4 states +- Dataclasses: `ChunkArtifact`, `SyllabusChunk`, `SyllabusState` with `to_json()`/`from_json()` methods +- `SyllabusState.chunks` as `dict[int, SyllabusChunk]`, serialized as list +- Custom exceptions: `SyllabusParseError`, `SyllabusStateError` +- `SYLLABUS_PROMPT_TEMPLATE` -- named constant +- `_EPISODE_RE` -- named regex constant with comment +- `parse_syllabus_response(response: str, source_map: dict[int, str]) -> dict[int, SyllabusChunk]` +- `build_fixed_size_chunks(source_map: dict[int, str], max_chapters: int) -> dict[int, SyllabusChunk]` +- `map_sources_to_chapters(sources: list[tuple[str, str]]) -> dict[int, str]` -- accepts (id, title) tuples +- `read_state(state_path: Path) -> SyllabusState` -- validate structure on load +- `write_state(state: SyllabusState, state_path: Path) -> None` -- atomic write with fsync +- `get_next_chunk(state: SyllabusState) -> SyllabusChunk | None` -- priority: generating > failed > pending +- `STATE_FILENAME = "syllabus_state.json"` + +**Test file:** `tests/unit/test_syllabus.py` + +Tests (heavy use of `pytest.mark.parametrize`): +- `TestParseSyllabusResponse` -- clean parse, zero parse (fallback), preamble text, unicode titles, empty string, whitespace-only +- 
`TestBuildFixedSizeChunks` -- even/odd splits, single chapter, chunk_size > total, chunk_size=0 raises ValueError +- `TestMapSourcesToChapters` -- standard format, case-insensitive, no-match (fallback), duplicates, empty list +- `TestReadWriteState` -- round-trip, atomic write (.tmp cleanup), corrupt JSON, missing keys +- `TestGetNextChunk` -- priority ordering, all completed returns None, generating takes priority over failed + +### Phase 2: NotebookLM Integration (`notebooklm.py`) + +**Additions to** `src/pdf_by_chapters/notebooklm.py`: + +```python +async def create_syllabus( + client: NotebookLMClient, + notebook_id: str, + prompt: str, +) -> str: + """Send syllabus prompt to NotebookLM chat. Returns raw AI response.""" + +async def _poll_until_complete( + client: NotebookLMClient, + notebook_id: str, + tasks: dict[str, str], + timeout: int = 900, + poll_interval: int = 30, +) -> dict[str, str]: + """Shared polling loop for artifact generation. Returns {label: status}.""" + +async def generate_chunk( + client: NotebookLMClient, + notebook_id: str, + source_ids: list[str], + episode_title: str, + generate_audio: bool = True, + generate_video: bool = True, + timeout: int = 900, +) -> ChunkResult: + """Generate audio/video for a chunk, poll to completion, rename artifacts (best-effort).""" +``` + +Refactor existing `generate_for_chapters()` to use `_poll_until_complete()` (DRY). 
+ +**Updates to** `tests/conftest.py`: +- Add `client.chat.ask` as `AsyncMock` to `mock_notebooklm_client` +- Add `client.artifacts.rename` as `AsyncMock` + +**Test additions in** `tests/unit/test_notebooklm.py`: +- `TestCreateSyllabus` -- sends prompt, returns answer, handles empty response +- `TestGenerateChunk` -- happy path, one artifact fails, rename failure (best-effort), timeout +- `TestPollUntilComplete` -- all complete, one fails, timeout + +### Phase 3: CLI Commands (`cli.py`) + +Three new commands with `rich_help_panel="Syllabus"`: + +#### `syllabus` command + +```python +@app.command(rich_help_panel="Syllabus") +def syllabus( + notebook_id: str | None = typer.Option( + None, "--notebook-id", "-n", envvar="NOTEBOOK_ID", + help="Notebook ID to generate syllabus for.", + ), + output_dir: Path = typer.Option(Path("./chapters"), "--output-dir", "-o"), + max_chapters: int = typer.Option(2, "--max-chapters", "-m", + help="Maximum chapters per episode (default: 2).", + ), + book_name: str | None = typer.Option(None, "--book-name", "-b", + help="Book name for state file. Defaults to output directory name.", + ), + force: bool = typer.Option(False, "--force", + help="Overwrite existing syllabus even if chunks are in progress.", + ), + no_audio: bool = typer.Option(False, "--no-audio", + help="Skip audio generation.", + ), + no_video: bool = typer.Option(False, "--no-video", + help="Skip video generation.", + ), +) -> None: +``` + +All imports lazy (inside function body). Catches `SyllabusParseError`/`SyllabusStateError` and translates to `console.print("[red]...") + typer.Exit(1)`. 
+ +#### `generate-next` command + +```python +@app.command("generate-next", rich_help_panel="Syllabus") +def generate_next( + output_dir: Path = typer.Option(Path("./chapters"), "--output-dir", "-o"), + episode: int | None = typer.Option(None, "--episode", "-e", + help="Target a specific episode by number.", + ), + no_audio: bool = typer.Option(False, "--no-audio", + help="Skip audio generation.", + ), + no_video: bool = typer.Option(False, "--no-video", + help="Skip video generation.", + ), + timeout: int = typer.Option(900, "--timeout", "-t", + help="Timeout in seconds (default: 900 = 15min).", + ), +) -> None: + """Generate audio/video for the next pending episode. + + Uses notebook_id from the syllabus state file (not --notebook-id). + """ +``` + +#### `status` command + +```python +@app.command(rich_help_panel="Syllabus") +def status( + output_dir: Path = typer.Option(Path("./chapters"), "--output-dir", "-o"), +) -> None: +``` + +**Test additions in** `tests/unit/test_cli.py`: +- `TestSyllabusCommand` -- happy path, force overwrite, existing state refusal, no sources +- `TestGenerateNextCommand` -- happy path, --episode targeting, no state file, all completed +- `TestStatusCommand` -- happy path, no state file + +### Phase 4: Documentation + +- Update `docs/guide-generate-overviews.md` with syllabus workflow +- Update `docs/guide-study-workflow.md` with end-to-end example +- Update `docs/codemap.md` with new modules (`models.py`, `syllabus.py`) + +## Acceptance Criteria + +### Core Functionality +- [ ] `syllabus` generates valid state file for a multi-chapter book +- [ ] `syllabus` refuses to overwrite non-pending state without `--force` +- [ ] `syllabus` falls back to fixed-size chunks when chat response is unparseable +- [ ] `syllabus --max-chapters N` constrains chunk sizes in the prompt +- [ ] `generate-next` processes one chunk and updates state file atomically +- [ ] `generate-next` resumes correctly after process interruption (chunk left in 
"generating") +- [ ] `generate-next` retries only failed artifact types on partial completion +- [ ] `generate-next --episode N` targets a specific episode +- [ ] `generate-next` renames artifacts in NotebookLM (best-effort, non-blocking) +- [ ] `generate-next` prints clear message when all episodes are completed +- [ ] `status` displays correct progress table with per-artifact status +- [ ] All LLM-derived episode titles pass through `sanitize_filename()` before filesystem/API use + +### Error Handling +- [ ] Missing state file produces clear error with remedy +- [ ] Invalid `--episode N` (out of range) produces clear error +- [ ] Rate-limited generation detected and reported to user +- [ ] Corrupt state file produces clear error via `SyllabusStateError` (not a traceback) +- [ ] `syllabus.py` raises exceptions; `cli.py` catches and translates to `[red]` + `typer.Exit(1)` + +### Testing +- [ ] Syllabus parsing: parametrized across clean, zero (fallback), edge cases +- [ ] Source mapping: parametrized across parseable, unparseable (fallback), empty +- [ ] State management: round-trip, atomic writes (verify .tmp cleanup), corruption +- [ ] CLI commands: all happy paths + key error paths via CliRunner (classes named `Test*Command`) +- [ ] Async functions: mocked notebooklm-py client with `chat.ask` and `artifacts.rename` +- [ ] Pure logic in `syllabus.py` tested without mocks (mock-free majority) + +## Dependencies & Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|-----------|--------|------------| +| NotebookLM chat response format changes | Medium | High | Fixed-size fallback auto-activates. User can edit state JSON. | +| Rate limiting blocks sequential generation | High | Medium | Stateful stepping by design. User controls pacing. | +| `notebooklm-py` API changes | Low | High | Pin minimum version. Wrap API calls with try/except. | +| Prompt produces poor chapter groupings | Medium | Low | User can edit state file. `--force` allows re-running. 
| +| Path traversal via LLM episode titles | Medium | High | `sanitize_filename()` + path containment check. | +| State file corruption from Ctrl+C | Low | Medium | Atomic writes with fsync. | + +## Sources & References + +### Origin + +- **Brainstorm document:** [docs/brainstorms/2026-03-10-chunked-audio-video-generation-brainstorm.md](docs/brainstorms/2026-03-10-chunked-audio-video-generation-brainstorm.md) -- Key decisions: auto-syllabus via chat, stateful next-chunk stepping, JSON state file, episode title naming in NotebookLM, audio/video only scope. + +### Internal References + +- CLI patterns: `src/pdf_by_chapters/cli.py` -- Typer conventions, async wrapping, lazy imports +- API integration: `src/pdf_by_chapters/notebooklm.py` -- client context manager, polling, `_request_chapter_artifact()` +- Module pattern: `src/pdf_by_chapters/splitter.py` -- pure logic module template +- Filename sanitization: `src/pdf_by_chapters/splitter.py:sanitize_filename()` -- MUST be used on all LLM output +- Test fixtures: `tests/conftest.py` -- `mock_notebooklm_client`, `patch_notebooklm` + +### External References + +- notebooklm-py Chat API: `client.chat.ask(notebook_id, question, source_ids=...)` returns `AskResult` +- notebooklm-py Artifacts API: `generate_audio()`, `generate_video()`, `rename()`, `poll_status()` +- notebooklm-py Types: `AskResult`, `GenerationStatus` (with `is_rate_limited`), `AudioFormat`, `VideoStyle` + +### Commit Strategy + +5 logical commits matching the phases: +1. `refactor: extract shared dataclasses to models.py` (Phase 0) +2. `feat: add syllabus state management module` (Phase 1 + tests) +3. `feat: add syllabus chat and chunk generation functions` (Phase 2 + tests) +4. `feat: add syllabus, generate-next, status CLI commands` (Phase 3 + tests) +5.
`docs: add syllabus workflow documentation` (Phase 4) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 6c3c159..1139971 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -97,6 +97,49 @@ notebooklm login Opens browser for Google sign-in. Cookies stored locally. +## Syllabus Issues + +### Syllabus parsing failed + +**Symptom:** `Could not parse syllabus: No episodes found in LLM response` + +**Cause:** NotebookLM's chat response didn't match the expected format. The tool automatically falls back to fixed-size chunks (default 2 chapters per episode). + +**Fixes:** +- Re-run `syllabus --force` to try again (LLM responses vary) +- Adjust `--max-chapters` to change grouping size +- Manually edit `syllabus_state.json` to customise episode groupings + +### Auto-generated artifacts from syllabus command + +**Symptom:** Running `syllabus` creates unexpected audio/slide deck artifacts in NotebookLM. + +**Cause:** This is a NotebookLM platform behaviour. When `chat.ask()` sends a message, Google's backend proactively auto-generates artifacts (audio overview, slide deck) as a side effect. This happens in the web UI too. + +**Workaround:** These auto-generated artifacts are separate from the scoped artifacts created by `generate-next`. They can be safely ignored or deleted via `notebooklm artifact delete ARTIFACT_ID`. + +### Duplicate/identical audio content across episodes + +**Symptom:** Different episodes produce identical-sounding audio. + +**Cause:** NotebookLM's audio generation may not fully respect `source_ids` scoping, especially with the DEEP_DIVE format. The model may pull context from the entire notebook.
+ +**Fixes:** +- Re-run `syllabus --force` to regenerate with updated scoped instructions +- The tool now includes chapter titles in the generation instructions to help NotebookLM focus +- Use `--episode N` to regenerate a specific episode + +### State file stuck in "generating" + +**Symptom:** `generate-next` keeps trying to resume a chunk that was interrupted. + +**Cause:** The process was killed before completion. Task IDs are saved to the state file. + +**Fixes:** +- Run `status --poll` to check if the generation actually completed on NotebookLM's side +- Use `--episode N` to reset and regenerate the stuck episode +- Manually edit `syllabus_state.json` and change the chunk status to `"pending"` + ## Common Errors | Error | Cause | Fix | @@ -105,3 +148,6 @@ Opens browser for Google sign-in. Cookies stored locally. | `Invalid chapter range '1-3'` | Wrong format | Use `--chapters 1-3` | | `start must be >= 1` | Zero or negative chapter number | Chapters are 1-indexed | | `pymupdf not found` | Missing dependency | `uv pip install pymupdf` | +| `No syllabus found` | State file missing | Run `pdf-by-chapters syllabus` first | +| `Syllabus already exists with in-progress chunks` | Existing state has non-pending chunks | Use `--force` to overwrite | +| `Episode N not found` | Invalid `--episode` number | Check syllabus with `status` | diff --git a/docs/use-cases.md b/docs/use-cases.md index dad4624..6c4e571 100644 --- a/docs/use-cases.md +++ b/docs/use-cases.md @@ -108,3 +108,50 @@ pdf-by-chapters split ./ebooks/ -o ./chapters # Process all (each gets its own notebook) pdf-by-chapters process ./ebooks/ ``` + +## UC7: Automated syllabus-driven generation + +Generate a full podcast series from a book with AI-driven chapter grouping. + +```bash +# 1. Upload chapters +pdf-by-chapters process "Data Engineering.pdf" +export NOTEBOOK_ID=YOUR_NOTEBOOK_ID + +# 2. Generate syllabus (audio only) +pdf-by-chapters syllabus -n $NOTEBOOK_ID -o ./chapters --no-video + +# 3.
Generate episodes one at a time (non-blocking) +pdf-by-chapters generate-next -o ./chapters --no-wait +pdf-by-chapters status -o ./chapters --poll # check when ready + +# 4. Repeat for each episode +pdf-by-chapters generate-next -o ./chapters --no-wait + +# 5. Monitor with live display +pdf-by-chapters status -o ./chapters --tail +``` + +```mermaid +flowchart TD + A[process PDF] --> B[syllabus --no-video] + B --> C[generate-next --no-wait] + C --> D[status --poll] + D --> E{Completed?} + E -->|No| D + E -->|Yes| F{More episodes?} + F -->|Yes| C + F -->|No| G[download] +``` + +## UC8: Resume interrupted generation + +If `generate-next` is interrupted (Ctrl+C, connection loss), task IDs are saved. + +```bash +# Check what's in progress +pdf-by-chapters status -o ./chapters --poll + +# The generating chunk will either complete or be retried automatically +pdf-by-chapters generate-next -o ./chapters +``` diff --git a/src/pdf_by_chapters/cli.py b/src/pdf_by_chapters/cli.py index d2ad142..37baa08 100644 --- a/src/pdf_by_chapters/cli.py +++ b/src/pdf_by_chapters/cli.py @@ -3,6 +3,7 @@ import asyncio import logging from pathlib import Path +from typing import Any import typer from rich.console import Console @@ -94,7 +95,8 @@ def process( Handles single files or directories. For directories, each PDF gets its own subdirectory and notebook. 
""" - from pdf_by_chapters.notebooklm import UploadResult, upload_chapters + from pdf_by_chapters.models import UploadResult + from pdf_by_chapters.notebooklm import upload_chapters pdfs = _resolve_pdfs(source) all_uploads: list[tuple[list[Path], str, str | None]] = [] @@ -253,3 +255,481 @@ def delete_cmd( nb_id = _get_notebook_id(notebook_id) typer.confirm(f"Delete notebook {nb_id}?", abort=True) asyncio.run(delete_notebook(nb_id)) + + +# --------------------------------------------------------------------------- +# Syllabus workflow commands +# --------------------------------------------------------------------------- + + +@app.command(rich_help_panel="Syllabus") +def syllabus( + notebook_id: str | None = typer.Option( + None, + "--notebook-id", + "-n", + envvar="NOTEBOOK_ID", + help="Notebook ID to generate syllabus for.", + ), + output_dir: Path = typer.Option(Path("./chapters"), "--output-dir", "-o"), + max_chapters: int = typer.Option( + 2, "--max-chapters", "-m", help="Maximum chapters per episode (default: 2)." + ), + book_name: str | None = typer.Option( + None, "--book-name", "-b", help="Book name for state file. Defaults to output dir name." + ), + force: bool = typer.Option( + False, "--force", help="Overwrite existing syllabus even if chunks are in progress." 
+ ), + no_audio: bool = typer.Option(False, "--no-audio", help="Skip audio generation."), + no_video: bool = typer.Option(False, "--no-video", help="Skip video generation."), +) -> None: + """Generate a podcast syllabus via NotebookLM chat and save as a plan.""" + from datetime import UTC, datetime + + from notebooklm import NotebookLMClient + + from pdf_by_chapters.notebooklm import create_syllabus as _create_syllabus + from pdf_by_chapters.notebooklm import list_sources + from pdf_by_chapters.syllabus import ( + STATE_FILENAME, + SyllabusParseError, + SyllabusState, + SyllabusStateError, + build_fixed_size_chunks, + build_prompt, + has_non_pending_chunks, + map_sources_to_chapters, + parse_syllabus_response, + read_state, + write_state, + ) + + nb_id = _get_notebook_id(notebook_id) + state_path = output_dir / STATE_FILENAME + resolved_book_name = book_name or output_dir.resolve().name + + # Check for existing state + if state_path.is_file() and not force: + try: + existing = read_state(state_path) + if has_non_pending_chunks(existing): + console.print( + "[red]Syllabus already exists with in-progress chunks. " + "Use --force to overwrite.[/red]" + ) + raise typer.Exit(1) + except SyllabusStateError: + pass # Corrupt file — safe to overwrite + + # Fetch sources and build mapping + sources_list = asyncio.run(list_sources(nb_id)) + if not sources_list: + console.print("[red]No sources found in notebook. 
Upload chapters first.[/red]") + raise typer.Exit(1) + + source_tuples = [(s.id, s.title) for s in sources_list] + source_map, title_map = map_sources_to_chapters(source_tuples) + + # Build and send prompt + prompt = build_prompt(source_tuples, max_chapters) + + async def _run_syllabus() -> str: + async with await NotebookLMClient.from_storage() as client: + return await _create_syllabus(client, nb_id, prompt) + + response = asyncio.run(_run_syllabus()) + + # Parse response, fall back to fixed-size on failure + try: + chunks = parse_syllabus_response(response, source_map, title_map) + console.print("[green]Syllabus parsed successfully from NotebookLM.[/green]") + except SyllabusParseError as exc: + console.print(f"[yellow]Could not parse syllabus: {exc}[/yellow]") + console.print("[yellow]Falling back to fixed-size chunks.[/yellow]") + chunks = build_fixed_size_chunks(source_map, max_chapters, title_map) + + # Build and save state + state = SyllabusState( + notebook_id=nb_id, + book_name=resolved_book_name, + created=datetime.now(UTC).isoformat(), + max_chapters=max_chapters, + generate_audio=not no_audio, + generate_video=not no_video, + chunks=chunks, + ) + write_state(state, state_path) + + # Display syllabus table + table = Table(title=f"Syllabus: {resolved_book_name}") + table.add_column("#", justify="right", style="dim") + table.add_column("Title", style="bold") + table.add_column("Chapters", style="cyan") + table.add_column("Status") + for chunk in state.chunks.values(): + ch_str = ", ".join(str(c) for c in chunk.chapters) + table.add_row(str(chunk.episode), chunk.title, ch_str, chunk.status.value) + console.print(table) + console.print(f"\nState saved to {state_path}") + console.print(f"Next: Run [bold]pdf-by-chapters generate-next -o {output_dir}[/bold]") + + +@app.command("generate-next", rich_help_panel="Syllabus") +def generate_next( + output_dir: Path = typer.Option(Path("./chapters"), "--output-dir", "-o"), + episode: int | None = typer.Option( + 
None, "--episode", "-e", help="Target a specific episode by number." + ), + no_audio: bool = typer.Option(False, "--no-audio", help="Skip audio generation."), + no_video: bool = typer.Option(False, "--no-video", help="Skip video generation."), + no_wait: bool = typer.Option( + False, "--no-wait", help="Start generation and return immediately without polling." + ), + timeout: int = typer.Option( + 900, "--timeout", "-t", help="Timeout in seconds (default: 900 = 15min)." + ), +) -> None: + """Generate audio/video for the next pending episode. + + Uses notebook_id from the syllabus state file (not --notebook-id). + With --no-wait, fires the request and returns immediately. Use 'status --poll' + to check progress later. + """ + from notebooklm import NotebookLMClient + + from pdf_by_chapters.notebooklm import start_chunk_generation + from pdf_by_chapters.syllabus import ( + STATE_FILENAME, + ChunkArtifact, + ChunkStatus, + SyllabusStateError, + get_next_chunk, + read_state, + write_state, + ) + + state_path = output_dir / STATE_FILENAME + + try: + state = read_state(state_path) + except SyllabusStateError as exc: + console.print(f"[red]{exc}[/red]") + raise typer.Exit(1) from None + + # Select chunk + if episode is not None: + if episode not in state.chunks: + max_ep = max(state.chunks.keys()) if state.chunks else 0 + console.print( + f"[red]Episode {episode} not found. Syllabus has episodes 1-{max_ep}.[/red]" + ) + raise typer.Exit(1) + chunk = state.chunks[episode] + if chunk.status == ChunkStatus.COMPLETED: + console.print( + f"[yellow]Episode {episode} already completed. " + "Resetting to pending (previous artifacts remain in NotebookLM).[/yellow]" + ) + chunk.status = ChunkStatus.PENDING + chunk.artifacts = {} + else: + chunk = get_next_chunk(state) + if chunk is None: + total = len(state.chunks) + console.print( + f"[green]All {total} episodes completed. 
" + "Use --episode N to regenerate a specific one.[/green]" + ) + raise typer.Exit(0) + + # Determine audio/video flags + gen_audio = (not no_audio) and state.generate_audio + gen_video = (not no_video) and state.generate_video + + console.print( + f"Generating episode {chunk.episode}: [bold]{chunk.title}[/bold] " + f"(chapters {', '.join(str(c) for c in chunk.chapters)})" + ) + + # Fire generation requests and persist task IDs immediately + # (recoverable on Ctrl+C — task_ids saved before polling starts) + + async def _start() -> dict[str, str]: + async with await NotebookLMClient.from_storage() as client: + return await start_chunk_generation( + client, + state.notebook_id, + chunk.source_ids, + chunk.title, + generate_audio=gen_audio, + generate_video=gen_video, + chapter_titles=chunk.chapter_titles or None, + ) + + tasks = asyncio.run(_start()) + if not tasks: + console.print("[red]Failed to start any generation requests.[/red]") + chunk.status = ChunkStatus.FAILED + write_state(state, state_path) + raise typer.Exit(1) + + for label, task_id in tasks.items(): + chunk.artifacts[label] = ChunkArtifact(task_id=task_id, status="in_progress") + chunk.status = ChunkStatus.GENERATING + write_state(state, state_path) + + if no_wait: + console.print( + f"[green]Generation started for episode {chunk.episode}.[/green]\n" + f"Use [bold]pdf-by-chapters status -o {output_dir} --poll[/bold] to check progress." + ) + return + + console.print("[dim]Generation started. Polling every 30s... 
(Ctrl+C is safe)[/dim]") + + # Now poll to completion + from pdf_by_chapters.notebooklm import poll_chunk_status + + async def _poll_loop() -> None: + elapsed = 0 + poll_interval = 30 + async with await NotebookLMClient.from_storage() as client: + while elapsed < timeout: + await asyncio.sleep(poll_interval) + elapsed += poll_interval + pending_tasks = { + label: art.task_id + for label, art in chunk.artifacts.items() + if art.task_id and art.status not in ("completed", "failed") + } + if not pending_tasks: + break + statuses = await poll_chunk_status(client, state.notebook_id, pending_tasks) + for label, new_status in statuses.items(): + chunk.artifacts[label].status = new_status + write_state(state, state_path) + + # Check if all done + if all(a.status in ("completed", "failed") for a in chunk.artifacts.values()): + break + + # Rename completed artifacts (best-effort) + safe_title = sanitize_filename(chunk.title)[:100] + if safe_title: + import contextlib + + for _label, art in chunk.artifacts.items(): + if art.task_id and art.status == "completed": + with contextlib.suppress(Exception): + await client.artifacts.rename( + state.notebook_id, art.task_id, safe_title + ) + + try: + asyncio.run(_poll_loop()) + except KeyboardInterrupt: + console.print( + "\n[yellow]Interrupted. 
@app.command(rich_help_panel="Syllabus")
def status(
    output_dir: Path = typer.Option(Path("./chapters"), "--output-dir", "-o"),
    poll: bool = typer.Option(False, "--poll", help="Check API for status of generating chunks."),
    tail: bool = typer.Option(
        False, "--tail", help="Live display. Polls every 30s until generation completes."
    ),
) -> None:
    """Show syllabus progress for chunked generation.

    Use --poll to check the NotebookLM API for in-progress artifacts and
    update the state file. Use --tail for a live-updating display.
    """
    # Project imports stay function-local, as elsewhere in this command, so
    # they are only paid for when the command actually runs.
    from rich.markup import escape

    from pdf_by_chapters.syllabus import (
        STATE_FILENAME,
        ChunkStatus,
        SyllabusStateError,
        read_state,
        write_state,
    )

    state_path = output_dir / STATE_FILENAME

    try:
        state = read_state(state_path)
    except SyllabusStateError as exc:
        console.print(f"[red]{exc}[/red]")
        raise typer.Exit(1) from None

    def _generating() -> list[Any]:
        """Return the chunks currently marked GENERATING."""
        return [c for c in state.chunks.values() if c.status == ChunkStatus.GENERATING]

    def _open_tasks(chunk: Any) -> dict[str, str]:
        """Return {label: task_id} for artifacts that can still be polled."""
        return {
            label: art.task_id
            for label, art in chunk.artifacts.items()
            if art.task_id and art.status != "completed"
        }

    async def _refresh(chunks_to_poll: list[Any]) -> None:
        """Poll open tasks, then derive each chunk's status from its artifacts.

        Shared by --poll and --tail so both modes apply the same rules,
        including the best-effort rename of finished artifacts.
        """
        import contextlib

        from notebooklm import NotebookLMClient

        from pdf_by_chapters.notebooklm import poll_chunk_status
        from pdf_by_chapters.splitter import sanitize_filename

        async with await NotebookLMClient.from_storage() as client:
            for chunk in chunks_to_poll:
                tasks = _open_tasks(chunk)
                if tasks:
                    statuses = await poll_chunk_status(client, state.notebook_id, tasks)
                    for label, new_status in statuses.items():
                        chunk.artifacts[label].status = new_status

                # Settle the chunk even when nothing was left to poll: an
                # interrupted generate-next can leave every artifact completed
                # while the chunk itself is still marked GENERATING.
                artifacts = list(chunk.artifacts.values())
                if not artifacts:
                    # all([]) is True; never promote a chunk with no artifacts.
                    continue

                if all(a.status == "completed" for a in artifacts):
                    chunk.status = ChunkStatus.COMPLETED
                    # Best-effort rename; failures must not affect the status.
                    safe_title = sanitize_filename(chunk.title)[:100]
                    if safe_title:
                        for art in artifacts:
                            if art.task_id:
                                with contextlib.suppress(Exception):
                                    await client.artifacts.rename(
                                        state.notebook_id,
                                        art.task_id,
                                        safe_title,
                                    )
                elif any(a.status == "failed" for a in artifacts):
                    chunk.status = ChunkStatus.FAILED

    # If --poll, check API for generating chunks and update state
    if poll:
        generating = _generating()
        if generating:
            asyncio.run(_refresh(generating))
            write_state(state, state_path)
            console.print("[dim]Polled API for in-progress artifacts.[/dim]\n")
        else:
            console.print("[dim]No generating chunks to poll.[/dim]\n")

    def _build_status_table(st: Any, elapsed_str: str = "") -> Table:
        """Render the syllabus state as a Rich table, one row per episode."""
        _styles = {
            ChunkStatus.COMPLETED: "[green]completed[/green]",
            ChunkStatus.GENERATING: "[yellow]generating[/yellow]",
            ChunkStatus.FAILED: "[red]failed[/red]",
            ChunkStatus.PENDING: "[dim]pending[/dim]",
        }
        # Book and episode titles come from outside (file names / LLM output);
        # escape them so stray "[...]" sequences are not parsed as markup.
        tbl = Table(title=f"Syllabus: {escape(st.book_name)}")
        tbl.add_column("#", justify="right", style="dim")
        tbl.add_column("Title", style="bold")
        tbl.add_column("Chapters", style="cyan")
        tbl.add_column("Audio")
        tbl.add_column("Video")
        tbl.add_column("Status")

        done = 0
        for ch in sorted(st.chunks.values(), key=lambda c: c.episode):
            ch_str = ", ".join(str(n) for n in ch.chapters)
            a = ch.artifacts.get("audio")
            v = ch.artifacts.get("video")
            tbl.add_row(
                str(ch.episode),
                escape(ch.title),
                ch_str,
                a.status if a else "-",
                v.status if v else "-",
                _styles.get(ch.status, ch.status.value),
            )
            if ch.status == ChunkStatus.COMPLETED:
                done += 1

        tbl.caption = f"{done}/{len(st.chunks)} episodes completed"
        if elapsed_str:
            tbl.caption += f" | Elapsed: {elapsed_str}"
        return tbl

    if tail:
        import time

        from rich.live import Live

        poll_interval = 30
        start_time = time.monotonic()

        def _elapsed() -> str:
            secs = int(time.monotonic() - start_time)
            return f"{secs // 60:02d}:{secs % 60:02d}"

        live = Live(
            _build_status_table(state, _elapsed()),
            console=console,
            refresh_per_second=1,
        )
        with live:
            while generating := _generating():
                time.sleep(poll_interval)

                asyncio.run(_refresh(generating))
                write_state(state, state_path)
                live.update(_build_status_table(state, _elapsed()))

                # Stop when the remaining GENERATING chunks have no task IDs
                # to poll: their status can no longer change from here.
                if not any(_open_tasks(c) for c in _generating()):
                    break

            # Final update
            live.update(_build_status_table(state, _elapsed()))

        if _generating():
            console.print(
                "[yellow]Some chunks are still marked generating but have no task IDs "
                "left to poll.[/yellow]"
            )
        return

    console.print(_build_status_table(state))
str - title: str - sources_count: int - - -@dataclass -class SourceInfo: - """Summary of a source within a notebook.""" +from pdf_by_chapters.models import NotebookInfo, SourceInfo, UploadResult - id: str - title: str +logger = logging.getLogger(__name__) async def upload_chapters( @@ -276,3 +251,115 @@ async def delete_notebook(notebook_id: str) -> None: async with await NotebookLMClient.from_storage() as client: await client.notebooks.delete(notebook_id) logger.info("Deleted notebook %s", notebook_id) + + +async def create_syllabus( + client: NotebookLMClient, + notebook_id: str, + prompt: str, +) -> str: + """Send syllabus prompt to NotebookLM chat. + + Args: + client: An open NotebookLM client. + notebook_id: The notebook ID. + prompt: The syllabus generation prompt. + + Returns: + Raw AI response text. + """ + result = await client.chat.ask(notebook_id, prompt) + return result.answer + + +def _build_instructions(episode_title: str, chapter_titles: list[str] | None) -> dict[str, str]: + """Build scoped instructions referencing specific chapter titles.""" + if chapter_titles: + ch_list = ", ".join(chapter_titles) + return { + "audio": ( + f"Focus ONLY on these specific chapters: {ch_list}. " + f"Create an engaging audio deep-dive covering: {episode_title}. " + "Do not discuss content from other chapters." + ), + "video": ( + f"Focus ONLY on these specific chapters: {ch_list}. " + f"Create a visual explainer covering: {episode_title}. " + "Do not discuss content from other chapters." + ), + } + return { + "audio": f"Create an engaging audio overview: {episode_title}", + "video": f"Create a visual explainer: {episode_title}", + } + + +async def start_chunk_generation( + client: NotebookLMClient, + notebook_id: str, + source_ids: list[str], + episode_title: str, + generate_audio: bool = True, + generate_video: bool = True, + chapter_titles: list[str] | None = None, +) -> dict[str, str]: + """Fire off generation requests without polling. Returns {label: task_id}. 
async def poll_chunk_status(
    client: NotebookLMClient,
    notebook_id: str,
    tasks: dict[str, str],
) -> dict[str, str]:
    """Poll each task once and report its current state.

    Args:
        client: An open NotebookLM client.
        notebook_id: The notebook ID.
        tasks: Mapping of label -> task_id.

    Returns:
        Mapping of label -> status string: "completed", "failed",
        "in_progress" or "pending", or "unknown" when polling that task
        raised an exception.
    """
    # Flags are read in this order and the first truthy one wins; later
    # flags are not read once an earlier one matches.
    flag_names = (
        ("is_complete", "completed"),
        ("is_failed", "failed"),
        ("is_in_progress", "in_progress"),
    )

    outcome: dict[str, str] = {}
    for label, task_id in tasks.items():
        try:
            polled = await client.artifacts.poll_status(notebook_id, task_id)
            outcome[label] = next(
                (name for flag, name in flag_names if getattr(polled, flag)),
                "pending",
            )
        except Exception as e:
            # Best-effort: one failing poll must not abort the others.
            logger.warning("Poll error for %s: %s", label, e)
            outcome[label] = "unknown"
    return outcome
@dataclass
class ChunkArtifact:
    """State of one generated artifact (audio or video) belonging to a chunk."""

    # Identifier of the generation task; empty until a task has been recorded.
    task_id: str = ""
    # Last recorded status string for the artifact.
    status: str = "pending"

    def to_json(self) -> dict[str, str]:
        """Return the artifact as a JSON-serialisable dict."""
        return dict(task_id=self.task_id, status=self.status)

    @classmethod
    def from_json(cls, data: dict[str, str]) -> "ChunkArtifact":
        """Build an artifact from a dict, defaulting any missing field."""
        task_id = data.get("task_id", "")
        status = data.get("status", "pending")
        return cls(task_id=task_id, status=status)
def parse_syllabus_response(
    response: str,
    source_map: dict[int, str],
    title_map: dict[int, str] | None = None,
) -> dict[int, SyllabusChunk]:
    """Parse LLM syllabus response into chunks.

    Uses binary success/fallback: the response is accepted only if every
    episode is well-formed and all chapters are covered by the parsed
    episodes. Otherwise SyllabusParseError is raised.

    The response is LLM output and is treated as untrusted: any malformed
    entry is reported as SyllabusParseError and never as another exception
    type.

    Args:
        response: Raw LLM response text.
        source_map: Mapping of chapter_number -> source_id.
        title_map: Mapping of chapter_number -> source title.

    Returns:
        Dict of episode_number -> SyllabusChunk.

    Raises:
        SyllabusParseError: If no episodes are found, an episode number is
            repeated, an episode lists no chapters or chapters that are not
            in ``source_map``, or some chapter is left unassigned.
    """
    logger.debug("Raw syllabus response: %s", response)
    title_map = title_map or {}

    matches = _EPISODE_RE.findall(response)
    if not matches:
        raise SyllabusParseError("No episodes found in LLM response")

    chunks: dict[int, SyllabusChunk] = {}
    assigned_chapters: set[int] = set()

    for ep_str, title, chapters_str, *_ in matches:
        episode = int(ep_str)
        if episode in chunks:
            # Overwriting would drop the earlier episode while its chapters
            # still count as assigned, so they would never be generated.
            raise SyllabusParseError(f"Episode {episode} appears more than once")

        # Extract digit runs instead of splitting on commas: the pattern also
        # admits whitespace-separated lists ("1 2"), which int() would reject.
        chapter_nums = [int(num) for num in re.findall(r"\d+", chapters_str)]
        if not chapter_nums:
            raise SyllabusParseError(f"Episode {episode} lists no chapters")

        unknown = sorted({c for c in chapter_nums if c not in source_map})
        if unknown:
            raise SyllabusParseError(
                f"Episode {episode} references unknown chapters {unknown}"
            )

        assigned_chapters.update(chapter_nums)
        chunks[episode] = SyllabusChunk(
            episode=episode,
            title=title.strip(),
            chapters=chapter_nums,
            source_ids=[source_map[c] for c in chapter_nums],
            chapter_titles=[title_map[c] for c in chapter_nums if c in title_map],
        )

    missing = set(source_map) - assigned_chapters
    if missing:
        raise SyllabusParseError(f"Chapters {sorted(missing)} not assigned to any episode")

    return chunks
def map_sources_to_chapters(
    sources: list[tuple[str, str]],
) -> tuple[dict[int, str], dict[int, str]]:
    """Map chapter numbers to source IDs and titles by parsing source titles.

    All-or-nothing: if any source title fails to parse, or two titles yield
    the same chapter number, falls back to positional indexing (1-based, in
    list order) for all sources. Keeping the first or last duplicate would
    silently drop a source.

    Args:
        sources: List of (source_id, title) tuples.

    Returns:
        Tuple of (chapter_number -> source_id, chapter_number -> title).
        A missing title is stored as an empty string.
    """
    if not sources:
        return {}, {}

    id_map: dict[int, str] = {}
    title_map: dict[int, str] = {}
    for source_id, title in sources:
        match = _CHAPTER_NUM_RE.search(title or "")
        if not match:
            logger.warning(
                "Cannot parse chapter number from '%s'; using positional fallback",
                title,
            )
            break
        chapter_num = int(match.group(1))
        if chapter_num in id_map:
            logger.warning(
                "Chapter number %d appears in more than one source title ('%s'); "
                "using positional fallback",
                chapter_num,
                title,
            )
            break
        id_map[chapter_num] = source_id
        title_map[chapter_num] = title or ""
    else:
        # Loop finished without a break: every title parsed to a unique number.
        return id_map, title_map

    ids = {pos: sid for pos, (sid, _) in enumerate(sources, 1)}
    titles = {pos: t or "" for pos, (_, t) in enumerate(sources, 1)}
    return ids, titles
def write_state(state: SyllabusState, state_path: Path) -> None:
    """Persist the syllabus state as JSON, replacing the target atomically.

    The JSON is written to a temporary file in the target's directory,
    flushed and fsynced, then moved over the target with os.replace. If any
    step fails, the temporary file is removed and the error propagates.

    Args:
        state: The state to persist.
        state_path: Target file path.
    """
    target_dir = state_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(state.to_json(), indent=2, ensure_ascii=False)

    # Same directory as the target, so the final move stays on one filesystem.
    handle, scratch = tempfile.mkstemp(
        prefix=".syllabus_state_",
        suffix=".tmp",
        dir=str(target_dir),
    )
    replaced = False
    try:
        with os.fdopen(handle, "w", encoding="utf-8") as out:
            out.write(payload)
            out.write("\n")
            out.flush()
            os.fsync(out.fileno())
        os.replace(scratch, str(state_path))
        replaced = True
    finally:
        # Runs for any exception (including KeyboardInterrupt): remove the
        # leftover temp file, then let the original error propagate.
        if not replaced:
            with contextlib.suppress(OSError):
                os.unlink(scratch)
def has_non_pending_chunks(state: SyllabusState) -> bool:
    """Return True if at least one chunk has a status other than PENDING."""
    for chunk in state.chunks.values():
        if chunk.status != ChunkStatus.PENDING:
            return True
    return False
class TestSyllabusCommand:
    """Tests for the syllabus CLI command."""

    @staticmethod
    def _run(tmp_path, *extra):
        """Invoke `syllabus` for notebook nb-123 with tmp_path as output dir."""
        return runner.invoke(app, ["syllabus", "-n", "nb-123", "-o", str(tmp_path), *extra])

    @staticmethod
    def _seed_completed_state(tmp_path):
        """Write a state file whose only chunk is already completed."""
        finished = SyllabusChunk(
            episode=1,
            title="Done",
            chapters=[1],
            source_ids=["s1"],
            status=ChunkStatus.COMPLETED,
        )
        _make_state(tmp_path, chunks={1: finished})

    def test_missing_notebook_id(self, tmp_path):
        outcome = runner.invoke(app, ["syllabus", "-o", str(tmp_path)])
        assert outcome.exit_code != 0

    def test_no_sources_error(self, patch_notebooklm, tmp_path):
        client = patch_notebooklm[0]
        client.sources.list.return_value = []
        outcome = self._run(tmp_path)
        assert outcome.exit_code != 0
        assert "No sources" in outcome.stdout

    def test_creates_state_file(self, patch_notebooklm, tmp_path):
        outcome = self._run(tmp_path)
        assert outcome.exit_code == 0
        state_file = tmp_path / "syllabus_state.json"
        assert state_file.is_file()
        saved = json.loads(state_file.read_text())
        assert saved["notebook_id"] == "nb-123"
        assert len(saved["chunks"]) >= 1

    def test_refuses_overwrite_without_force(self, patch_notebooklm, tmp_path):
        # Create existing state with a completed chunk
        self._seed_completed_state(tmp_path)
        outcome = self._run(tmp_path)
        assert outcome.exit_code != 0
        assert "force" in outcome.stdout.lower()

    def test_force_overwrites(self, patch_notebooklm, tmp_path):
        self._seed_completed_state(tmp_path)
        outcome = self._run(tmp_path, "--force")
        assert outcome.exit_code == 0
class TestCreateSyllabus:
    """Tests for create_syllabus."""

    async def test_returns_answer(self, patch_notebooklm):
        client = patch_notebooklm[0]
        answer = await create_syllabus(client, "nb-123", "Create a syllabus")
        assert "Episode 1" in answer
        client.chat.ask.assert_called_once_with("nb-123", "Create a syllabus")

    async def test_empty_response(self, patch_notebooklm):
        client = patch_notebooklm[0]
        # An empty answer is passed through unchanged.
        client.chat.ask.return_value = MagicMock(answer="")
        answer = await create_syllabus(client, "nb-123", "prompt")
        assert answer == ""

    async def test_passes_notebook_id(self, patch_notebooklm):
        client = patch_notebooklm[0]
        await create_syllabus(client, "my-nb-id", "prompt")
        positional = client.chat.ask.call_args.args
        assert positional[0] == "my-nb-id"
chapter_02_basics.pdf" in prompt + + def test_includes_max_chapters(self): + sources = [("s1", "ch1.pdf")] + prompt = build_prompt(sources, max_chapters=3) + assert "at most 3" in prompt + + +class TestParseSyllabusResponse: + """Tests for parse_syllabus_response.""" + + def test_clean_parse(self, source_map): + response = ( + 'Episode 1: "Foundations"\n' + "Chapters: 1, 2\n" + "Summary: Covers the basics.\n\n" + 'Episode 2: "Intermediate"\n' + "Chapters: 3, 4\n" + "Summary: Goes deeper.\n\n" + 'Episode 3: "Advanced"\n' + "Chapters: 5\n" + "Summary: Expert topics.\n" + ) + chunks = parse_syllabus_response(response, source_map) + assert len(chunks) == 3 + assert chunks[1].title == "Foundations" + assert chunks[1].chapters == [1, 2] + assert chunks[1].source_ids == ["s1", "s2"] + assert chunks[3].chapters == [5] + + def test_empty_response_raises(self, source_map): + with pytest.raises(SyllabusParseError, match="No episodes found"): + parse_syllabus_response("", source_map) + + def test_unstructured_text_raises(self, source_map): + with pytest.raises(SyllabusParseError, match="No episodes found"): + parse_syllabus_response("Just some random text about chapters.", source_map) + + def test_missing_chapters_raises(self, source_map): + response = 'Episode 1: "Partial"\nChapters: 1, 2, 3\nSummary: Only covers three.\n' + with pytest.raises(SyllabusParseError, match="not assigned"): + parse_syllabus_response(response, source_map) + + def test_with_preamble_text(self, source_map): + response = ( + "Here is your podcast syllabus:\n\n" + 'Episode 1: "Part One"\n' + "Chapters: 1, 2, 3\n" + "Summary: First part.\n\n" + 'Episode 2: "Part Two"\n' + "Chapters: 4, 5\n" + "Summary: Second part.\n" + ) + chunks = parse_syllabus_response(response, source_map) + assert len(chunks) == 2 + + def test_single_chapter_episode(self): + source_map = {1: "s1"} + response = 'Episode 1: "Solo"\nChapters: 1\nSummary: Just one chapter.\n' + chunks = parse_syllabus_response(response, source_map) + 
assert chunks[1].chapters == [1] + + +class TestBuildFixedSizeChunks: + """Tests for build_fixed_size_chunks.""" + + @pytest.mark.parametrize( + "num_chapters,chunk_size,expected_count", + [ + pytest.param(5, 2, 3, id="uneven-split"), + pytest.param(4, 2, 2, id="even-split"), + pytest.param(1, 5, 1, id="single-chapter"), + pytest.param(3, 1, 3, id="chunk-size-one"), + pytest.param(3, 100, 1, id="chunk-larger-than-input"), + ], + ) + def test_chunk_count(self, num_chapters, chunk_size, expected_count): + source_map = {i + 1: f"s{i + 1}" for i in range(num_chapters)} + chunks = build_fixed_size_chunks(source_map, chunk_size) + assert len(chunks) == expected_count + + def test_no_items_lost(self, source_map): + chunks = build_fixed_size_chunks(source_map, 2) + all_chapters = [] + for chunk in chunks.values(): + all_chapters.extend(chunk.chapters) + assert sorted(all_chapters) == [1, 2, 3, 4, 5] + + def test_chunk_size_zero_raises(self, source_map): + with pytest.raises(ValueError, match="max_chapters must be >= 1"): + build_fixed_size_chunks(source_map, 0) + + def test_empty_source_map_raises(self): + with pytest.raises(ValueError, match="source_map is empty"): + build_fixed_size_chunks({}, 2) + + def test_titles_contain_chapter_range(self, source_map): + chunks = build_fixed_size_chunks(source_map, 2) + assert chunks[1].title == "Chapters 1-2" + assert chunks[3].title == "Chapters 5-5" + + def test_episodes_numbered_sequentially(self, source_map): + chunks = build_fixed_size_chunks(source_map, 2) + assert list(chunks.keys()) == [1, 2, 3] + + +class TestMapSourcesToChapters: + """Tests for map_sources_to_chapters.""" + + def test_standard_format(self): + sources = [ + ("s1", "book_chapter_01_intro.pdf"), + ("s2", "book_chapter_02_basics.pdf"), + ] + id_map, title_map = map_sources_to_chapters(sources) + assert id_map == {1: "s1", 2: "s2"} + assert title_map[1] == "book_chapter_01_intro.pdf" + + def test_case_insensitive(self): + sources = [("s1", 
"CHAPTER_01_UPPER.pdf")] + id_map, _ = map_sources_to_chapters(sources) + assert id_map == {1: "s1"} + + def test_double_digit(self): + sources = [("s10", "chapter_10_advanced.pdf")] + id_map, _ = map_sources_to_chapters(sources) + assert id_map == {10: "s10"} + + def test_no_match_falls_back_to_positional(self): + sources = [("s1", "random_document.pdf"), ("s2", "another_file.pdf")] + id_map, _ = map_sources_to_chapters(sources) + assert id_map == {1: "s1", 2: "s2"} + + def test_empty_list(self): + id_map, title_map = map_sources_to_chapters([]) + assert id_map == {} + assert title_map == {} + + def test_mixed_parseable_unparseable_falls_back(self): + sources = [ + ("s1", "chapter_01_intro.pdf"), + ("s2", "appendix.pdf"), + ] + id_map, _ = map_sources_to_chapters(sources) + assert id_map == {1: "s1", 2: "s2"} + + +class TestReadWriteState: + """Tests for read_state and write_state.""" + + def test_round_trip(self, tmp_path, sample_state): + state_path = tmp_path / "state.json" + write_state(sample_state, state_path) + loaded = read_state(state_path) + + assert loaded.notebook_id == sample_state.notebook_id + assert loaded.book_name == sample_state.book_name + assert len(loaded.chunks) == len(sample_state.chunks) + assert loaded.chunks[1].title == "Foundations" + assert loaded.chunks[1].status == ChunkStatus.COMPLETED + + def test_atomic_write_no_temp_left(self, tmp_path, sample_state): + state_path = tmp_path / "state.json" + write_state(sample_state, state_path) + files = list(tmp_path.iterdir()) + assert len(files) == 1 + assert files[0].name == "state.json" + + def test_read_missing_file_raises(self, tmp_path): + with pytest.raises(SyllabusStateError, match="No syllabus found"): + read_state(tmp_path / "nonexistent.json") + + def test_read_corrupt_json_raises(self, tmp_path): + bad_file = tmp_path / "state.json" + bad_file.write_text("{bad json", encoding="utf-8") + with pytest.raises(SyllabusStateError, match="Cannot read state file"): + read_state(bad_file) + 
+ def test_read_missing_keys_raises(self, tmp_path): + bad_file = tmp_path / "state.json" + bad_file.write_text('{"chunks": [{}]}', encoding="utf-8") + with pytest.raises(SyllabusStateError): + read_state(bad_file) + + def test_write_creates_parent_dirs(self, tmp_path, sample_state): + deep_path = tmp_path / "a" / "b" / "state.json" + write_state(sample_state, deep_path) + assert deep_path.is_file() + + def test_json_is_readable(self, tmp_path, sample_state): + state_path = tmp_path / "state.json" + write_state(sample_state, state_path) + data = json.loads(state_path.read_text()) + assert data["notebook_id"] == "nb-123" + assert isinstance(data["chunks"], list) + assert data["chunks"][0]["episode"] == 1 + + +class TestGetNextChunk: + """Tests for get_next_chunk.""" + + def test_returns_pending_chunk(self, sample_state): + chunk = get_next_chunk(sample_state) + assert chunk is not None + assert chunk.episode == 2 + assert chunk.status == ChunkStatus.PENDING + + def test_generating_has_priority_over_pending(self, sample_state): + sample_state.chunks[3].status = ChunkStatus.GENERATING + chunk = get_next_chunk(sample_state) + assert chunk is not None + assert chunk.episode == 3 + + def test_failed_has_priority_over_pending(self, sample_state): + sample_state.chunks[3].status = ChunkStatus.FAILED + chunk = get_next_chunk(sample_state) + assert chunk is not None + assert chunk.episode == 3 + + def test_generating_has_priority_over_failed(self, sample_state): + sample_state.chunks[2].status = ChunkStatus.FAILED + sample_state.chunks[3].status = ChunkStatus.GENERATING + chunk = get_next_chunk(sample_state) + assert chunk is not None + assert chunk.episode == 3 + + def test_all_completed_returns_none(self, sample_state): + for c in sample_state.chunks.values(): + c.status = ChunkStatus.COMPLETED + assert get_next_chunk(sample_state) is None + + def test_empty_chunks_returns_none(self): + state = SyllabusState( + notebook_id="nb", + book_name="book", + created="", + 
max_chapters=2, + generate_audio=True, + generate_video=True, + chunks={}, + ) + assert get_next_chunk(state) is None + + +class TestHasNonPendingChunks: + """Tests for has_non_pending_chunks.""" + + def test_all_pending_returns_false(self, sample_state): + for c in sample_state.chunks.values(): + c.status = ChunkStatus.PENDING + assert not has_non_pending_chunks(sample_state) + + def test_one_completed_returns_true(self, sample_state): + assert has_non_pending_chunks(sample_state)