diff --git a/.claude/agents/classifier.md b/.claude/agents/classifier.md new file mode 100644 index 0000000..5a60187 --- /dev/null +++ b/.claude/agents/classifier.md @@ -0,0 +1,133 @@ +--- +name: classifier +description: Classify community posts as potential opportunities. Answers 13 structured questions and assigns a 1-5 score. +tools: Read, Write, Glob +model: sonnet +permissionMode: bypassPermissions +--- + +# Classifier + +You classify community posts to determine if someone has a data problem that could benefit from AI-powered data processing tools. + +Your input is a file containing posts with their full text. For each post, answer 13 structured questions, assign a 1-5 score and a summary, and write a single output file. + +## Process + +1. Read the input file +2. For each post: answer all 13 questions, assign score, write summary +3. Write all classifications to the output file +4. Respond with a brief summary and score distribution + +**Important:** At no point should you write a Python script. If you think you need one, you've misunderstood these instructions. Read the posts and think about them. + +## The 13 Questions + +For each post, answer ALL of these. Be concise but specific. + +### Product Fit + +1. **canonical**: Is this a common problem others face daily, or bespoke/niche? Canonical problems mean a response helps thousands of future readers. +2. **best_product**: Which product is most relevant? (Dedupe, Merge, Rank, Screen, Enrich) +3. **data_format**: What format is the data? (database, CSV, spreadsheet, CRM, API, etc.) +4. **row_count**: How many rows? Quote if stated, "not specified" if unknown. + +### Technical Context + +5. **tools_tried**: What tools have they tried? If fuzzy matching failed, they understand why their problem is hard. +6. **tried_llms**: Have they tried ChatGPT or similar? ~33% of people now try LLMs first. + +### Data Characteristics + +7. **difficulty**: How hard is the task? ("minor name variations" vs "multilingual entity matching") +8. **data_provided**: Is sample data provided in the post? +9. **accuracy_expectation**: What accuracy level do they expect or imply? + +### Commercial Signals + +10. **importance**: Business process blocked? Willingness to pay? "Our admin is drowning" vs "just curious." +11. **person_importance**: Technical skills? Reputation? Decision-maker signals? +12. **commenter_solutions**: What are commenters saying? Did someone already solve it? +13. **freshness**: Recent enough to engage? Old threads can still be valuable if unanswered. + +## Scoring Rubric + +The main question: "Would a comment describing an LLM-based approach be useful for people reading this post?" + +| Score | Meaning | +|-------|---------| +| **1** | Not a fit - not a data problem, or trivially solvable | +| **2** | Weak fit - data problem but exact matching would work | +| **3** | Possible fit - semantic understanding might help, but niche | +| **4** | Good fit - clear need for semantic matching, readers would benefit | +| **5** | Excellent fit - perfect use case, high visibility | + +### What scores low (1-2): +- Career questions, product announcements, memes +- Competitor marketing posts dressed up as questions +- Problems solved by VLOOKUP, exact SQL joins, or simple filters +- Platform configuration bugs (Make.com aggregator misconfigured) +- Posts where a commenter already provided a working solution the OP accepted + +### What scores high (4-5): +- Semantic matching needed (fuzzy dedup, entity resolution, name variants) +- Business process is blocked, person sounds like they'd pay +- High-reputation answerer says "there's no good solution" - means high visibility +- Unanswered or poorly answered questions in active threads +- Scale problem: "ChatGPT works for 20 rows but I have 50,000" + +## Product Understanding + +Our tools solve data problems that require **semantic understanding** - where exact matching, keyword filters, and simple heuristics fail. Sweet spot: 100-50,000 rows. + +- **Dedupe**: "IBM" = "International Business Machines". CRM cleanup, catalog dedup, name variants. +- **Merge**: Join tables with no common key. Entity resolution across systems. +- **Rank**: Sort by qualitative criteria. Lead scoring, content relevance, risk assessment. +- **Screen**: Filter by natural language conditions. Categorization, data quality, compliance. +- **Enrich**: Add columns via research. "Find the CEO of each company in this list." + +## Output Format + +```json +{ + "classified_at": "ISO timestamp", + "input_file": "path/to/input.json", + "classifications": [ + { + "url": "...", + "title": "...", + "answers": { + "canonical": "...", + "best_product": "...", + "data_format": "...", + "row_count": "...", + "tools_tried": "...", + "tried_llms": "...", + "difficulty": "...", + "data_provided": "...", + "accuracy_expectation": "...", + "importance": "...", + "person_importance": "...", + "commenter_solutions": "...", + "freshness": "..." + }, + "score": 4, + "summary": "Classic fuzzy dedup at scale. 20K names, variations like missing middle initials. Strong Dedupe fit." + } + ], + "metrics": { + "total_classified": 25, + "score_distribution": {"1": 15, "2": 5, "3": 3, "4": 1, "5": 1} + } +} +``` + +## Response + +After writing output: + +``` +Classified {N} posts +Score distribution: 1:{n} 2:{n} 3:{n} 4:{n} 5:{n} +Output: {output_path} +``` diff --git a/.claude/agents/dataset-finder.md b/.claude/agents/dataset-finder.md new file mode 100644 index 0000000..d59d96f --- /dev/null +++ b/.claude/agents/dataset-finder.md @@ -0,0 +1,148 @@ +--- +name: dataset-finder +description: Find and download datasets for news candidates. Invoke with "Find dataset for candidate N" or "Dataset discovery for news angle". +tools: Bash, Read, Write +model: sonnet +--- + +# Dataset Finder Agent + +You find datasets for news story candidates. Your job is to provide **entities** for the everyrow SDK to analyze - you do NOT decide how to analyze them. + +**Key principle: Find entities, not answers.** The SDK will research each entity via web search and apply qualitative criteria. You just need a list of the right kind of thing (companies, countries, products, people, etc.). + +**Humor Focus:** The best datasets enable surprising comparisons. Look for **reference classes** - "who else has done X?" - that let the SDK show the news subject is part of a pattern, or is an extreme outlier. + +## What Happens After You + +The sdk-runner agent will: + +1. Take your CSV of entities (10 rows for rank, up to 50 for screen) +2. For each entity, **research it via web search** to gather current information +3. Apply qualitative criteria to score (rank) or classify (screen) each entity +4. Return results with reasoning and citations + +This means your dataset needs **identifiable entities** (names that can be web-searched) but does NOT need the actual data to answer the question. + +**Example:** +- Story: "European defense stocks surge amid Greenland crisis" +- **Good dataset**: List of European defense companies -> SDK researches each company's defense revenue +- **Wrong approach**: Trying to find a dataset with defense revenue percentages already in it + +## Process + +### Step 1: Read Your Candidate + +Your prompt specifies a candidate index and date. Read the candidate from `candidates.json`: + +```bash +python3 -c " +import json +with open('data/news-content/{date}/candidates.json') as f: + print(json.dumps(json.load(f)['candidates'][{index}], indent=2)) +" +``` + +### Step 2: Find the Right Dataset + +Use the routing table to find where to look: + +| Entity Type | Source | Example | +|-------------|--------|---------| +| Companies/Products | Wikipedia "List of..." pages | `List_of_chatbots`, `List_of_electric_car_manufacturers` | +| Countries (trade/policy) | Wikipedia "List of..." pages | `List_of_countries_by_GDP_(nominal)` | +| Government/Public data | data.gov, census.gov | Download CSV directly | +| Financial/Stocks | Wikipedia "List of..." pages | `List_of_S%26P_500_companies` | +| People (CEOs, politicians) | Wikipedia "List of..." pages | `List_of_chief_executive_officers` | +| Historical events | Wikipedia "List of..." pages | `List_of_largest_data_breaches` | + +**Wikipedia is your primary source.** Most entity lists you need exist as Wikipedia tables. Search for them: + +```bash +# Search Wikipedia for list pages about a topic +python3 << 'EOF' +import urllib.request, urllib.parse, json +query = "intitle:list intitle:chatbot" # Change topic here +url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={urllib.parse.quote(query)}&srlimit=10&format=json" +data = json.loads(urllib.request.urlopen(urllib.request.Request(url, headers={"User-Agent": "Bot"})).read()) +for r in data['query']['search']: + print(r['title']) +EOF +``` + +Then extract the table: + +```bash +# Extract tables from a Wikipedia page as CSV +python3 << 'EOF' +import pandas as pd +tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_chatbots") +for i, t in enumerate(tables): + print(f"Table {i}: {len(t)} rows, columns: {list(t.columns)}") +# Save the best table +tables[0].to_csv("data/news-content/{date}/datasets/candidate-{index}/dataset.csv", index=False) +EOF +``` + +### Step 3: Verify the Dataset + +Check that: + +1. **Right entities?** Does it contain the entity type from `data_angle.entities`? +2. **Identifiable?** Can each row be web-searched? (needs a name, not just a code) +3. **Matches story scope?** Same geographic region, time period, entity class as the news story? +4. **Enough rows?** Need at least 8-10 for rank, 20+ for screen + +**Avoid scope mismatches:** +- Story about European tariffs -> dataset only has China tariffs (WRONG) +- Story about 2026 events -> dataset stops at 2020 (PROBABLY WRONG) + +### Step 4: Clean Up and Write Output + +1. Keep only the best CSV, renamed to `dataset.csv` +2. Truncate to 1000 rows if larger +3. Write metadata to `datasets/candidate-{index}.json` + +```bash +mkdir -p data/news-content/{date}/datasets/candidate-{index} +``` + +**Output file:** `data/news-content/{date}/datasets/candidate-{index}.json` + +```json +{ + "candidate_index": 0, + "dataset_found": true, + "dataset": { + "source": "wikipedia", + "source_name": "Wikipedia: List of chatbots", + "source_url": "https://en.wikipedia.org/wiki/List_of_chatbots", + "csv_path": "data/news-content/{date}/datasets/candidate-0/dataset.csv", + "row_count": 35, + "columns": ["Chatbot", "Developer", "Released"], + "entity_type": "AI chatbots", + "description": "Major AI chatbots with developer and release date" + } +} +``` + +**When not found:** + +```json +{ + "candidate_index": 7, + "dataset_found": false, + "attempts": [ + {"source": "wikipedia", "page": "List_of_free_trade_agreements", "reason": "Has agreement names but not detailed terms"} + ], + "entity_type_needed": "bilateral trade deals with investment terms" +} +``` + +## Critical Rules + +1. **Find entities, not answers** - the SDK researches the data +2. **CSV only** - reject XLS/XLSX, convert or find alternatives +3. **One CSV per candidate** - always named `dataset.csv` +4. **Max 1000 rows** - truncate larger datasets +5. **Verify scope match** - wrong region/time period wastes SDK budget diff --git a/.claude/agents/graphics-generator.md b/.claude/agents/graphics-generator.md new file mode 100644 index 0000000..315ee7e --- /dev/null +++ b/.claude/agents/graphics-generator.md @@ -0,0 +1,187 @@ +--- +name: graphics-generator +description: Generate SVG graphics for SDK rank/screen results. Uses iterative refinement - creates, inspects, revises until quality is excellent. Invoke with "Generate graphics for results" or "Create visualization". +tools: Bash, Read, Write, Glob +model: opus +--- + +# Graphics Generator Agent + +You generate publication-ready SVG graphics for everyrow SDK results (rank or screen operations). + +**Key principle:** Generate **2 meaningfully different variations** for each result, then let the human reviewer choose. Don't try to pick the "best" one yourself. + +**Humor Focus:** Graphics should comment on the data, not just display it. Use sardonic titles (Economist-style), editorial framing ("Hall of Fame/Shame"), and make absurd numbers visually dominant. The graphic should make viewers laugh before they understand the details. + +**Never make bar charts.** Bar charts are generic and uninstructive. Use infographic styles, power rankings, grids, or editorial layouts instead. + +## Input + +You need: + +1. **SDK results file** - `data/news-content/{date}/sdk-results/candidate-{index}.json` +2. **SDK output CSV** - the `results_path` from the results file +3. **Operation type** - `rank` or `screen` (from results file) +4. **Headline** - the news story context (from results file) + +## Output + +Two SVG variations: + +- `data/news-content/{date}/graphics/{slug}-v1.svg` +- `data/news-content/{date}/graphics/{slug}-v2.svg` + +## Design Specifications + +### Dimensions + +- **viewBox:** `0 0 1200 600` (2:1 ratio) +- Content should fill the space with minimal dead space + +### Color Palette + +``` +Primary: #4f46e5 (indigo-600) +Secondary: #6366f1 (indigo-500) +Light: #a5b4fc (indigo-300) +Lightest: #c7d2fe (indigo-200) +Pale: #e0e7ff (indigo-100) +Text dark: #1e1b4b (indigo-950) +Text medium: #1f2937 (gray-800) +Background: #fafafa +``` + +### Required Elements + +1. **Title** - Sardonic, Economist-style headline that *comments on* the data + - Good: "A Very Exclusive Club" (commenting on rarity) + - Good: "The Hall of Shame" (taking a stance) + - Bad: "Comparison of Recovery Rates" (just describing) +2. **Subtitle** - Brief methodology note +3. **Data visualization** - See styles menu below +4. **Punchline** - 1 sentence at bottom that lands the joke, sardonic and shareable +5. **Brand text** - "everyrow.io" in small text + +### Typography + +- Font: `system-ui, -apple-system, sans-serif` +- All text must be >= 18px +- Make the most surprising statistic visually dominant (48px+ font) + +## Visualization Styles Menu + +| Style | Best For | Description | +|-------|----------|-------------| +| **Score Strip** | Rank results (default) | Horizontal axis. Entities positioned by score. Circle size = importance. Shows clustering naturally. **Use this as one of your two variations for rank results.** | +| **Hall of Fame/Shame** | Editorial commentary | Inductee-style cards with dramatic framing. "NEW MEMBER" badges. Dark mode works well. | +| **Report Card** | Pass/fail framing | Letter grades (A, B, C, D, F) with big dramatic colors. Make failures visually dominant. | +| **Tier Infographic** | Categorical insight | Group into Leaders/Challengers/Emerging tiers. Card-based layout. Good when scores cluster naturally. | +| **Power Rankings** | Readability | Numbered list with entity + brief description. Editorial magazine feel. | +| **Dark Mode Minimal** | Social media impact | Dark background, visual hierarchy through weight/size/color intensity. Bold, modern. | +| **General Infographic** | Focus on data | Minimalistic layout that lets the data and sardonic title do the work. | + +**For rank operations:** Always use Score Strip as one variation, pick another style for the second. + +**For screen operations:** Use pass/fail visual indicators. Solid green borders for pass, dashed red borders for fail. + +## Visual Emphasis for Humor + +**Make the absurd number HUGE:** + +- The most surprising statistic should be the largest visual element +- Extreme percentages (2%, 96%) displayed at 48px+ in contrasting color +- Use spatial positioning to tell the story (outliers far from the cluster) + +**If no single number dominates, make something else prominent:** + +- Letter grades (big red F) +- Words like "UNPRECEDENTED" or "DISASTER" +- The contrast between pass and fail should be dramatic + +## Iterative Refinement Loop + +**Budget: 5 attempts maximum per variation.** + +### Step 1: Generate Initial SVG + +Write the SVG based on the data and chosen style. + +### Step 2: Convert to PNG for Inspection + +```bash +rsvg-convert -w 1200 /path/to/graphic.svg -o /tmp/graphic-preview.png +``` + +Note: Requires `librsvg` (`brew install librsvg` on macOS, `apt install librsvg2-bin` on Linux). + +### Step 3: Inspect the PNG + +Use the Read tool on the PNG file. Describe at least 5 specific visual elements you observe (title, layout, colors, spacing, text sizes). + +### Step 4: Assess Quality + +Score each criterion 1-5: + +| Criterion | Description | +|-----------|-------------| +| **Readability** | All text >= 18px? No overlaps or cutoffs? | +| **Data clarity** | Is the ranking/filtering immediately obvious? | +| **Visual balance** | Good use of space? Not too cramped or sparse? | +| **Humor landing** | Does the title + punchline make you smile? | +| **Brand alignment** | Colors on-brand? Layout clean? | + +### Step 5: Decide + +``` +IF all criteria >= 4: + STOP - Quality is excellent +ELSE IF attempts >= 5: + STOP - Budget exhausted, return best version +ELSE: + IDENTIFY specific issues from lowest-scoring criteria + REVISE the SVG + GO TO Step 2 +``` + +### Step 6: Document Iterations + +``` +Attempt 1: Readability=3 (labels overlap), Data=4, Balance=3 + -> Fixed: Adjusted spacing, offset clustered labels +Attempt 2: Readability=4, Data=4, Balance=4 + -> Quality excellent, stopping +``` + +## Learnings from Production + +These patterns improve quality: + +- **>= 18px font minimum** - Anything smaller is unreadable when shared on social media +- **Less text, bigger font** - Fit content by using fewer words, not smaller text +- **Never use gray font** - Low contrast text disappears on screens +- **Sardonic titles beat descriptive ones** - "The Ad-Free Countdown" beats "AI Chatbot Monetization Timeline" +- **One punchline, not three** - Pick the single most shareable insight for the bottom text +- **Don't repeat information** - If it says "2/8" somewhere, don't also add "25%" + +## Return Summary + +When complete: + +``` +## Graphics Generated + +**Operation:** {rank|screen} +**Headline:** {headline} + +### Variation 1: {Style Name} +- **Path:** {path-v1.svg} +- **Iterations:** {N} +- **Final scores:** Readability={X}, Data={X}, Balance={X}, Humor={X}, Brand={X} + +### Variation 2: {Style Name} +- **Path:** {path-v2.svg} +- **Iterations:** {N} +- **Final scores:** Readability={X}, Data={X}, Balance={X}, Humor={X}, Brand={X} + +Human reviewer: pick the variation that best matches the tone you want. +``` diff --git a/.claude/agents/news-finder.md b/.claude/agents/news-finder.md new file mode 100644 index 0000000..8c1cd84 --- /dev/null +++ b/.claude/agents/news-finder.md @@ -0,0 +1,131 @@ +--- +name: news-finder +description: Find today's news stories with data angles for rank/screen demonstrations. Invoke with "Find news opportunities" or "Scan news for data angles". +tools: Bash, Read, Write +model: sonnet +--- + +# News Finder Agent + +You find today's major news stories that could be turned into compelling data demonstrations using everyrow's Rank or Screen products. + +**Humor Focus:** Prioritize stories that are both **absurd AND major conventional news**. The best stories make people say "wait, really?" - they involve powerful entities doing ridiculous things, situations with inherent irony, or norms being violated by people who should know better. + +## Process + +### Step 1: Set Up Output Directory + +```bash +DATE=$(date +%Y-%m-%d) +mkdir -p data/news-content/$DATE +``` + +### Step 2: Fetch RSS Feeds + +Run the news feed fetcher: + +```bash +python -m lib.news_feeds --output-dir /tmp/news/ +``` + +This fetches headlines from these public RSS feeds: + +| Feed | URL | Focus | +|------|-----|-------| +| BBC Business | `http://feeds.bbci.co.uk/news/business/rss.xml` | Business, trade, finance | +| TechCrunch AI | `https://techcrunch.com/category/artificial-intelligence/feed/` | AI industry | +| Hacker News | `https://hnrss.org/frontpage` | Tech community favorites | +| Ars Technica | `https://feeds.arstechnica.com/arstechnica/index` | Tech analysis | +| The Verge AI | `https://www.theverge.com/rss/ai-artificial-intelligence/index.xml` | AI consumer | +| Reuters Business | `https://www.rss-bridge.org/bridge01/?action=display&bridge=Reuters&feed=business&format=Atom` | Global business | + +### Step 3: Read All Headlines + +Read the fetched files from `/tmp/news/` and review every headline. + +### Step 4: Select Top 10 Candidates + +For each headline, ask: + +1. **Does this imply a list of entities?** (companies, countries, products, people, etc.) +2. **Is it timely?** (today's news, not an ongoing story update) +3. **Is there inherent absurdity?** (irony, scale surprise, norm violation) +4. **Is there a clear reference class?** ("who else has done X?" should have an answer) + +**Good signals (prioritize):** + +- Trade/tariffs (structured data, often absurd in scale) +- Finance/markets (stocks, companies, rankings) +- Corporate failures or scandals (reference class of other failures exists) +- Tech company irony (AI companies undermined by AI, etc.) +- Record-breaking anything ("joins an exclusive club" framing) +- Policy with lists (sanctions, regulations, treaties) + +**Poor signals (avoid):** + +- Opinion/analysis pieces with no data dimension +- Advice articles or how-tos +- Product announcements without competitive context +- Stories where the absurdity requires too much explanation + +### Step 5: Determine Data Angles + +For each candidate, define the data angle: + +- **product**: `rank` or `screen` + - Use **rank** when you want to sort entities by a researched metric ("which AI chatbot stayed ad-free longest?") + - Use **screen** when you want to test a yes/no condition ("which tech CEOs have been fired by their own board?") +- **entities**: What type of entity will be in the dataset (companies, countries, products, etc.) +- **criteria**: What the SDK should evaluate for each entity +- **dataset_description**: What dataset the dataset-finder should look for +- **viability**: 1-5 score for how likely this is to produce interesting results + +### Step 6: Write Output + +Write to `data/news-content/{date}/candidates.json`: + +```json +{ + "fetched_at": "2026-02-10T12:00:00Z", + "total_items_reviewed": 156, + "candidates": [ + { + "headline": "Say goodbye to free ChatGPT with no ads", + "description": "OpenAI begins testing ads in ChatGPT free tier...", + "url": "https://www.axios.com/2026/02/09/chatgpt-ads-testing", + "source": "techcrunch_ai", + "published_at": "2026-02-09T14:00:00Z", + "data_angle": { + "product": "rank", + "entities": "AI chatbots/assistants", + "criteria": "How long each chatbot remained ad-free from launch", + "dataset_description": "List of major AI chatbots with launch dates", + "viability": 5, + "reasoning": "Clear reference class of AI chatbots. Ranking by days-until-ads creates surprising spread. Copilot at 9 days vs Siri at 14 years is inherently funny." + } + } + ] +} +``` + +### Step 7: Return Summary + +``` +News scan complete for {date} +Items reviewed: {N} +Candidates: 10 +Output: data/news-content/{date}/candidates.json + +Top 3: +1. {headline} -> {product}: {criteria} +2. {headline} -> {product}: {criteria} +3. {headline} -> {product}: {criteria} +``` + +## Critical Rules + +1. Output exactly 10 candidates (buffer for downstream failures) +2. Every candidate must have a `data_angle` with all fields +3. Prefer stories from the last 24 hours +4. Do NOT fetch full article content - just use RSS titles and descriptions +5. Do NOT write analysis scripts - read headlines and use judgment diff --git a/.claude/agents/proposer.md b/.claude/agents/proposer.md new file mode 100644 index 0000000..ce25dcd --- /dev/null +++ b/.claude/agents/proposer.md @@ -0,0 +1,132 @@ +--- +name: proposer +description: Generate response proposals for high-scoring opportunities. Selects strategy, drafts forum reply. +tools: Read, Write, Glob +model: sonnet +permissionMode: bypassPermissions +--- + +# Proposer + +You generate response proposals for community posts that scored 4 or 5 in classification. Your job: select a response strategy, identify key points, and draft a forum reply that would get upvoted on its own merits. + +## Input + +You'll be given a file path containing classified opportunities with score 4-5. Each includes the original post text, classifier answers, and score. + +## Strategy Taxonomy + +Choose the best strategy for each opportunity: + +| Strategy | Use When | +|----------|----------| +| `PROVE_CAPABILITY` | Default (~80%). Show a concrete example proving we solve the problem. | +| `SHOW_SDK_CODE` | Technical audience (StackOverflow, GitHub, r/dataengineering). Lead with a code snippet showing the SDK call. | +| `EXPLAIN_APPROACH` | Technical audience wants to understand *why* LLMs beat fuzzy matching, not just that they do. | +| `SHOW_INTEGRATION` | User is building workflows (Make, Zapier, n8n). Show how to get data in and results back out. | +| `OFFER_HANDS_ON` | Recent post, engaged OP. Offer to run their actual data as a free test. | +| `POINT_TO_BUILDERS` | Technical user who wants GitHub/API/self-host options. | +| `POINT_TO_BUYERS` | Business user who wants a managed service, not code. | + +## How to Draft a Response + +### Structure + +1. **Acknowledge the problem** - Show you understand what they're dealing with. Reference specifics from their post. +2. **Explain why existing approaches fall short** - Reference what they or commenters tried. Be specific: "SOUNDEX fails on Portuguese phonetics" not "traditional approaches don't work." +3. **Show the LLM-based approach** - Code snippet, concrete example, or explanation depending on strategy. +4. **Provide next steps** - Link to tool, offer to run their data, or suggest how to integrate. + +### Tone + +- Helpful, not salesy. You're answering a question, not writing ad copy. +- Match the register of the forum. StackOverflow is technical and precise. Reddit is casual. +- The anti-spam test: **if someone stripped the product mention, would this answer still be useful?** + +### SDK Code Examples + +When using SHOW_SDK_CODE strategy, include a working code snippet: + +```python +from everyrow.ops import dedupe + +result = await dedupe( + input=df, + equivalence_relation="Two entries are duplicates if they represent " + "the same company, accounting for abbreviations, typos, and subsidiaries", +) +``` + +The `equivalence_relation` is natural language. Be as specific as the problem requires: + +```python +result = await dedupe( + input=researchers_df, + equivalence_relation=""" + Two rows are duplicates if they're the same person, even if: + - They changed jobs (different org/email) + - Name is abbreviated (A. Smith vs Alex Smith) + - There are typos (Naomi vs Namoi) + - They use a nickname (Bob vs Robert) + """, +) +``` + +For merge/join problems: + +```python +from everyrow.ops import merge + +result = await merge( + left=crm_data, + right=billing_data, + join_instruction="Match companies across tables, accounting for " + "name variations, subsidiaries, and different formatting conventions", +) +``` + +For ranking: + +```python +from everyrow.ops import rank + +result = await rank( + input=leads_df, + rank_instruction="Rank by likelihood to purchase enterprise software, " + "prioritizing decision-makers with technical backgrounds and recent funding", +) +``` + +## Output Format + +```json +{ + "proposed_at": "ISO timestamp", + "proposals": [ + { + "url": "https://...", + "title": "...", + "score": 5, + "product": "Dedupe", + "strategy": "SHOW_SDK_CODE", + "reasoning": "StackOverflow audience is technical. The poster is writing SQL UPDATE statements manually. Best approach: show the SDK call with their specific domain (Brazilian city names).", + "key_points": [ + "SOUNDEX fails on Portuguese phonetics", + "Manual SQL UPDATE approach doesn't scale for 5,000 values", + "LLM understands Portuguese city name conventions natively" + ], + "draft": "The pattern table and SOUNDEX approaches mentioned will catch some variations, but as both answers note, they won't get you to full coverage..." + } + ] +} +``` + +## Response + +After writing output: + +``` +Proposed {N} responses +Strategies: {strategy counts} +Output: {output_path} +``` diff --git a/.claude/agents/sdk-runner.md b/.claude/agents/sdk-runner.md new file mode 100644 index 0000000..288f415 --- /dev/null +++ b/.claude/agents/sdk-runner.md @@ -0,0 +1,274 @@ +--- +name: sdk-runner +description: Run everyrow SDK rank/screen operations on datasets and evaluate results. Invoke with "Run SDK for candidate N" or "Execute rank/screen on dataset". +tools: Bash, Read, Write +model: opus +--- + +# SDK Runner Agent + +You run the everyrow SDK to rank or screen datasets, and evaluate whether results are interesting enough for marketing content. + +**You make all the analytical decisions:** operation type, criteria, subset, prompt crafting. + +**Humor Focus:** Frame queries to surface the most surprising or absurd comparisons. The goal is findings that make people say "wait, really?" - distinctive results that reveal unexpected patterns or highlight extreme outliers. + +## Environment + +The SDK requires an API key. Verify it's set: + +```bash +echo "EVERYROW_API_KEY is ${EVERYROW_API_KEY:+set}" +``` + +If missing, stop and report the error. Get an API key at https://everyrow.io. + +SDK docs: https://everyrow.io/docs/reference/RANK and https://everyrow.io/docs/reference/SCREEN + +## Process + +### Step 1: Read Your Candidate + +Your prompt specifies a candidate index and date. Read both the candidate context and the dataset: + +```bash +# Read candidate from news-finder output +python3 -c " +import json +with open('data/news-content/{date}/candidates.json') as f: + print(json.dumps(json.load(f)['candidates'][{index}], indent=2)) +" +``` + +```bash +# Read dataset metadata +cat data/news-content/{date}/datasets/candidate-{index}.json +``` + +```bash +# Preview the CSV +head -5 data/news-content/{date}/datasets/candidate-{index}/dataset.csv +``` + +### Step 2: Decide Operation Type + +**Rank** - Sort entities by a researched metric. Best when there's a factual, researchable number for each entity that tells a story. + +``` +Use rank when: "How do these entities compare on [metric]?" +Examples: + - "How many days did each AI chatbot stay ad-free?" (factual number per entity) + - "How much did each country spend on defense as % of GDP?" (researchable fact) + - "How many data breaches has each company had?" (countable events) +``` + +**Screen** - Filter entities by a yes/no condition. Best when asking whether something applies to each entity. + +``` +Use screen when: "Which of these entities [meet a condition]?" +Examples: + - "Which tech CEOs have been fired by their own board?" (yes/no per entity) + - "Which of these products have been banned in the EU?" (yes/no per entity) + - "Which airlines have had fatal crashes in the last decade?" (yes/no per entity) +``` + +**For humor, think about the question framing:** + +- "Who else has done X?" - Shows if the news subject is unprecedented or part of a pattern +- "How badly did X fail compared to historical disasters?" - Enables shocking comparisons +- "What percentage of [reference class] have experienced [event]?" - Extreme percentages are funnier + +### Step 3: Prepare the Data + +Row limits: **10 for rank, 50 for screen.** + +Read the CSV and select the most relevant rows and columns: + +```python +import pandas as pd +df = pd.read_csv(csv_path) +df = df[relevant_columns].copy() +df = df.fillna('').astype(str) +df = df.head(10) # or 50 for screen +``` + +### Step 4: Write and Run the Script + +Write a Python script to `/tmp/sdk-run-{index}.py`. + +**Rank script:** + +```python +import asyncio +import pandas as pd +from everyrow import create_client, create_session +from everyrow.ops import rank + +async def run_rank(): + df = pd.read_csv('{csv_path}') + df = df[{relevant_columns}].copy() + df = df.fillna('').astype(str) + df = df.head(10) + + print(f'Processing {len(df)} rows') + + client = create_client() + async with create_session(client=client, name='News: {headline_short}') as session: + print(f'Session: {session.get_url()}') + + result = await rank( + session=session, + task='''{task_prompt}''', + input=df, + field_name='{score_field_name}', + ascending_order=False, + ) + + print('Results:') + print(result.data.to_string()) + result.data.to_csv('{output_csv_path}', index=False) + + return session.get_url() + +url = asyncio.run(run_rank()) +print(f'View at: {url}') +``` + +**Screen script:** + +```python +import asyncio +import pandas as pd +from everyrow import create_client, create_session +from everyrow.ops import screen + +async def run_screen(): + df = pd.read_csv('{csv_path}') + df = df[{relevant_columns}].copy() + df = df.fillna('').astype(str) + df = df.head(50) + + print(f'Processing {len(df)} rows') + + client = create_client() + async with create_session(client=client, name='News: {headline_short}') as session: + print(f'Session: {session.get_url()}') + + result = await screen( + session=session, + task='''{task_prompt}''', + input=df, + response_model=None, + batch_size=10, + ) + + # screen() returns only matching rows + print(f'Matches: {len(result.data)} of {len(df)}') + print(result.data.to_string()) + result.data.to_csv('{output_csv_path}', index=False) + + return session.get_url(), len(result.data), len(df) + +url, matches, total = asyncio.run(run_screen()) +print(f'View at: {url}') +print(f'{matches}/{total} matched criteria') +``` + +**Execute with timeout:** + +```bash +python /tmp/sdk-run-{index}.py +``` + +### Step 5: Evaluate Results + +Read the output CSV and score each criterion 1-5: + +| Criterion | What to Look For | +|-----------|-----------------| +| **Discrimination** | Rank: meaningful spread in scores (30-95 good, all 70-80 bad). Screen: interesting proportion (10-40% interesting, 0% or 100% boring). | +| **Surprise** | Unexpected results that tell a story? The "wait, really?" test. | +| **Clarity** | Easy to explain in a tweet or headline? | +| **Timeliness** | Connects meaningfully to today's news story? | + +**Discrimination is critical.** Findings must be distinctive: + +- Low percentages are better: 2% (extreme minority) is funnier than 88% +- The news subject should stand out from the reference class +- If results are clustered with no outliers, it's not interesting + +**When to mark NOT post-worthy:** + +- The "outlier" framing doesn't hold up under research +- Match rate is too high (finding is not distinctive) +- The comparison requires too much context to be funny +- Scores are clustered with no spread + +### Step 6: Write Output + +Write to `data/news-content/{date}/sdk-results/candidate-{index}.json`: + +```json +{ + "candidate_index": 0, + "headline": "Say goodbye to free ChatGPT with no ads", + "original_url": "https://www.axios.com/2026/02/09/chatgpt-ads-testing", + "skipped": false, + "operation": "rank", + "task_prompt": "Research how many days this AI chatbot remained ad-free...", + "session_url": "https://everyrow.io/sessions/...", + "dataset": { + "csv_path": "data/news-content/{date}/datasets/candidate-0/dataset.csv", + "source_url": "https://en.wikipedia.org/wiki/List_of_chatbots", + "source_name": "Wikipedia: List of chatbots", + "rows_total": 35, + "rows_processed": 10, + "columns_used": ["Chatbot", "Developer", "Released"] + }, + "output": { + "rows_returned": 10, + "results_path": "data/news-content/{date}/sdk-results/candidate-0.csv" + }, + "evaluation": { + "discrimination": 5, + "surprise": 5, + "clarity": 5, + "timeliness": 5, + "overall": 5, + "post_worthy": true, + "key_findings": [ + "Microsoft Copilot lasted only 9 days ad-free", + "Siri has been ad-free for 5,242 days (14+ years)", + "The spread is 9 to 5,242 days - a 582x difference" + ], + "suggested_headline": "How Long Can an AI Chatbot Resist Ads?", + "reasoning": "Perfect discrimination, deeply timely, and the Copilot finding is genuinely shocking." + } +} +``` + +### Step 7: Return Summary + +``` +SDK run complete for {date}, Candidate {index} + +Headline: "{headline}" +Operation: {rank/screen} +Post-worthy: Yes/No +Session URL: https://everyrow.io/sessions/... + +Key findings: +- {finding 1} +- {finding 2} + +Output file: data/news-content/{date}/sdk-results/candidate-{index}.json +``` + +## Critical Rules + +1. Process only the candidate index specified in your prompt +2. Row limits: **10 for rank, 50 for screen** +3. Copy `original_url` from candidates.json (never fabricate URLs) +4. Copy `csv_path`, `source_url`, `source_name` from dataset-finder output +5. Requires `EVERYROW_API_KEY` environment variable +6. If results are boring, try different criteria or operation type before giving up diff --git a/.claude/agents/seo-page-analyzer.md b/.claude/agents/seo-page-analyzer.md new file mode 100644 index 0000000..3718b2c --- /dev/null +++ b/.claude/agents/seo-page-analyzer.md @@ -0,0 +1,101 @@ +--- +name: seo-page-analyzer +description: Analyze a single page's SEO data and suggest improvements. Invoke with input file path, e.g., "Analyze page data/seo/runs/2026-01-23/pages/blog-dedup-guide.json" +tools: Read, Write +model: opus +permissionMode: bypassPermissions +--- + +# SEO Page Analyzer + +Analyze a single page's search performance and suggest a specific improvement. + +## Input + +You'll receive a task like: `"Analyze page data/seo/runs/{date}/pages/{slug}.json"` + +The input file contains: slug, URL, category, current metadata (title, description), `gsc_current` (metrics + queries), `gsc_previous`, `gsc_diff`, and `experiment_history`. + +## Analysis Process + +### 1. Understand the Current State + +- **Has GSC data**: Which queries drive impressions? Are they aligned with the title? +- **No GSC data (cold-start)**: What queries SHOULD drive traffic given the content? + +### 2. Review Experiment History + +The feedback loop. Check `experiment_history` for past changes: + +- **Worked (improved)?** That format/keyword strategy is a signal -- try similar approaches. +- **Failed (regressed)?** Try a DIFFERENT approach (different format, different keywords). +- **Pending (< 7 days)?** Only wait if showing clear positive signals. Otherwise, keep experimenting. + +### 3. Analyze the Diff + +If `gsc_diff` is not null: +- **queries_gained**: New queries -- do they suggest a different keyword focus? +- **queries_lost**: Important queries that dropped off? + +### 4. Generate Suggestion + +Suggest ONE specific change: + +| Change Type | When to Suggest | +| -------------------- | ------------------------------------------------------------------ | +| `title_change` | Title doesn't match top queries, CTR is low, or exploring new space | +| `description_change` | Description is weak or missing key query terms | +| `no_change` | Pending experiment with positive signals, or CTR > 2% | + +### 5. Choose Title Format + +Vary formats. Don't use colons every time. + +| Format | Example | +|--------|---------| +| **Keyword: Descriptor** | "CSV Dedup: Remove Duplicate Rows in Minutes" | +| **How to [verb]** | "How to Deduplicate CSV Files Without Losing Data" | +| **Direct imperative** | "Remove Duplicate Rows from Any CSV File" | +| **Question** | "Can You Automatically Deduplicate a 50,000 Row CSV?" | +| **[Topic] in [Year]** | "CSV Deduplication Tools in 2026" | + +If previous experiments used one format, try a different one next. + +## Decision Framework + +**Blog/content pages**: Default to suggesting a change. `no_change` only if pending experiment shows positive signals or CTR > 2%. + +**Docs/reference pages**: Conservative -- only suggest when high impressions (>1000) with low CTR (<0.5%) or clear query misalignment. + +**Cold-start pages (0 impressions)**: Always suggest an experiment. Try problem-focused, tool-focused, or outcome-focused titles. + +## Output + +Write to the same file path (replace input). Add a `suggestion` field: + +```json +{ + "suggestion": { + "change_type": "title_change|description_change|no_change", + "field": "title|description|null", + "current_value": "Deduplication Guide", + "proposed_value": "How to Deduplicate CSV Files Without Losing Data", + "format": "how-to", + "reasoning": "Top query 'how to deduplicate csv' has 300 impressions at position 7.1. Current title lacks 'csv'. Previous title change improved metrics.", + "target_queries": [ + { "query": "how to deduplicate csv", "impressions": 300, "position": 7.1 } + ], + "expected_impact": "Better query-title alignment should improve CTR from 0.7% toward 1-2%", + "confidence": "high|medium|low" + }, + "analyzed_at": "ISO timestamp" +} +``` + +Preserve all input fields. After writing, return a brief summary: + +``` +Page: {slug} | Impressions: {N} | Clicks: {N} | CTR: {N}% +Suggestion: {change_type} ({confidence}) +"{current_value}" -> "{proposed_value}" +``` diff --git a/.claude/skills/community-scanner/SKILL.md b/.claude/skills/community-scanner/SKILL.md new file mode 100644 index 0000000..09b876b --- /dev/null +++ b/.claude/skills/community-scanner/SKILL.md @@ -0,0 +1,311 @@ +--- +name: community-scanner +description: Scan Reddit communities for people with data problems, classify opportunities, propose responses, and create a PR with results. +--- + +# Community Scanner + +Scan subreddits for people struggling with data problems, classify opportunities with a structured rubric, draft responses for the best ones, and create a PR with a report. + +This is a simplified version of a production pipeline that scans 18 community sources daily. It demonstrates the pattern: Python does the mechanical fetching, Claude does the judgment. In production, each phase fans out to parallel subagents. Here, everything runs in a single process to keep it simple. + +## How It Works + +``` +Phase 1: Scan + └── Python scanner fetches posts from each subreddit + ↓ dedup against seen.txt, initial filtering +Phase 2: Classify + └── Answer 13 structured questions per post, assign 1-5 score + ↓ filter to score 4-5 +Phase 3: Propose + └── Select strategy, draft forum response for each high-scoring post + ↓ +Phase 4: Report + └── Markdown report with metrics, top opportunities, draft responses + ↓ +Phase 5: PR + └── Branch, commit, push, open PR +``` + +The output is a pull request. A human opens it, reads the report, and decides what to do. + +## Configuration + +Subreddits to scan (edit this list to target your communities): + +- r/dataengineering +- r/excel +- r/salesforce + +Products we're looking for opportunities to help with: + +- **Dedupe** - Semantic deduplication ("IBM" = "International Business Machines") +- **Merge** - Join datasets without common keys (entity resolution) +- **Rank** - Sort by qualitative criteria requiring judgment +- **Screen** - Filter/categorize by natural language conditions +- **Enrich** - Add columns via web research + +## Phase 1: Scan + +Run the Python scanner for each subreddit: + +```bash +python -m lib.scanner dataengineering +python -m lib.scanner excel +python -m lib.scanner salesforce +``` + +Each call outputs JSON to stdout with recent posts from that subreddit. Collect all results into a single list. If a subreddit fails (rate limited, unavailable), log the failure and continue with the others. + +### Deduplication + +Deduplicate against `data/seen.txt` - skip any URL that already appears in that file. Append new URLs to `data/seen.txt`. Create the file if it doesn't exist. + +After deduplication, if no new posts remain, skip to Phase 4 (Report) with an empty report. + +### Initial Filtering + +For each post, do a quick first-pass judgment: is this even potentially about a data problem? Skip posts that are clearly: + +- Job postings or career questions +- Product announcements or release notes +- Memes, jokes, or off-topic discussion +- Posts with no text body (link-only) + +Log skipped posts and why. Keep everything else for classification. + +## Phase 2: Classify + +For each remaining post, answer ALL of these questions. Be concise but specific. Use your judgment for implicit signals even without explicit statements. + +### Product Fit + +1. **canonical**: Is this a common problem others face daily, or bespoke/niche? A canonical problem means a response helps thousands of future readers, not just one person. +2. **best_product**: Which of our products is most relevant? (Dedupe, Merge, Rank, Screen, Enrich) +3. **data_format**: What format is the data? (database, CSV, spreadsheet, CRM, API, etc.) +4. **row_count**: How many rows? Quote if stated, "not specified" if unknown. + +### Technical Context + +5. **tools_tried**: What tools have they already tried? If they've tried fuzzy matching and it failed, they understand why their problem is hard. +6. **tried_llms**: Have they tried using LLMs for this? Have they tried ChatGPT or similar? A third of people now try LLMs before asking for help. + +### Data Characteristics + +7. **difficulty**: How hard is the task? (e.g., "minor name variations" vs "multilingual entity matching" vs "rank on subjective quality signals") +8. **data_provided**: Is sample data provided in the post? Sample data makes demo matching much easier. +9. **accuracy_expectation**: What accuracy level do they expect or imply? + +### Commercial Signals + +10. **importance**: Does this look important? Business process blocked? Evidence of willingness to pay? "Our admin is drowning" is a different signal than "just curious." +11. **person_importance**: Does the person look important? Do they identify themselves? Technical skills? A StackOverflow user with high reputation answering "there's no solution" makes the thread more visible. +12. **commenter_solutions**: What are commenters saying? If someone already solved it with a native platform feature - and the poster accepted the answer - there's no opportunity. +13. **freshness**: Is this recent enough to engage with? Old threads can still be valuable if the question was never properly answered. + +### Scoring + +Based on your answers, assign a score from 1 to 5. The main question: "Would a comment describing an LLM-based approach be useful for people reading this post?" + +| Score | Meaning | +|-------|---------| +| **1** | Not a fit - not a data problem, or trivially solvable with existing tools | +| **2** | Weak fit - data problem but exact matching / VLOOKUP / simple SQL would work | +| **3** | Possible fit - could benefit from semantic understanding, but might be too niche or platform-specific | +| **4** | Good fit - clear need for semantic matching or AI-powered processing, readers would benefit from knowing LLM approaches exist | +| **5** | Excellent fit - perfect use case, high visibility, a helpful response would get upvoted | + +**Important:** At no point should you write a Python script for classification. Read the posts and think about them. If you feel like you need to write code, you've misunderstood these instructions. + +Write all classifications to `data/classified/scan-.json`: + +```json +{ + "classified_at": "ISO timestamp", + "classifications": [ + { + "url": "...", + "title": "...", + "subreddit": "...", + "answers": { + "canonical": "...", + "best_product": "...", + "data_format": "...", + "row_count": "...", + "tools_tried": "...", + "tried_llms": "...", + "difficulty": "...", + "data_provided": "...", + "accuracy_expectation": "...", + "importance": "...", + "person_importance": "...", + "commenter_solutions": "...", + "freshness": "..." + }, + "score": 4, + "summary": "One-line explanation of why this score" + } + ], + "metrics": { + "total_classified": 25, + "score_distribution": {"1": 15, "2": 5, "3": 3, "4": 1, "5": 1} + } +} +``` + +Most posts will score 1-2. That's expected. A 2-3% hit rate is normal. + +## Phase 3: Propose + +For opportunities scoring 4 or 5, generate a response proposal. + +### Strategy Selection + +Choose a strategy based on the audience and context: + +| Strategy | Use When | +|----------|----------| +| `PROVE_CAPABILITY` | Default (~80%). Show a demo or example proving we solve the problem. | +| `SHOW_SDK_CODE` | Technical audience (StackOverflow, GitHub). Lead with a code snippet. | +| `EXPLAIN_APPROACH` | Audience wants to understand *why* LLMs beat fuzzy matching. | +| `SHOW_INTEGRATION` | User is building workflows (Make, Zapier, n8n). Show how results fit their pipeline. | +| `OFFER_HANDS_ON` | Recent post, engaged OP. Offer to run their actual data. | + +### Draft Response + +Write a draft forum reply for each opportunity. The draft should: + +1. **Acknowledge the problem** - Show you understand what they're dealing with +2. **Explain why existing approaches fall short** - Reference what they or commenters have tried +3. **Show the LLM-based approach** - Code snippet, demo reference, or explanation +4. **Be helpful on its own merits** - If someone stripped the product mention, would this answer still be useful? + +The draft is an anchor for the human reviewer, not the final post. Keep the tone helpful, not salesy. + +### Output + +Write proposals to `data/proposals/scan-.json`: + +```json +{ + "proposed_at": "ISO timestamp", + "proposals": [ + { + "url": "...", + "title": "...", + "score": 5, + "product": "Dedupe", + "strategy": "SHOW_SDK_CODE", + "reasoning": "Why this strategy for this opportunity", + "key_points": ["Point 1", "Point 2", "Point 3"], + "draft": "The actual forum response text..." + } + ] +} +``` + +If no opportunities scored 4-5, skip this phase. + +## Phase 4: Report + +Write a markdown report to `data/reports/scan-.md`: + +```markdown +# Community Scan Report - + +## Summary + +| Metric | Count | +|--------|-------| +| Subreddits scanned | N | +| Posts fetched | N | +| After dedup | N | +| After initial filter | N | +| Classified | N | +| Score 4-5 | N | +| Score 3 | N | +| Score 1-2 | N | +| Proposals generated | N | + +## Score Distribution + +| Score | Count | % | +|-------|-------|---| +| 5 | N | N% | +| 4 | N | N% | +| 3 | N | N% | +| 2 | N | N% | +| 1 | N | N% | + +## Top Opportunities (Score 4-5) + +### [Score X] (<subreddit>) +- **URL:** <url> +- **Product:** <best_product> +- **Strategy:** <strategy> +- **Summary:** <classifier summary> + +**Key Points:** +- <key points from proposal> + +<details> +<summary>Draft Response (click to expand)</summary> + +<draft response text> + +</details> + +## All Classifications + +| Score | Subreddit | Title | URL | +|-------|-----------|-------|-----| +| ... | ... | ... | ... | + +## Skipped Posts + +| Reason | Count | +|--------|-------| +| Already in seen.txt | N | +| No text body | N | +| Career/job posting | N | +| Product announcement | N | +``` + +If no posts were found or all scored 1-2, the report should still be created noting that. An empty run is still a data point. + +## Phase 5: Git & PR + +1. Create a branch named `scan/<date>` +2. Add and commit: + - `data/reports/scan-<date>.md` + - `data/classified/scan-<date>.json` + - `data/proposals/scan-<date>.json` (if it exists) + - `data/seen.txt` +3. Push and create a PR titled "Community scan: <date>" + +The PR is the output. A human opens it, reads the report, expands the draft responses, tweaks the wording, and decides what to post. GitHub is the UI. + +## Learnings + +After each run, check if any process improvements were discovered and update `data/learnings.md`. These aren't logs - they're instructions for future runs. Examples: + +``` +- "r/excel: most posts are formula syntax questions, not data problems. Consider removing." +- "Posts starting with 'What's your favorite...' are never opportunities. Skip during initial filter." +- "Competitor marketing posts account for ~50% of Reddit noise. Look for: product links in post body, account history of only posting about one tool." +- "r/dataengineering: 3% hit rate. Keep scanning." +``` + +Before scanning, read `data/learnings.md` if it exists and apply any relevant instructions (e.g., skip certain subreddits, adjust filtering). + +The learnings file is the pipeline's memory. Over time it accumulates knowledge about which sources work, what patterns to ignore, and what signals matter. This is one of the most valuable outputs of the whole system. + +## Error Recovery + +- **Subreddit fetch fails:** Log failure, continue with others +- **Classification fails for a post:** Log it, continue with remaining posts +- **Proposal generation fails:** Log it, still produce the report +- **Git/PR fails:** Report the error, don't lose the report file + +Never fail the entire skill due to individual component failures. Always produce a report. diff --git a/.claude/skills/daily-news-content/SKILL.md b/.claude/skills/daily-news-content/SKILL.md new file mode 100644 index 0000000..d6f75a6 --- /dev/null +++ b/.claude/skills/daily-news-content/SKILL.md @@ -0,0 +1,317 @@ +--- +name: daily-news-content +description: Generate news-driven data content using the everyrow SDK. Orchestrates news discovery, dataset finding, SDK execution, and graphics generation. Use when asked to "run the news pipeline", "generate daily content", or "find news content". +--- + +# Daily News Content Pipeline + +Orchestrates the complete pipeline: news -> datasets -> SDK -> graphics -> report. + +**Goal:** Find today's news stories with data angles, construct datasets, run everyrow rank/screen demos on them, and prepare visualizations for human review. + +This is a simplified version of a production pipeline that generates marketing content daily. It demonstrates multi-agent orchestration: a coordinator skill dispatches work to four specialized agents (news-finder, dataset-finder, sdk-runner, graphics-generator), each running as a subagent with its own tools and instructions. + +## How It Works + +``` +Phase 1: Find News + └── news-finder agent scans RSS feeds for stories with data angles + ↓ 10 candidates with headlines, URLs, and data angle descriptions +Phase 2: Discover Datasets + └── dataset-finder agents find relevant datasets (Wikipedia tables, government data) + ↓ CSV files with entities for each viable candidate +Phase 3: Run SDK + └── sdk-runner agents call everyrow rank/screen on the datasets + ↓ ranked/screened results with evaluation scores +Phase 4: Generate Graphics + └── graphics-generator agents create SVG visualizations + ↓ two SVG variations per post-worthy result +Phase 5: Report + PR + └── markdown report, branch, commit, push, PR +``` + +The output is a pull request containing a report, SDK results, and graphics. A human opens it, picks the best graphic variations, and publishes. + +## Editorial Criteria + +The pipeline prioritizes content that is **entertaining and surprising**, not just informative. + +**Prioritize stories that are both absurd AND major conventional news.** The best stories make people say "wait, really?" - they involve powerful entities doing ridiculous things, situations with inherent irony, massive scale surprises, or norms being violated by people who should know better. + +**Each agent contributes to the humor:** + +- **news-finder**: Selects stories that are absurd and newsworthy +- **dataset-finder**: Finds reference classes that enable surprising comparisons +- **sdk-runner**: Frames queries to surface the most absurd findings +- **graphics-generator**: Creates sardonic visualizations with editorial commentary + +## Requirements + +- `ANTHROPIC_API_KEY` - for Claude Code +- `EVERYROW_API_KEY` - for the everyrow SDK (get one at https://everyrow.io) +- `GH_TOKEN` - for creating pull requests +- `SSH_PRIVATE_KEY` - for git push + +## Before Running + +1. Note the current time (for runtime tracking) +2. Get today's date: `date +%Y-%m-%d` +3. Create output directory: `mkdir -p data/news-content/{date}` + +## Phase 1: News Discovery + +Spawn the news-finder agent to find today's top news stories with data angles. + +``` +Task (subagent_type: news-finder, max_turns: 30): + "Find news opportunities for today. Date: {date}. Output to data/news-content/{date}/candidates.json" +``` + +The agent scans RSS feeds (BBC Business, TechCrunch, Hacker News, and others) and selects the top 10 stories that have a "data angle" - a set of entities that can be ranked or screened using the everyrow SDK. + +### Expected Output + +- `data/news-content/{date}/candidates.json` - Top 10 candidates + +Each candidate includes: + +```json +{ + "headline": "Say goodbye to free ChatGPT with no ads", + "url": "https://www.axios.com/2026/02/09/chatgpt-ads-testing", + "source": "techcrunch_ai", + "published_at": "2026-02-09T14:00:00Z", + "description": "OpenAI begins testing ads in ChatGPT free tier...", + "data_angle": { + "product": "rank", + "entities": "AI chatbots/assistants", + "criteria": "How long each chatbot remained ad-free from launch", + "dataset_description": "List of major AI chatbots with launch dates", + "viability": 5, + "reasoning": "Clear reference class, enables surprising comparisons" + } +} +``` + +### Verify Output + +```bash +cat data/news-content/{date}/candidates.json | python3 -m json.tool | head -30 +``` + +Check that the file exists, is valid JSON, and has 10 candidates each with a `data_angle`. + +If news-finder fails, stop the pipeline and write a minimal report explaining the failure. + +## Phase 2: Dataset Discovery + +For each candidate with `viability >= 3`, spawn dataset-finder agents in parallel. + +### Filter Candidates + +Read `candidates.json` and filter to viable ones: + +```python +viable = [c for c in candidates if c["data_angle"]["viability"] >= 3] +``` + +### Run Dataset Finders (Batches of 3) + +``` +Task (subagent_type: dataset-finder, max_turns: 20): + "Find dataset for candidate 0 in data/news-content/{date}" +Task (subagent_type: dataset-finder, max_turns: 20): + "Find dataset for candidate 1 in data/news-content/{date}" +Task (subagent_type: dataset-finder, max_turns: 20): + "Find dataset for candidate 2 in data/news-content/{date}" +``` + +Wait for each batch, then launch the next. Continue until all viable candidates are processed. + +### Expected Output + +For each candidate: + +- `data/news-content/{date}/datasets/candidate-{index}.json` - metadata (source URL, entity type, row count) +- `data/news-content/{date}/datasets/candidate-{index}/dataset.csv` - the actual CSV + +### Handling Failures + +If a dataset-finder fails or times out: + +- Log the failure +- Mark status as "failed" in tracking +- Continue with other candidates +- Do NOT retry + +## Phase 3: SDK Execution + +For each candidate with a found dataset, run sdk-runner agents in parallel batches of 5. + +``` +Task (subagent_type: sdk-runner, max_turns: 25): + "Run SDK for candidate 0 in data/news-content/{date}" +``` + +The sdk-runner decides whether to use rank or screen, crafts a task prompt, writes and executes a Python script that calls the everyrow SDK, and evaluates the results. + +### Expected Output + +- `data/news-content/{date}/sdk-results/candidate-{index}.json` - evaluation and metadata +- `data/news-content/{date}/sdk-results/candidate-{index}.csv` - raw SDK output + +Each result includes a self-evaluation: + +```json +{ + "evaluation": { + "discrimination": 5, + "surprise": 5, + "clarity": 5, + "timeliness": 5, + "overall": 5, + "post_worthy": true, + "key_findings": [ + "Microsoft Copilot lasted only 9 days ad-free", + "Siri has been ad-free for 5,242 days (14+ years)" + ], + "suggested_headline": "How Long Can an AI Chatbot Resist Ads?" + } +} +``` + +**Self-evaluation criteria:** + +| Criterion | What It Measures | +|-----------|-----------------| +| **Discrimination** | Meaningful spread in scores? (30-95 good, all 70-80 bad) | +| **Surprise** | Unexpected results that make people say "wait, really?" | +| **Clarity** | Easy to explain in a tweet or headline? | +| **Timeliness** | Connects meaningfully to today's news story? | + +### Handling Failures + +- If sdk-runner fails, log it and continue with others +- If results are boring (low discrimination, no surprise), mark as not post-worthy + +## Phase 4: Graphics Generation + +For each candidate with `post_worthy: true`, spawn graphics-generator agents. + +``` +Task (subagent_type: graphics-generator, max_turns: 30): + "Generate graphics for data/news-content/{date}/sdk-results/candidate-0.json" +``` + +The graphics-generator creates **two meaningfully different SVG variations** for each result. A human reviewer picks the better one. + +### Expected Output + +- `data/news-content/{date}/graphics/{slug}-v1.svg` +- `data/news-content/{date}/graphics/{slug}-v2.svg` + +If no post-worthy results exist, skip this phase. + +### Convert SVGs to PNGs + +After all graphics are generated: + +```bash +for svg in data/news-content/{date}/graphics/*.svg; do + rsvg-convert -w 1200 "$svg" -o "${svg%.svg}.png" +done +``` + +## Phase 5: Report + PR + +Write a markdown report to `data/news-content/{date}/report.md` summarizing the pipeline run: + +```markdown +# Daily News Content Report - {date} + +## Summary + +| Stage | Input | Output | Success | +|-----------------|-----------------|-------------|-----------| +| News Discovery | ~150 headlines | 10 candidates | ok | +| Dataset Finding | {N} viable | {N} found | {N}/{M} | +| SDK Execution | {N} with data | {N} run | {N}/{M} | +| Graphics | {N} post-worthy | {N} pairs | {N}/{M} | + +**Post-worthy results: {N}** + +## Post-Worthy Results + +### 1. {headline} + +**News:** [{headline}]({url}) +**Operation:** rank +**Session:** {everyrow session URL} + +**Key Findings:** +- {finding 1} +- {finding 2} + +**Graphics:** `graphics/{slug}-v1.svg`, `graphics/{slug}-v2.svg` + +## Not Post-Worthy + +| # | Headline | Operation | Reason | +|---|----------|-----------|--------| + +## Skipped or Failed + +| # | Headline | Stage | Reason | +|---|----------|-------|--------| +``` + +### Create Branch and PR + +```bash +BRANCH="news-content/{date}" +git checkout -b "$BRANCH" +git add data/news-content/{date}/ +git commit -m "Daily news content: {date}" +git push origin "$BRANCH" +gh pr create \ + --title "Daily news content: {date}" \ + --body "$(cat data/news-content/{date}/report.md)" +``` + +## File Structure + +``` +data/news-content/{date}/ + candidates.json # news-finder output (10 candidates) + datasets/ + candidate-0.json # dataset metadata + candidate-0/ + dataset.csv # the actual CSV (max 1000 rows) + sdk-results/ + candidate-0.json # evaluation + session URL + candidate-0.csv # raw SDK output + graphics/ + {slug}-v1.svg # graphic variation 1 + {slug}-v1.png # PNG export (1200px wide) + {slug}-v2.svg # graphic variation 2 + {slug}-v2.png # PNG export + report.md # pipeline summary +``` + +## Error Recovery + +- **news-finder fails:** Stop pipeline, write minimal report +- **dataset-finder fails for one candidate:** Continue with others +- **sdk-runner fails for one candidate:** Continue with others +- **graphics-generator fails:** Candidate still post-worthy, note missing graphics +- **All datasets fail:** Write report showing all failures +- **All SDK runs fail:** Write report, suggest checking EVERYROW_API_KEY + +Never fail silently. Always produce a report explaining what happened. + +## Customization + +- **RSS feeds to scan:** Edit `.claude/agents/news-finder.md` to change the feed list +- **Evaluation criteria:** Adjust the self-evaluation rubric in `.claude/agents/sdk-runner.md` +- **Visualization styles:** Add styles to the menu in `.claude/agents/graphics-generator.md` +- **Data sources:** Extend the routing table in `.claude/agents/dataset-finder.md` diff --git a/.claude/skills/seo-pipeline/SKILL.md b/.claude/skills/seo-pipeline/SKILL.md new file mode 100644 index 0000000..1f3aa46 --- /dev/null +++ b/.claude/skills/seo-pipeline/SKILL.md @@ -0,0 +1,207 @@ +--- +name: seo-pipeline +description: Run the SEO optimization pipeline. Collects Google Search Console data, analyzes pages with LLM judgment, proposes improvements, and creates a PR. Use when asked to "run seo", "seo pipeline", or "optimize seo". +--- + +# SEO Pipeline + +Automated SEO optimization pipeline. Collects Google Search Console data, runs per-page analysis with an LLM agent, and proposes title/description improvements as a pull request. The human reviews the PR and applies changes manually. + +**Key principle: propose, don't implement.** The pipeline never modifies your site files directly. It writes a report with proposed changes. You read the PR and decide what to apply. + +## MCP Configuration + +This skill requires the Google Search Console MCP server. Add to `.mcp.json`: + +```json +{ + "mcpServers": { + "google-search-console": { + "command": "uvx", + "args": ["mcp-server-gsc"], + "env": { + "GSC_CREDENTIALS_PATH": "/path/to/your/service-account-credentials.json" + } + } + } +} +``` + +You'll need Google Search Console API credentials. See https://github.com/AminForou/mcp-gsc for setup. + +## Configuration + +- **Domain**: `sc-domain:example.com` (replace with your GSC property) +- **Content directory**: Where your markdown/MDX files live (edit `CONTENT_DIR` in `lib/seo_prepare.py`) +- **Page categories**: Which pages are blog posts, docs, landing pages (edit `PAGE_CATEGORIES` in `lib/seo_prepare.py`) + +## Architecture + +``` +Phase 1: Collect GSC Data (MCP) -> Phase 2: Prepare Inputs (Python) + -> Phase 3: Analyze Pages (agents) -> Phase 4: Record Changes + -> Phase 5: Report + PR +``` + +--- + +## Phase 1: Collect GSC Data + +### Date Range + +Last 7 days inclusive. Running on 2026-01-23 means start=2026-01-17, end=2026-01-23. + +### 1a. Create Run Directory + +```bash +mkdir -p data/seo/runs/{date}/raw +``` + +### 1b. All Pages Performance + +``` +mcp__google-search-console__search_analytics: + siteUrl: "sc-domain:example.com" + startDate: "{start}" + endDate: "{end}" + dimensions: "page" + rowLimit: 500 +``` + +Write to `data/seo/runs/{date}/raw/all-pages.json` + +### 1c. Query+Page Mapping (the key data) + +``` +mcp__google-search-console__search_analytics: + siteUrl: "sc-domain:example.com" + startDate: "{start}" + endDate: "{end}" + dimensions: "query,page" + rowLimit: 25000 +``` + +Write to `data/seo/runs/{date}/raw/page-queries.json` + +### 1d. Record Metadata + +Write `data/seo/runs/{date}/raw/metadata.json` with `collected_at`, `date_range`, and `domain`. + +--- + +## Phase 2: Prepare Per-Page Inputs + +```bash +python -m lib.seo_prepare --date {date} +``` + +Produces one JSON file per page at `data/seo/runs/{date}/pages/{slug}.json` containing: current GSC metrics, search queries, previous run's metrics, computed diff, and experiment history. + +The experiment history is the feedback loop. When you implement a proposed title change, the next run measures whether clicks improved, regressed, or stayed flat. This feeds back into the analyzer. + +--- + +## Phase 3: Analyze All Pages + +Run the `seo-page-analyzer` agent on every page, including pages with 0 impressions. + +### Batching + +Spawn agents in parallel, up to 5 at a time: + +``` +Task (subagent_type: seo-page-analyzer): "Analyze page data/seo/runs/{date}/pages/blog-dedup-guide.json" +Task (subagent_type: seo-page-analyzer): "Analyze page data/seo/runs/{date}/pages/docs-getting-started.json" +... (up to 5) +``` + +Wait for each batch before starting the next. Continue until all pages are analyzed. + +--- + +## Phase 4: Record Proposed Changes + +Do NOT modify site files. Record proposed changes to `data/seo/changes/{date}.json`: + +```json +{ + "recorded_at": "ISO timestamp", + "run_id": "run-{date}", + "changes": [ + { + "slug": "blog-dedup-guide", + "field": "title", + "old_value": "Deduplication Guide", + "new_value": "How to Deduplicate CSV Data: A Practical Guide", + "reasoning": "Top query has 450 impressions at position 8.2 but only 3 clicks.", + "data_at_change": { "clicks": 3, "impressions": 450, "ctr": 0.007, "position": 8.2 } + } + ] +} +``` + +The `data_at_change` snapshot is critical -- the next run compares current metrics against it to determine experiment outcome. + +--- + +## Phase 5: Report + PR + +Write `data/seo/reports/{date}-seo-report.md`: + +```markdown +# SEO Report - {date} + +GSC data: {start} to {end} + +## All Pages + +| Slug | Impr (D) | Clicks (D) | CTR (D) | Position (D) | +| ---- | -------- | ---------- | ------- | ------------- | +| blog-dedup-guide | 450 (+50) | 3 (+1) | 0.7% (+0.1%) | 8.2 (-0.3) | +| ... | ... | ... | ... | ... | + +## Proposed Changes + +Changes below are proposals only - apply manually to your site. + +**blog-dedup-guide** - title +- Was: "Deduplication Guide" +- Proposed: "How to Deduplicate CSV Data: A Practical Guide" +- Why: Top query "how to deduplicate csv" has 450 impressions at position 8.2 but only 3 clicks. +``` + +Order by impressions descending. Pages with 0 impressions at the bottom. + +Create branch `seo/{date}`, commit all run data, push, create PR titled "SEO report: {date}". + +--- + +## The Experiment History Feedback Loop + +1. **Run N**: Pipeline proposes a title change. Human applies it. +2. **Run N+1**: `seo_prepare.py` loads the change log, compares current GSC metrics against `data_at_change`, computes outcome: + - **improved**: CTR up >20% or position improved >1 rank + - **regressed**: CTR down >20% or position worsened >1 rank + - **neutral**: No significant change + - **pending**: Less than 7 days since change +3. The `seo-page-analyzer` sees this history and adjusts: repeat what worked, try different approaches when things regressed, wait when pending experiments show positive signals. + +--- + +## Error Recovery + +- **GSC API fails**: Log failure in report, don't fail the entire run. +- **Agent fails on a page**: Log it, continue with remaining pages. +- **No suggestions**: Report "no changes proposed this week." + +Never fail the entire skill. Always produce a report. + +--- + +## Data Flow + +``` +GSC MCP -> data/seo/runs/{date}/raw/ -> Python (lib/seo_prepare.py) + -> data/seo/runs/{date}/pages/{slug}.json -> Agents (seo-page-analyzer x N) + -> data/seo/changes/{date}.json -> data/seo/reports/{date}-seo-report.md -> PR +``` diff --git a/README.md b/README.md index 6ac0c49..5c7e68f 100644 --- a/README.md +++ b/README.md @@ -6,18 +6,59 @@ This repo accompanies the blog series: **Running Claude Code as a Production Run ## What's Here +Four example skills that share the same Dockerfile, entrypoint, and Helm chart: + +### 1. `add-numbers` — Hello World + +A trivial skill that computes 2 + 3, writes the result to a file, and creates a PR. Demonstrates the basic pattern: Python does mechanics, Claude does orchestration. + +### 2. `community-scanner` — Real-World Example + +A simplified version of the community scanning pipeline from [Post 3](https://everyrow.io/blog/marketing-pipeline-using-claude-code). Scans a few subreddits for people with data problems, classifies opportunities with a 5-question rubric, and creates a PR with a report. + +This demonstrates the production pattern at small scale: Python fetches posts, Claude reads them and decides which ones matter. + +### 3. `seo-pipeline` — SEO Optimization + +A simplified version of the SEO optimization pipeline from [Post 4](https://futuresearch.ai/blog/self-optimizing-seo-pipeline). Collects Google Search Console data via MCP, analyzes every page with an LLM agent, and proposes title/description improvements as a PR. + +This demonstrates the feedback loop pattern: each run measures the outcome of previous experiments (did that title change improve CTR?), and the analyzer uses that history to make better suggestions over time. + +### 4. `daily-news-content` — Multi-Agent Content Pipeline + +A simplified version of the news content pipeline from [Post 5](https://everyrow.io/blog/dogfooding-your-sdk-with-claude-code). Scans RSS feeds for news stories with data angles, finds datasets, runs the everyrow SDK to rank/screen entities, generates SVG graphics, and creates a PR with results. + +This demonstrates multi-agent orchestration: a coordinator skill dispatches work to four specialized agents (news-finder, dataset-finder, sdk-runner, graphics-generator), each with its own tools and instructions. It also demonstrates "dogfooding" - using your own product's SDK inside your automation pipeline. + ``` -.claude/skills/add-numbers/SKILL.md # Example skill: computes 2 + 3, commits result, creates PR -lib/add_numbers.py # Python utility called by the skill -Dockerfile # Multi-stage build with Python + Node + Claude CLI +.claude/ + skills/ + add-numbers/SKILL.md # Trivial example: compute and PR + community-scanner/SKILL.md # Scan → Classify → Propose → Report → PR + seo-pipeline/SKILL.md # Collect GSC → Analyze → Propose → Report → PR + daily-news-content/SKILL.md # News → Datasets → SDK → Graphics → PR + agents/ + classifier.md # 13-question rubric, 1-5 scoring + proposer.md # Strategy selection, draft forum responses + seo-page-analyzer.md # Per-page SEO analysis with experiment tracking + news-finder.md # RSS scanning, data angle identification + dataset-finder.md # Wikipedia/public data sourcing + sdk-runner.md # everyrow rank/screen execution + graphics-generator.md # SVG visualization with iterative refinement +lib/ + add_numbers.py # Python utility for add-numbers + scanner.py # Reddit JSON API fetcher (with optional comment enrichment) + seo_prepare.py # GSC data processor: raw JSON → per-page input files + news_feeds.py # RSS feed fetcher (stdlib only, no API keys) +Dockerfile # Multi-stage build with Python + Node + Claude CLI deploy/ - entrypoint.sh # Runs Claude Code with jq log filtering + timeout safety net - cronjob.yaml # Standalone K8s CronJob manifest - chart/ # Helm chart (for managing multiple skills) + entrypoint.sh # Runs Claude Code with jq log filtering + timeout safety net + cronjob.yaml # Standalone K8s CronJob manifest + chart/ # Helm chart (for managing multiple skills) Chart.yaml values.yaml templates/cronjob.yaml -pyproject.toml # Python project (add your dependencies here) +pyproject.toml # Python project (add your dependencies here) ``` ## Quick Start @@ -30,6 +71,8 @@ docker build -t claudie:latest . ### 2. Test locally +**Add numbers (hello world):** + ```bash docker run \ -e ANTHROPIC_API_KEY="sk-..." \ @@ -39,6 +82,46 @@ docker run \ claudie:latest ``` +**Community scanner:** + +```bash +docker run \ + -e ANTHROPIC_API_KEY="sk-..." \ + -e SKILL_NAME="community-scanner" \ + -e SSH_PRIVATE_KEY="$(cat ~/.ssh/id_ed25519)" \ + -e GH_TOKEN="ghp_..." \ + claudie:latest +``` + +**SEO pipeline:** + +```bash +# Requires Google Search Console API credentials. +# See: https://github.com/AminForou/mcp-gsc for setup. +docker run \ + -e ANTHROPIC_API_KEY="sk-..." \ + -e SKILL_NAME="seo-pipeline" \ + -e SSH_PRIVATE_KEY="$(cat ~/.ssh/id_ed25519)" \ + -e GH_TOKEN="ghp_..." \ + -v /path/to/gsc-credentials.json:/gsc-credentials.json \ + -e GSC_CREDENTIALS_PATH="/gsc-credentials.json" \ + claudie:latest +``` + +**Daily news content:** + +```bash +# Requires an everyrow API key for the SDK. +# Get one at: https://everyrow.io +docker run \ + -e ANTHROPIC_API_KEY="sk-..." \ + -e SKILL_NAME="daily-news-content" \ + -e SSH_PRIVATE_KEY="$(cat ~/.ssh/id_ed25519)" \ + -e GH_TOKEN="ghp_..." \ + -e EVERYROW_API_KEY="ek_..." \ + claudie:latest +``` + ### 3. Deploy to Kubernetes **Option A: Plain CronJob** @@ -75,6 +158,81 @@ jobs: schedule: "0 9 * * 1-5" ``` +## Customizing the Community Scanner + +Edit `.claude/skills/community-scanner/SKILL.md` to change: + +- **Which subreddits to scan** — replace the list in the Configuration section +- **Classification criteria** — adjust the questions in Phase 2 to match what you're looking for +- **Report format** — modify Phase 3 to include whatever you need + +The `lib/scanner.py` fetches posts using Reddit's public JSON API (no authentication needed). For other platforms, you'd add similar fetchers - the pattern is the same: Python handles the API mechanics, Claude handles the judgment. + +## Customizing the SEO Pipeline + +Edit `.claude/skills/seo-pipeline/SKILL.md` to change: + +- **Which domain to track** — replace `sc-domain:example.com` with your GSC property +- **Batch size** — adjust the number of parallel agents in Phase 3 (default: 5) +- **Report format** — modify the Phase 5 template to match what you want in the PR + +Edit `.claude/agents/seo-page-analyzer.md` to change: + +- **Decision framework** — which page categories get aggressive experiments vs. conservative suggestions +- **Title format preferences** — which title formats to try and how to rotate them +- **Confidence thresholds** — when to suggest changes vs. recommend waiting + +Edit `lib/seo_prepare.py` to change: + +- **`DOMAIN`** — your site's domain (must match GSC URLs) +- **`CONTENT_DIR`** — where your markdown/MDX content files live +- **`PAGE_CATEGORIES`** — map slugs to categories (blog, docs, landing) for the analyzer + +The pipeline uses an MCP server ([mcp-server-gsc](https://github.com/AminForou/mcp-gsc)) to fetch Search Console data. You'll need a Google Cloud service account with Search Console API access. + +`data/seo/example/` contains three annotated sample files showing what the pipeline produces: + +- `pages/blog-dedup-guide.json` — a page with two rounds of experiment history, both improved +- `pages/blog-lead-scoring.json` — a page where a title change regressed (the history catches it and proposes a revert) +- `pages/docs-getting-started.json` — a cold-start page with zero impressions +- `changes/2026-01-22.json` — the proposals the analyzer wrote for those three pages, with reasoning + +The `blog-lead-scoring.json` example is the most instructive: removing a competitor name from the title caused related queries to disappear entirely. The next run read the `experiment_history`, identified the regression, and proposed reverting. That feedback loop is the core of the pipeline. + +## Customizing the Daily News Content Pipeline + +Edit `.claude/skills/daily-news-content/SKILL.md` to change: + +- **Pipeline phases** — skip graphics generation, adjust batch sizes, change timeout policies +- **Editorial criteria** — what makes a story worth pursuing (currently: absurd + newsworthy) +- **Report format** — modify the Phase 5 template + +Edit `.claude/agents/news-finder.md` to change: + +- **RSS feeds to scan** — add/remove feeds from the source list (all must be public, no API keys) +- **Selection criteria** — what makes a good "data angle" (entity types, viability scoring) + +Edit `.claude/agents/dataset-finder.md` to change: + +- **Data sources** — extend the routing table with new sources beyond Wikipedia +- **Entity types** — add new entity type -> source mappings + +Edit `.claude/agents/sdk-runner.md` to change: + +- **Evaluation criteria** — adjust discrimination/surprise/clarity/timeliness scoring +- **Row limits** — change 10 (rank) / 50 (screen) defaults +- **Post-worthy threshold** — what overall score counts as publishable + +Edit `.claude/agents/graphics-generator.md` to change: + +- **Visualization styles** — add new styles to the menu or change which is the default +- **Color palette** — replace the indigo brand colors with your own +- **Refinement budget** — increase/decrease the 5-attempt limit + +The `lib/news_feeds.py` fetches RSS feeds using only Python stdlib (no API keys needed). To add feeds, edit the `FEEDS` dictionary at the top of the file. + +**Note:** The SDK execution phase requires an `EVERYROW_API_KEY`. Get one at https://everyrow.io. See the SDK docs for [rank](https://everyrow.io/docs/reference/RANK) and [screen](https://everyrow.io/docs/reference/SCREEN). + ## Key Details - **Claude CLI needs Node.js** — hence the `python-nodejs` base image diff --git a/data/seo/example/changes/2026-01-22.json b/data/seo/example/changes/2026-01-22.json new file mode 100644 index 0000000..78c68a1 --- /dev/null +++ b/data/seo/example/changes/2026-01-22.json @@ -0,0 +1,48 @@ +{ + "recorded_at": "2026-01-22T11:34:07Z", + "run_id": "run-2026-01-22", + "changes": [ + { + "slug": "blog-dedup-guide", + "field": "title", + "old_value": "Deduplication Guide: Removing Duplicate Records from Your Data", + "new_value": "How to Deduplicate CSV Data with Fuzzy Matching", + "reasoning": "Top query 'how to deduplicate csv' has 210 impressions at position 7.1 but only 2 clicks. Current title doesn't include 'csv' or 'fuzzy matching', which are the terms searchers use. Previous title change improved position from 12 to 8 - same pattern of front-loading the query term should improve CTR further.", + "data_at_change": { + "clicks": 4, + "impressions": 580, + "ctr": 0.007, + "position": 8.4 + }, + "confidence": "medium" + }, + { + "slug": "blog-lead-scoring", + "field": "title", + "old_value": "How to Score Leads Without a CRM", + "new_value": "AI Lead Scoring Without Clay: Rank 500 Prospects for $28", + "reasoning": "Last title change (2025-12-28) was a regression: removing 'Clay' from the title caused clay-related queries to disappear entirely - 39 impressions and 3 clicks lost. Reverting to the previous title format and adding specific numbers (500 prospects, $28) to improve CTR.", + "data_at_change": { + "clicks": 0, + "impressions": 34, + "ctr": 0.0, + "position": 9.2 + }, + "confidence": "high" + }, + { + "slug": "docs-getting-started", + "field": "title", + "old_value": "Getting Started", + "new_value": "Getting Started: Install and Run Your First Deduplication", + "reasoning": "Cold-start page with zero impressions. Current title 'Getting Started' is too generic to rank for anything. Adding the specific task (install + first run) and the product outcome (deduplication) gives the page a fighting chance for intent-matched queries.", + "data_at_change": { + "clicks": 0, + "impressions": 0, + "ctr": 0.0, + "position": null + }, + "confidence": "low" + } + ] +} diff --git a/data/seo/example/pages/blog-dedup-guide.json b/data/seo/example/pages/blog-dedup-guide.json new file mode 100644 index 0000000..7d45b39 --- /dev/null +++ b/data/seo/example/pages/blog-dedup-guide.json @@ -0,0 +1,84 @@ +{ + "slug": "blog-dedup-guide", + "url": "https://example.com/blog-dedup-guide/", + "category": "blog", + "current_metadata": { + "title": "Deduplication Guide: Removing Duplicate Records from Your Data", + "description": "Learn how to identify and remove duplicate records from your datasets using modern tools." + }, + "gsc_current": { + "date_range": { "start": "2026-01-16", "end": "2026-01-22" }, + "clicks": 4, + "impressions": 580, + "ctr": 0.007, + "position": 8.4, + "in_gsc": true, + "queries": [ + { "query": "how to deduplicate csv", "impressions": 210, "clicks": 2, "ctr": 0.01, "position": 7.1 }, + { "query": "remove duplicate rows python", "impressions": 145, "clicks": 1, "ctr": 0.007, "position": 9.2 }, + { "query": "deduplication tutorial", "impressions": 98, "clicks": 1, "ctr": 0.01, "position": 8.8 }, + { "query": "fuzzy dedup pandas", "impressions": 72, "clicks": 0, "ctr": 0.0, "position": 11.3 }, + { "query": "deduplicate company names", "impressions": 55, "clicks": 0, "ctr": 0.0, "position": 14.7 } + ] + }, + "gsc_previous": { + "clicks": 3, + "impressions": 510, + "ctr": 0.006, + "position": 9.1 + }, + "gsc_diff": { + "clicks_delta": 1, + "impressions_delta": 70, + "ctr_delta": 0.001, + "position_delta": 0.7, + "queries_gained": [ + { "query": "fuzzy dedup pandas", "impressions": 72, "clicks": 0, "ctr": 0.0, "position": 11.3 } + ], + "queries_lost": [ + { "query": "data cleaning duplicates", "impressions": 44, "clicks": 0, "ctr": 0.0, "position": 12.1 } + ] + }, + "experiment_history": [ + { + "experiment_date": "2025-11-20", + "days_since": 63, + "change_type": "title", + "old_value": "A Guide to Data Deduplication", + "new_value": "Deduplication Guide: Removing Duplicate Records from Your Data", + "data_before": { + "clicks": 1, + "impressions": 180, + "ctr": 0.006, + "position": 12.3 + }, + "data_after": { + "clicks": 4, + "impressions": 580, + "ctr": 0.007, + "position": 8.4 + }, + "outcome": "improved" + }, + { + "experiment_date": "2025-10-05", + "days_since": 109, + "change_type": "description", + "old_value": "A comprehensive overview of data deduplication strategies.", + "new_value": "Learn how to identify and remove duplicate records from your datasets using modern tools.", + "data_before": { + "clicks": 0, + "impressions": 95, + "ctr": 0.0, + "position": 15.7 + }, + "data_after": { + "clicks": 1, + "impressions": 180, + "ctr": 0.006, + "position": 12.3 + }, + "outcome": "improved" + } + ] +} diff --git a/data/seo/example/pages/blog-lead-scoring.json b/data/seo/example/pages/blog-lead-scoring.json new file mode 100644 index 0000000..3742bb6 --- /dev/null +++ b/data/seo/example/pages/blog-lead-scoring.json @@ -0,0 +1,61 @@ +{ + "slug": "blog-lead-scoring", + "url": "https://example.com/blog-lead-scoring/", + "category": "blog", + "current_metadata": { + "title": "How to Score Leads Without a CRM", + "description": "Score and prioritize leads from a spreadsheet using AI, even when your data is incomplete." + }, + "gsc_current": { + "date_range": { "start": "2026-01-16", "end": "2026-01-22" }, + "clicks": 0, + "impressions": 34, + "ctr": 0.0, + "position": 9.2, + "in_gsc": true, + "queries": [ + { "query": "lead scoring without salesforce", "impressions": 18, "clicks": 0, "ctr": 0.0, "position": 8.1 }, + { "query": "ai lead scoring spreadsheet", "impressions": 10, "clicks": 0, "ctr": 0.0, "position": 11.4 }, + { "query": "score leads from csv", "impressions": 6, "clicks": 0, "ctr": 0.0, "position": 9.7 } + ] + }, + "gsc_previous": { + "clicks": 3, + "impressions": 89, + "ctr": 0.034, + "position": 7.8 + }, + "gsc_diff": { + "clicks_delta": -3, + "impressions_delta": -55, + "ctr_delta": -0.034, + "position_delta": -1.4, + "queries_gained": [], + "queries_lost": [ + { "query": "clay lead scoring alternative", "impressions": 39, "clicks": 3, "ctr": 0.077, "position": 5.2 }, + { "query": "clay.com competitor", "impressions": 12, "clicks": 0, "ctr": 0.0, "position": 8.9 } + ] + }, + "experiment_history": [ + { + "experiment_date": "2025-12-28", + "days_since": 25, + "change_type": "title", + "old_value": "AI Lead Scoring Without Clay: Rank 500 Prospects for $28", + "new_value": "How to Score Leads Without a CRM", + "data_before": { + "clicks": 3, + "impressions": 89, + "ctr": 0.034, + "position": 7.8 + }, + "data_after": { + "clicks": 0, + "impressions": 34, + "ctr": 0.0, + "position": 9.2 + }, + "outcome": "regressed" + } + ] +} diff --git a/data/seo/example/pages/docs-getting-started.json b/data/seo/example/pages/docs-getting-started.json new file mode 100644 index 0000000..a671335 --- /dev/null +++ b/data/seo/example/pages/docs-getting-started.json @@ -0,0 +1,21 @@ +{ + "slug": "docs-getting-started", + "url": "https://example.com/docs-getting-started/", + "category": "docs", + "current_metadata": { + "title": "Getting Started", + "description": "" + }, + "gsc_current": { + "date_range": { "start": "2026-01-16", "end": "2026-01-22" }, + "clicks": 0, + "impressions": 0, + "ctr": 0.0, + "position": null, + "in_gsc": false, + "queries": [] + }, + "gsc_previous": null, + "gsc_diff": null, + "experiment_history": [] +} diff --git a/deploy/chart/values.yaml b/deploy/chart/values.yaml index c772adb..1e6b09a 100644 --- a/deploy/chart/values.yaml +++ b/deploy/chart/values.yaml @@ -24,3 +24,15 @@ jobs: - name: add-numbers skillName: add-numbers schedule: "0 8 * * 1-5" # Weekdays at 8am UTC + + - name: community-scanner + skillName: community-scanner + schedule: "0 9 * * 1-5" # Weekdays at 9am UTC + + - name: seo-pipeline + skillName: seo-pipeline + schedule: "0 10 * * 1,3,5" # Mon/Wed/Fri at 10am UTC + + - name: daily-news-content + skillName: daily-news-content + schedule: "0 14 * * 1-5" # Weekdays at 2pm UTC diff --git a/lib/news_feeds.py b/lib/news_feeds.py new file mode 100644 index 0000000..e061ff2 --- /dev/null +++ b/lib/news_feeds.py @@ -0,0 +1,234 @@ +"""RSS feed fetcher for news discovery. + +Usage: + python -m lib.news_feeds + python -m lib.news_feeds --output-dir /tmp/news/ + python -m lib.news_feeds --feeds bbc_business techcrunch_ai + +Fetches headlines from public RSS feeds and outputs them as JSON files. +No authentication required - uses standard RSS/Atom feeds. + +Uses only stdlib (urllib, xml.etree.ElementTree, json). +""" + +import argparse +import json +import os +import sys +import urllib.error +import urllib.request +from datetime import datetime, timezone +from xml.etree import ElementTree + +USER_AGENT = "news-content-pipeline/0.1 (example bot; github.com/futuresearch/example-cc-cronjob)" + +# Public RSS feeds - all freely accessible, no API keys needed +FEEDS = { + "bbc_business": { + "url": "http://feeds.bbci.co.uk/news/business/rss.xml", + "name": "BBC Business", + }, + "techcrunch_ai": { + "url": "https://techcrunch.com/category/artificial-intelligence/feed/", + "name": "TechCrunch AI", + }, + "hn_frontpage": { + "url": "https://hnrss.org/frontpage", + "name": "Hacker News Frontpage", + }, + "ars_technica": { + "url": "https://feeds.arstechnica.com/arstechnica/index", + "name": "Ars Technica", + }, + "verge_ai": { + "url": "https://www.theverge.com/rss/ai-artificial-intelligence/index.xml", + "name": "The Verge AI", + }, + "mit_tech_review": { + "url": "https://www.technologyreview.com/feed/", + "name": "MIT Technology Review", + }, +} + +# Common XML namespaces in RSS/Atom feeds +NAMESPACES = { + "atom": "http://www.w3.org/2005/Atom", + "dc": "http://purl.org/dc/elements/1.1/", + "content": "http://purl.org/rss/1.0/modules/content/", +} + + +def fetch_feed(url: str) -> bytes | None: + """Fetch raw XML from a feed URL.""" + req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT}) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + return resp.read() + except urllib.error.HTTPError as e: + print(f"HTTP {e.code} fetching {url}", file=sys.stderr) + return None + except urllib.error.URLError as e: + print(f"URL error fetching {url}: {e.reason}", file=sys.stderr) + return None + + +def parse_rss_item(item: ElementTree.Element) -> dict: + """Parse a single RSS <item> element.""" + return { + "title": (item.findtext("title") or "").strip(), + "link": (item.findtext("link") or "").strip(), + "description": (item.findtext("description") or "").strip(), + "published": ( + item.findtext("pubDate") + or item.findtext(f"{{{NAMESPACES['dc']}}}date") + or "" + ).strip(), + } + + +def parse_atom_entry(entry: ElementTree.Element) -> dict: + """Parse a single Atom <entry> element.""" + link_el = entry.find(f"{{{NAMESPACES['atom']}}}link") + link = link_el.get("href", "") if link_el is not None else "" + + return { + "title": (entry.findtext(f"{{{NAMESPACES['atom']}}}title") or "").strip(), + "link": link.strip(), + "description": ( + entry.findtext(f"{{{NAMESPACES['atom']}}}summary") + or entry.findtext(f"{{{NAMESPACES['atom']}}}content") + or "" + ).strip(), + "published": ( + entry.findtext(f"{{{NAMESPACES['atom']}}}published") + or entry.findtext(f"{{{NAMESPACES['atom']}}}updated") + or "" + ).strip(), + } + + +def parse_feed(xml_bytes: bytes) -> list[dict]: + """Parse RSS or Atom feed XML into a list of items.""" + try: + root = ElementTree.fromstring(xml_bytes) + except ElementTree.ParseError as e: + print(f"XML parse error: {e}", file=sys.stderr) + return [] + + items = [] + + # Try RSS 2.0 format: <rss><channel><item> + for item in root.iter("item"): + parsed = parse_rss_item(item) + if parsed["title"]: + items.append(parsed) + + # Try Atom format: <feed><entry> + if not items: + for entry in root.iter(f"{{{NAMESPACES['atom']}}}entry"): + parsed = parse_atom_entry(entry) + if parsed["title"]: + items.append(parsed) + + return items + + +def fetch_and_parse(feed_key: str, feed_config: dict) -> dict: + """Fetch and parse a single feed, returning structured output.""" + xml = fetch_feed(feed_config["url"]) + if xml is None: + return { + "feed": feed_key, + "name": feed_config["name"], + "url": feed_config["url"], + "status": "failed", + "items": [], + } + + items = parse_feed(xml) + return { + "feed": feed_key, + "name": feed_config["name"], + "url": feed_config["url"], + "status": "ok", + "item_count": len(items), + "items": items, + } + + +def main(): + parser = argparse.ArgumentParser(description="Fetch headlines from RSS feeds") + parser.add_argument( + "--output-dir", + default=None, + help="Directory to write JSON files (one per feed + manifest). If not set, prints to stdout.", + ) + parser.add_argument( + "--feeds", + nargs="*", + default=None, + help=f"Feed keys to fetch (default: all). Available: {', '.join(FEEDS.keys())}", + ) + args = parser.parse_args() + + feed_keys = args.feeds or list(FEEDS.keys()) + invalid = [k for k in feed_keys if k not in FEEDS] + if invalid: + print(f"Unknown feeds: {', '.join(invalid)}", file=sys.stderr) + print(f"Available: {', '.join(FEEDS.keys())}", file=sys.stderr) + sys.exit(1) + + results = [] + for key in feed_keys: + print(f"Fetching {FEEDS[key]['name']}...", file=sys.stderr) + result = fetch_and_parse(key, FEEDS[key]) + results.append(result) + print( + f" {result['status']}: {result.get('item_count', 0)} items", + file=sys.stderr, + ) + + if args.output_dir: + os.makedirs(args.output_dir, exist_ok=True) + + # Write individual feed files + for result in results: + path = os.path.join(args.output_dir, f"{result['feed']}.json") + with open(path, "w") as f: + json.dump(result, f, indent=2) + + # Write manifest + manifest = { + "fetched_at": datetime.now(timezone.utc).isoformat(), + "feeds": [ + { + "feed": r["feed"], + "name": r["name"], + "status": r["status"], + "item_count": r.get("item_count", 0), + "file": f"{r['feed']}.json", + } + for r in results + ], + "total_items": sum(r.get("item_count", 0) for r in results), + } + manifest_path = os.path.join(args.output_dir, "manifest.json") + with open(manifest_path, "w") as f: + json.dump(manifest, f, indent=2) + + print( + f"\nWrote {len(results)} feeds ({manifest['total_items']} total items) to {args.output_dir}", + file=sys.stderr, + ) + else: + # Print all results to stdout + output = { + "fetched_at": datetime.now(timezone.utc).isoformat(), + "feeds": results, + "total_items": sum(r.get("item_count", 0) for r in results), + } + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/lib/scanner.py b/lib/scanner.py new file mode 100644 index 0000000..05061ca --- /dev/null +++ b/lib/scanner.py @@ -0,0 +1,108 @@ +"""Reddit scanner using the public JSON API. + +Usage: + python -m lib.scanner <subreddit> + python -m lib.scanner <subreddit> --limit 50 + python -m lib.scanner <subreddit> --with-comments + +Fetches recent posts from a subreddit and outputs them as JSON. +No authentication required - uses Reddit's public .json endpoint. + +With --with-comments, also fetches the top comments for each post +(one additional request per post - be mindful of rate limits). +""" + +import argparse +import json +import sys +import time +import urllib.request +import urllib.error + +USER_AGENT = "community-scanner/0.1 (example bot; github.com/futuresearch/example-cc-cronjob)" + + +def fetch_json(url: str) -> dict | None: + """Fetch JSON from a URL with error handling.""" + req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT}) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + return json.loads(resp.read().decode()) + except urllib.error.HTTPError as e: + print(f"HTTP {e.code} fetching {url}", file=sys.stderr) + return None + except urllib.error.URLError as e: + print(f"URL error fetching {url}: {e.reason}", file=sys.stderr) + return None + + +def fetch_comments(permalink: str, limit: int = 10) -> list[dict]: + """Fetch top comments for a post via its permalink.""" + url = f"https://www.reddit.com{permalink}.json?limit={limit}&sort=top" + data = fetch_json(url) + if not data or len(data) < 2: + return [] + + comments = [] + for child in data[1].get("data", {}).get("children", []): + if child.get("kind") != "t1": + continue + c = child.get("data", {}) + comments.append({ + "author": c.get("author", ""), + "body": c.get("body", ""), + "score": c.get("score", 0), + }) + + return comments + + +def fetch_subreddit( + subreddit: str, + limit: int = 25, + with_comments: bool = False, +) -> list[dict]: + """Fetch recent posts from a subreddit via the public JSON API.""" + url = f"https://www.reddit.com/r/{subreddit}/new.json?limit={limit}" + data = fetch_json(url) + if not data: + return [] + + posts = [] + for child in data.get("data", {}).get("children", []): + post = child.get("data", {}) + permalink = post.get("permalink", "") + + entry = { + "url": f"https://www.reddit.com{permalink}", + "title": post.get("title", ""), + "selftext": post.get("selftext", ""), + "author": post.get("author", ""), + "score": post.get("score", 0), + "num_comments": post.get("num_comments", 0), + "created_utc": post.get("created_utc", 0), + "subreddit": subreddit, + } + + if with_comments and post.get("num_comments", 0) > 0: + time.sleep(1) # Rate limit: 1 request per second + entry["comments"] = fetch_comments(permalink) + + posts.append(entry) + + return posts + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Fetch recent posts from a subreddit") + parser.add_argument("subreddit", help="Subreddit name (without r/ prefix)") + parser.add_argument("--limit", type=int, default=25, help="Number of posts to fetch") + parser.add_argument( + "--with-comments", + action="store_true", + help="Also fetch top comments for each post (slower, one extra request per post)", + ) + args = parser.parse_args() + + posts = fetch_subreddit(args.subreddit, limit=args.limit, with_comments=args.with_comments) + print(json.dumps(posts, indent=2)) diff --git a/lib/seo_prepare.py b/lib/seo_prepare.py new file mode 100644 index 0000000..090071f --- /dev/null +++ b/lib/seo_prepare.py @@ -0,0 +1,165 @@ +"""Prepare SEO data for analysis agents. + +Takes raw GSC data and produces per-page input files for seo-page-analyzer agents. +Computes metrics, week-over-week diffs, and loads experiment history. + +Usage: + python -m lib.seo_prepare --date 2026-01-23 + python -m lib.seo_prepare --date 2026-01-23 --dry-run + +No external dependencies -- stdlib only (json, pathlib, argparse). +""" + +import argparse +import json +from datetime import UTC, datetime +from pathlib import Path + +# Configuration: edit these for your site +DOMAIN = "example.com" # Must match GSC page URLs +CONTENT_DIR = Path("content") # Where your .md/.mdx files live +PAGE_CATEGORIES: dict[str, str] = {} # slug -> category (blog, docs, landing) +DATA_DIR = Path("data/seo") + + +def _load(path: Path) -> list | dict: + if not path.exists(): return [] + try: return json.loads(path.read_text()) + except (json.JSONDecodeError, OSError): return [] + + +def _normalize(data: list | dict) -> list[dict]: + """Flatten GSC MCP response ({rows: [{keys, clicks, ...}]}) into dicts.""" + rows = data.get("rows", data) if isinstance(data, dict) else data if isinstance(data, list) else [] + out = [] + for r in rows: + if "keys" not in r: out.append(r); continue + e = {"url": r["keys"][-1]} if r["keys"] else {} + if len(r["keys"]) >= 2: e["query"] = r["keys"][0] + for f in ("clicks", "impressions", "ctr", "position"): e[f] = r.get(f, 0) + out.append(e) + return out + + +def _index(rows: list[dict]) -> dict: + """Build URL -> metrics dict or URL -> [query dicts] lookup.""" + out: dict = {} + for r in rows: + url = r.get("url", "") + if "query" in r: out.setdefault(url, []).append(r) + else: out[url] = {k: r.get(k, 0) for k in ("clicks", "impressions", "ctr", "position")} + for v in out.values(): + if isinstance(v, list): v.sort(key=lambda x: x.get("impressions", 0), reverse=True) + return out + + +def _find(slug, metrics, queries): + for u in (f"https://{DOMAIN}/{slug}/", f"https://{DOMAIN}/{slug}", + f"https://www.{DOMAIN}/{slug}/", f"https://www.{DOMAIN}/{slug}"): + m, q = metrics.get(u), queries.get(u, []) + if m or q: return m, q + return None, [] + + +def _diff(cm, cq, pm, pq): + if not pm and not pq: return None + c, p = cm or {}, pm or {} + pmap, cmap = {q["query"]: q for q in pq}, {q["query"]: q for q in cq} + return { + "clicks_delta": c.get("clicks", 0) - p.get("clicks", 0), + "impressions_delta": c.get("impressions", 0) - p.get("impressions", 0), + "ctr_delta": round(c.get("ctr", 0) - p.get("ctr", 0), 4), + "position_delta": round((p.get("position") or 0) - (c.get("position") or 0), 2), + "queries_gained": sorted([{"query": q, **cmap[q]} for q in set(cmap) - set(pmap)], + key=lambda x: x["impressions"], reverse=True)[:20], + "queries_lost": sorted([{"query": q, **pmap[q]} for q in set(pmap) - set(cmap)], + key=lambda x: x["impressions"], reverse=True)[:20], + } + + +def _history(slug, current, date): + """Load experiment outcomes from data/seo/changes/*.json.""" + cdir = DATA_DIR / "changes" + if not cdir.exists(): return [] + hist = [] + for f in sorted(cdir.glob("*.json")): + data = _load(f) + if not isinstance(data, dict): continue + for ch in data.get("changes", []): + if ch.get("slug") != slug: continue + exp = f.stem + try: days = (datetime.strptime(date, "%Y-%m-%d") - datetime.strptime(exp, "%Y-%m-%d")).days + except (ValueError, TypeError): days = 0 + before = ch.get("data_at_change", {}) + after, outcome = None, "pending" + if days >= 7 and current: + after = {k: current.get(k, 0) for k in ("clicks", "impressions", "ctr", "position")} + bc, ac = before.get("ctr", 0), after.get("ctr", 0) + cp = ((ac - bc) / bc * 100) if bc else 0 + pd = (before.get("position") or 100) - (after.get("position") or 100) + outcome = "improved" if cp > 20 or pd > 1 else "regressed" if cp < -20 or pd < -1 else "neutral" + hist.append({"experiment_date": exp, "days_since": days, "change_type": ch.get("field"), + "old_value": ch.get("old_value"), "new_value": ch.get("new_value"), + "data_before": before or None, "data_after": after, "outcome": outcome}) + return hist + + +def prepare(date: str, dry_run: bool = False) -> dict: + raw = DATA_DIR / "runs" / date / "raw" + pdir = DATA_DIR / "runs" / date / "pages" + pages = _normalize(_load(raw / "all-pages.json")) + qdata = _normalize(_load(raw / "page-queries.json")) + if not pages and not qdata: + print(f"Error: no raw data in {raw}"); return {"error": "missing_files"} + + mi, qi = _index(pages), _index(qdata) + # Previous run + rdir = DATA_DIR / "runs" + prev = sorted([d.name for d in rdir.iterdir() if d.is_dir() and d.name < date], reverse=True) if rdir.exists() else [] + pd = prev[0] if prev else None + pm, pq = {}, {} + if pd: + pr = DATA_DIR / "runs" / pd / "raw" + pm, pq = _index(_normalize(_load(pr / "all-pages.json"))), _index(_normalize(_load(pr / "page-queries.json"))) + + # Build inventory + inv, slugs = [], set() + if CONTENT_DIR.exists(): + for ext in ("*.md", "*.mdx"): + for f in sorted(CONTENT_DIR.glob(ext)): + s = f.stem; slugs.add(s) + inv.append({"slug": s, "url": f"https://{DOMAIN}/{s}/", + "category": PAGE_CATEGORIES.get(s, "other"), "title": "", "description": ""}) + for url in mi: + s = url.rstrip("/").split("/")[-1] + if s and s not in slugs: + slugs.add(s) + inv.append({"slug": s, "url": url, "category": PAGE_CATEGORIES.get(s, "other"), "title": "", "description": ""}) + + if not dry_run: pdir.mkdir(parents=True, exist_ok=True) + meta = _load(raw / "metadata.json") + dr = meta.get("date_range", {}) if isinstance(meta, dict) else {} + + for p in inv: + s = p["slug"] + m, q = _find(s, mi, qi); prm, prq = _find(s, pm, pq) + out = {"slug": s, "url": p["url"], "category": p["category"], + "current_metadata": {"title": p["title"], "description": p["description"]}, + "gsc_current": {"date_range": dr, "clicks": (m or {}).get("clicks", 0), + "impressions": (m or {}).get("impressions", 0), "ctr": (m or {}).get("ctr", 0), + "position": (m or {}).get("position"), "in_gsc": m is not None, "queries": q[:30]}, + "gsc_previous": {"clicks": prm.get("clicks", 0), "impressions": prm.get("impressions", 0), + "ctr": prm.get("ctr", 0), "position": prm.get("position")} if prm else None, + "gsc_diff": _diff(m, q, prm, prq), "experiment_history": _history(s, m, date)} + if not dry_run: (pdir / f"{s}.json").write_text(json.dumps(out, indent=2)) + + print(f"Prepared {len(inv)} pages for {date}" + (f" (prev: {pd})" if pd else "")) + return {"date": date, "pages": len(inv), "previous_run": pd, "at": datetime.now(UTC).isoformat()} + + +if __name__ == "__main__": + p = argparse.ArgumentParser(description="Prepare SEO data for analysis agents") + p.add_argument("--date", required=True, help="Run date (YYYY-MM-DD)") + p.add_argument("--dry-run", action="store_true", help="Show what would be created") + a = p.parse_args() + print(json.dumps(prepare(a.date, a.dry_run), indent=2))