From 36700b2dcfb2da5c42b73212b0bfef6da875da73 Mon Sep 17 00:00:00 2001 From: Alex Qiu Date: Thu, 12 Feb 2026 18:07:56 -0800 Subject: [PATCH 1/5] =?UTF-8?q?dynamic=20menu=20scraper=20template=20?= =?UTF-8?q?=E2=80=93=20WIP?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../menu-dynamic-extraction-demo/.env.example | 15 ++ .../menu-dynamic-extraction-demo/.gitignore | 34 +++ python/menu-dynamic-extraction-demo/README.md | 112 ++++++++ python/menu-dynamic-extraction-demo/config.py | 36 +++ python/menu-dynamic-extraction-demo/main.py | 147 ++++++++++ python/menu-dynamic-extraction-demo/models.py | 98 +++++++ .../pyproject.toml | 35 +++ .../menu-dynamic-extraction-demo/scraper.py | 255 ++++++++++++++++++ python/menu-dynamic-extraction-demo/utils.py | 117 ++++++++ 9 files changed, 849 insertions(+) create mode 100644 python/menu-dynamic-extraction-demo/.env.example create mode 100644 python/menu-dynamic-extraction-demo/.gitignore create mode 100644 python/menu-dynamic-extraction-demo/README.md create mode 100644 python/menu-dynamic-extraction-demo/config.py create mode 100644 python/menu-dynamic-extraction-demo/main.py create mode 100644 python/menu-dynamic-extraction-demo/models.py create mode 100644 python/menu-dynamic-extraction-demo/pyproject.toml create mode 100644 python/menu-dynamic-extraction-demo/scraper.py create mode 100644 python/menu-dynamic-extraction-demo/utils.py diff --git a/python/menu-dynamic-extraction-demo/.env.example b/python/menu-dynamic-extraction-demo/.env.example new file mode 100644 index 00000000..79ab44e8 --- /dev/null +++ b/python/menu-dynamic-extraction-demo/.env.example @@ -0,0 +1,15 @@ +# Browserbase credentials (required) +# Get these from https://www.browserbase.com/settings +BROWSERBASE_PROJECT_ID=your_browserbase_project_id +BROWSERBASE_API_KEY=your_browserbase_api_key + +# Google API key for Gemini model (required for Stagehand) +# Get your key from https://aistudio.google.com/apikey 
+GOOGLE_API_KEY=your_google_api_key + +# Optional: Logging configuration +# LOG_LEVEL=INFO + +# Optional: File paths +# WEBSITES_FILE=websites.txt +# OUTPUT_DIR=results diff --git a/python/menu-dynamic-extraction-demo/.gitignore b/python/menu-dynamic-extraction-demo/.gitignore new file mode 100644 index 00000000..a95c2797 --- /dev/null +++ b/python/menu-dynamic-extraction-demo/.gitignore @@ -0,0 +1,34 @@ +# Environment variables (CRITICAL - contains API keys) +.env + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +venv/ +ENV/ +env/ +*.egg-info/ +dist/ +build/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +.DS_Store + +# Output directories +results/ +output/ +*.json +*.csv + +# Logs +*.log + +# Jupyter +.ipynb_checkpoints/ diff --git a/python/menu-dynamic-extraction-demo/README.md b/python/menu-dynamic-extraction-demo/README.md new file mode 100644 index 00000000..28ade423 --- /dev/null +++ b/python/menu-dynamic-extraction-demo/README.md @@ -0,0 +1,112 @@ +# Stagehand + Browserbase: Restaurant Menu Extractor + +## AT A GLANCE + +- **Goal**: Automate restaurant menu extraction from websites using AI-powered browser automation to scrape menu items, prices, descriptions, and categories. +- **Pattern Template**: Demonstrates web scraping with Stagehand's observe/act/extract pattern for navigating complex restaurant websites and parsing menu structures. +- **Workflow**: Stagehand navigates to restaurant website, finds menu links using observe, extracts structured data with Pydantic schemas, handles multi-section menus (lunch/dinner/drinks), and outputs JSON results. +- **Multi-Section Support**: Automatically detects menu subsections (Lunch, Dinner, Happy Hour, etc.) and extracts each separately for comprehensive coverage. +- **Production-Ready**: Includes retry logic, popup handling, logging, error recovery, and parallel processing capabilities for batch extraction. 
+- Docs → [Stagehand Act](https://docs.stagehand.dev/basics/act) | [Stagehand Observe](https://docs.stagehand.dev/basics/observe) | [Stagehand Extract](https://docs.stagehand.dev/basics/extract) + +## GLOSSARY + +- **observe**: Find and return interactive elements on the page matching a description without performing actions. Used here to locate menu links and subsections. + Docs → https://docs.stagehand.dev/basics/observe +- **act**: Perform UI actions from natural language prompts (click buttons, navigate links). Used to click menu links discovered via observe. + Docs → https://docs.stagehand.dev/basics/act +- **extract**: Pull structured data from web pages using natural language instructions and Pydantic schemas. Ensures menu data is consistently formatted. + Docs → https://docs.stagehand.dev/basics/extract +- **Pydantic schemas**: Type-safe data models that define the structure of extracted menu data (sections, categories, items, prices). + Docs → https://docs.pydantic.dev/ +- **BYOB (Bring Your Own Browser)**: Run Stagehand sessions on Browserbase's cloud infrastructure for reliability, scalability, and live debugging. + Docs → https://docs.browserbase.com + +## QUICKSTART + +1. cd python/restaurant-demo +2. Install dependencies with uv: + + ```bash + uv pip install -e . + ``` + + Alternatively, use pip: + + ```bash + python -m venv venv + source venv/bin/activate # On Windows: venv\Scripts\activate + pip install -e . + ``` + +3. cp .env.example .env +4. Add required API keys to .env: + - `BROWSERBASE_PROJECT_ID` - Get from https://www.browserbase.com/settings + - `BROWSERBASE_API_KEY` - Get from https://www.browserbase.com/settings + - `GOOGLE_API_KEY` - Get from https://aistudio.google.com/apikey +5. Run the script: + ```bash + python main.py + ``` + The script will prompt you for a restaurant website URL. 
+ +## EXPECTED OUTPUT + +- Prompts for restaurant website URL input +- Initializes Stagehand session with Browserbase (verbose logging shows browser actions) +- Navigates to the restaurant website and attempts to close any popups/modals +- Uses observe to find the menu link (retries up to 3 times if needed) +- Clicks the menu link and navigates to menu page +- Detects all menu subsections (Lunch, Dinner, Drinks, etc.) via observe +- For each subsection: + - Navigates to that section + - Extracts structured menu data: sections → categories → items (name, description, price) +- All extraction results are stored in the Stagehand session (can be extended to write JSON files) +- Session closes cleanly after extraction completes + +Example log output: +``` +INFO: Navigating to https://example-restaurant.com ... +INFO: Menu link found: ['https://example-restaurant.com/menu'] +INFO: Navigating to menu section: Lunch Menu ... +INFO: Extracting menu section: Lunch Menu +INFO: Navigating to menu section: Dinner Menu ... +INFO: Session closed successfully +``` + +## COMMON PITFALLS + +- "ModuleNotFoundError: No module named 'stagehand'": Ensure you installed dependencies with `uv pip install -e .` or `pip install -e .` +- Missing API keys: Verify .env contains BROWSERBASE_PROJECT_ID, BROWSERBASE_API_KEY, and GOOGLE_API_KEY +- "Could not find menu link after multiple attempts": The restaurant website may have an unusual structure. Try manually checking if there's a clear "Menu" link. Increase MAX_RETRIES in config if needed. +- Popup/modal blocking: The script attempts to close popups automatically, but some sites have persistent overlays. Check the Browserbase live view link to debug. +- Empty extraction results: Some restaurant sites load menus dynamically or via iframes. The script skips iframe links automatically but may need manual adjustment for special cases. +- Stagehand verbose=2 logging: Produces detailed output for debugging. 
Set LOG_LEVEL=WARNING in .env for quieter output. +- Find more information on your Browserbase dashboard → https://www.browserbase.com/sign-in + +## USE CASES + +• **Restaurant data aggregation**: Build a database of restaurant menus across multiple locations for food delivery or review platforms. +• **Menu price comparison**: Track menu prices over time to detect price changes or compare pricing across restaurant chains. +• **Dietary restriction filtering**: Extract menu items and descriptions to identify vegan, gluten-free, or allergen-friendly options automatically. +• **Recipe inspiration**: Collect menu descriptions to analyze trending ingredients, flavor combinations, or plating techniques. + +## LIMITATIONS +• **PDF menu support**: Some restaurants use PDF menus. Enhance extraction to handle PDF downloads and OCR if needed. + +## NEXT STEPS + +• **Batch processing**: Modify to accept a list of restaurant URLs from a file and process them in parallel using asyncio workers (see scraper.py for agent pattern). +• **Output to database**: Extend the script to save extracted menus to PostgreSQL, MongoDB, or Airtable for persistent storage and querying. +• **Restaurant info extraction**: Expand to extract contact details (phone, email, hours, address) in addition to menu data. +• **Incremental updates**: Track previously extracted menus and only re-scrape when website content has changed (use checksums or last-modified headers). + +## HELPFUL RESOURCES + +📚 Stagehand Docs: https://docs.stagehand.dev/v3/first-steps/introduction +📚 Python SDK: https://docs.stagehand.dev/v3/sdk/python +🎮 Browserbase: https://www.browserbase.com +💡 Try it out: https://www.browserbase.com/playground +🔧 Templates: https://www.browserbase.com/templates +📧 Need help? 
support@browserbase.com +💬 Discord: http://stagehand.dev/discord diff --git a/python/menu-dynamic-extraction-demo/config.py b/python/menu-dynamic-extraction-demo/config.py new file mode 100644 index 00000000..819f0e83 --- /dev/null +++ b/python/menu-dynamic-extraction-demo/config.py @@ -0,0 +1,36 @@ +# Stagehand + Browserbase: Restaurant Menu Extractor - Configuration +# See README.md for full documentation + +"""Configuration and environment variables for the restaurant scraper.""" + +import os +import logging +from dotenv import load_dotenv +from browserbase import Browserbase + +# Load environment variables from .env file +load_dotenv() + +# API Keys +MODEL_API_KEY = os.getenv("GOOGLE_API_KEY") # Google API key for Gemini models +BROWSERBASE_API_KEY = os.getenv("BROWSERBASE_API_KEY") +BROWSERBASE_PROJECT_ID = os.getenv("BROWSERBASE_PROJECT_ID") + +# File paths +WEBSITES_FILE = os.getenv("WEBSITES_FILE", "websites.txt") +OUTPUT_DIR = "results" + +# Scraper settings +NO_MENU_LINK_FOUND = "NO_MENU_LINK_FOUND" +MAX_RETRIES = 3 + +# Logging +LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper() +logging.basicConfig( + level=LOG_LEVEL, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", +) +logger = logging.getLogger(__name__) + +# Initialize Browserbase client +bb = Browserbase(api_key=BROWSERBASE_API_KEY) diff --git a/python/menu-dynamic-extraction-demo/main.py b/python/menu-dynamic-extraction-demo/main.py new file mode 100644 index 00000000..f0dadb7a --- /dev/null +++ b/python/menu-dynamic-extraction-demo/main.py @@ -0,0 +1,147 @@ +# Stagehand + Browserbase: Restaurant Menu Extractor +# See README.md for full documentation + +""" +Main entrypoint for restaurant menu extraction. + +This script uses Stagehand + Browserbase to automatically: +1. Navigate to restaurant websites +2. Find and click menu links +3. Extract structured menu data (sections, categories, items) +4. 
Save results to JSON files + +Usage: + python main.py # Interactive mode - prompts for URL + python main.py --batch # Batch mode - processes URLs from websites.txt +""" + +from playwright.sync_api import sync_playwright +from stagehand import Stagehand +from config import ( + BROWSERBASE_API_KEY, + BROWSERBASE_PROJECT_ID, + MODEL_API_KEY, + NO_MENU_LINK_FOUND, + bb, + logger +) +from models import MENU_SCHEMA +from utils import normalize_url, get_website_from_user, load_websites_from_file, save_menu_to_json +from scraper import close_popups, find_menu_link, extract_menu_from_sections, process_restaurant + + +def main(): + """Main function for interactive single-restaurant extraction.""" + # Create Browserbase session + session = bb.sessions.create(project_id=BROWSERBASE_PROJECT_ID) + session_id = session.id + + # Initialize Stagehand client + client = Stagehand( + browserbase_api_key=BROWSERBASE_API_KEY, + browserbase_project_id=BROWSERBASE_PROJECT_ID, + model_api_key=MODEL_API_KEY, + ) + + logger.info(f"Session started: {session_id}") + logger.info(f"Watch live: https://browserbase.com/sessions/{session_id}") + + try: + # Connect Playwright to Browserbase + with sync_playwright() as p: + browser = p.chromium.connect_over_cdp( + f"wss://connect.browserbase.com?apiKey={BROWSERBASE_API_KEY}&sessionId={session_id}" + ) + ctx = browser.contexts[0] + page = ctx.pages[0] if ctx.pages else ctx.new_page() + + # Get website URL from user + website_url = normalize_url(get_website_from_user()) + logger.info(f"Navigating to {website_url} ...") + page.goto(website_url, wait_until="domcontentloaded") + + # Close any popups + close_popups(client, session_id) + + # Locate menu link with retries + menu_link = find_menu_link(client, session_id) + if menu_link == NO_MENU_LINK_FOUND: + logger.error("Could not find menu link after multiple attempts.") + else: + logger.info(f"Menu link found: {menu_link}") + + # Navigate to menu + client.sessions.act( + id=session_id, + input=f"Click 
on: {menu_link[0] if isinstance(menu_link, list) else menu_link}", + options={"model": {"modelName": "google/gemini-2.5-flash"}}, + ) + + page.wait_for_load_state("load", timeout=20000) + + # Find menu subsections + sections_response = client.sessions.observe( + id=session_id, + instruction="Find all subsections on the current menu page, i.e. 'Lunch', 'Dinner', 'Happy Hour', etc. " + "Return them as a list of links. If none found, return the current page link only in a list. " + "Do not return duplicates if a link appears multiple times.", + ) + sections = sections_response.data.result + + # Extract menu from all sections + all_menu_sections = extract_menu_from_sections(client, session_id, page, sections) + + # Save combined menu data to JSON file + if all_menu_sections: + save_menu_to_json(website_url, all_menu_sections) + + browser.close() + + finally: + # End session + client.sessions.end(id=session_id) + logger.info("Session closed successfully") + + +def batch_process(): + """ + Process multiple restaurant websites in parallel. + URLs are loaded from WEBSITES_FILE (default: websites.txt). 
+ + Example usage: + Create websites.txt with one URL per line: + https://www.restaurant1.com + https://www.restaurant2.com + # This is a comment + https://www.restaurant3.com + """ + websites = load_websites_from_file() + if not websites: + logger.error("No websites to process") + return + + logger.info(f"Starting batch processing of {len(websites)} websites") + + # Process all restaurants sequentially (sync version) + results = [] + for idx, url in enumerate(websites, start=1): + result = process_restaurant(url, agent_id=idx) + results.append(result) + + # Summary + successful = sum(1 for r in results if r["status"] == "success") + failed = len(results) - successful + logger.info(f"\n{'='*60}") + logger.info(f"Batch processing complete!") + logger.info(f"Total: {len(results)} | Success: {successful} | Failed: {failed}") + logger.info(f"{'='*60}\n") + + +if __name__ == "__main__": + import sys + + # Simple CLI argument handling + if len(sys.argv) > 1 and sys.argv[1] == "--batch": + batch_process() + else: + main() diff --git a/python/menu-dynamic-extraction-demo/models.py b/python/menu-dynamic-extraction-demo/models.py new file mode 100644 index 00000000..3eb99f98 --- /dev/null +++ b/python/menu-dynamic-extraction-demo/models.py @@ -0,0 +1,98 @@ +# Stagehand + Browserbase: Restaurant Menu Extractor - Data Models +# See README.md for full documentation + +"""Pydantic models and JSON schemas for menu extraction.""" + +from typing import Optional, List +from pydantic import BaseModel, Field + + +class MenuItem(BaseModel): + name: str + description: Optional[str] = None + price: Optional[str] = None + + +class MenuCategory(BaseModel): + """ + A category within a section. + e.g., "Antipasti", "Pizza", "Pasta" + """ + category_name: str + items: List[MenuItem] + + +class MenuSection(BaseModel): + """ + A full menu section, e.g., "Lunch", "Dinner", "Dessert". + Each section contains its own categories. 
+ """ + section_name: str + categories: List[MenuCategory] + + +class Menu(BaseModel): + """ + The full restaurant menu. + Compatible with restaurants with multiple menu pages or subsections. + """ + sections: List[MenuSection] + + +# Manual JSON schema for Gemini API compatibility (avoids Pydantic's $defs) +MENU_SCHEMA = { + "type": "object", + "properties": { + "sections": { + "type": "array", + "description": "Menu sections (e.g., Lunch, Dinner, Dessert)", + "items": { + "type": "object", + "properties": { + "section_name": { + "type": "string", + "description": "Name of the menu section" + }, + "categories": { + "type": "array", + "description": "Categories within this section", + "items": { + "type": "object", + "properties": { + "category_name": { + "type": "string", + "description": "Name of the category (e.g., Appetizers, Entrees)" + }, + "items": { + "type": "array", + "description": "Menu items in this category", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Item name" + }, + "description": { + "type": "string", + "description": "Item description" + }, + "price": { + "type": "string", + "description": "Item price" + } + }, + "required": ["name"] + } + } + }, + "required": ["category_name", "items"] + } + } + }, + "required": ["section_name", "categories"] + } + } + }, + "required": ["sections"] +} diff --git a/python/menu-dynamic-extraction-demo/pyproject.toml b/python/menu-dynamic-extraction-demo/pyproject.toml new file mode 100644 index 00000000..0e0bb560 --- /dev/null +++ b/python/menu-dynamic-extraction-demo/pyproject.toml @@ -0,0 +1,35 @@ +[project] +name = "restaurant-demo" +version = "0.1.0" +description = "Restaurant menu extraction using Stagehand and Browserbase" +readme = "README.md" +requires-python = ">=3.9" +dependencies = [ + "browserbase>=1.4.0", + "playwright>=1.40.0", + "pydantic>=2.0.0", + "python-dotenv>=1.2.1", + "stagehand>=3.0.0", # v3 API - pure API client +] + 
+[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "black>=23.0.0", + "ruff>=0.1.0", +] + +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.black] +line-length = 100 +target-version = ['py39', 'py310', 'py311'] + +[tool.ruff] +line-length = 100 +target-version = "py39" + +[tool.ruff.lint] +select = ["E", "F", "I", "N", "W"] diff --git a/python/menu-dynamic-extraction-demo/scraper.py b/python/menu-dynamic-extraction-demo/scraper.py new file mode 100644 index 00000000..4eaad1de --- /dev/null +++ b/python/menu-dynamic-extraction-demo/scraper.py @@ -0,0 +1,255 @@ +# Stagehand + Browserbase: Restaurant Menu Extractor - Scraping Logic +# See README.md for full documentation + +"""Core scraping logic for restaurant menu extraction.""" + +import time +import logging +from typing import List, Dict, Any, Optional +from datetime import datetime +from playwright.sync_api import sync_playwright, Page +from stagehand import Stagehand +from config import ( + BROWSERBASE_API_KEY, + BROWSERBASE_PROJECT_ID, + MODEL_API_KEY, + NO_MENU_LINK_FOUND, + MAX_RETRIES, + bb, + logger +) +from models import MENU_SCHEMA +from utils import save_menu_to_json + + +def close_popups(client: Stagehand, session_id: str, log: logging.Logger = logger) -> bool: + """ + Attempt to close popups/modals that might be blocking the page. 
+ + Args: + client: Stagehand client instance + session_id: Active session ID + log: Logger instance + + Returns: + True if popups were closed, False otherwise + """ + try: + client.sessions.act( + id=session_id, + input="Close any popups, modals, or cookie notices that are blocking the page", + options={"model": {"modelName": "google/gemini-2.5-flash"}}, + ) + log.info("Successfully closed popups/modals") + return True + except Exception as e: + log.debug(f"No popups to close or failed to close: {e}") + return False + + +def find_menu_link(client: Stagehand, session_id: str, max_retries: int = MAX_RETRIES): + """ + Attempt to locate the restaurant's menu link using Stagehand observe. + Retries up to max_retries times if it fails. + + Args: + client: Stagehand client instance + session_id: Active session ID + max_retries: Maximum number of retry attempts + + Returns: + Menu link result or NO_MENU_LINK_FOUND + """ + instruction = ( + "Find the most likely link to the restaurant's menu on this webpage. If the webpage " + "already is the menu page, return the current page URL. Return only the link URL." + ) + + for attempt in range(1, max_retries + 1): + try: + response = client.sessions.observe( + id=session_id, + instruction=instruction, + options={"model": {"modelName": "google/gemini-2.5-flash"}}, + ) + return response.data.result + except Exception as e: + logger.warning(f"[Attempt {attempt}] Failed: {e}") + time.sleep(1) + return NO_MENU_LINK_FOUND + + +def extract_menu_from_sections( + client: Stagehand, + session_id: str, + page: Page, + sections: List[Any] +) -> List[Dict[str, Any]]: + """ + Extract menu data from all sections. 
+ + Args: + client: Stagehand client instance + session_id: Active session ID + page: Playwright page instance + sections: List of menu sections to extract + + Returns: + List of all extracted menu sections + """ + all_menu_sections = [] + + for section in sections: + section_desc = section.get("description", "") if isinstance(section, dict) else str(section) + logger.info(f"Navigating to menu section: {section_desc} ...") + + # Skip iframe links + if "iframe" in section_desc.lower(): + logger.info("Skipping iframe link ...") + continue + + # Navigate to section + client.sessions.act( + id=session_id, + input=f"Navigate to: {section_desc}", + options={"model": {"modelName": "google/gemini-2.5-flash"}}, + ) + + page.wait_for_load_state("load", timeout=20000) + + # Extract menu data + extract_response = client.sessions.extract( + id=session_id, + instruction="Extract the menu organized by sections and categories. " + "Each section contains categories, and each category contains menu items. " + "For each item, extract the name, description, and price. " + "Preserve price formatting exactly as written.", + schema=MENU_SCHEMA, + options={"model": {"modelName": "google/gemini-2.5-flash"}}, + ) + logger.info(f"Menu data extracted for {section_desc}") + + # Collect the extracted menu data + menu_data = extract_response.data.result + if menu_data and "sections" in menu_data: + all_menu_sections.extend(menu_data["sections"]) + + return all_menu_sections + + +def process_restaurant(website_url: str, agent_id: int) -> Dict[str, Any]: + """ + Web agent that processes a single restaurant website. + This represents a single subprocessor in a production pipeline. 
+ + Args: + website_url: The restaurant website to scrape + agent_id: Unique identifier for this agent instance + + Returns: + Dictionary containing extraction results and metadata + """ + agent_logger = logging.getLogger(f"Agent-{agent_id}") + start_time = datetime.now() + + result = { + "agent_id": agent_id, + "url": website_url, + "status": "pending", + "start_time": start_time.isoformat(), + "menu_data": [], + "error": None, + } + + # Create Browserbase session + session = bb.sessions.create(project_id=BROWSERBASE_PROJECT_ID) + session_id = session.id + + # Initialize Stagehand client + client = Stagehand( + browserbase_api_key=BROWSERBASE_API_KEY, + browserbase_project_id=BROWSERBASE_PROJECT_ID, + model_api_key=MODEL_API_KEY, + ) + + agent_logger.info(f"Session started: {session_id}") + agent_logger.info(f"Watch live: https://browserbase.com/sessions/{session_id}") + + try: + # Connect Playwright to Browserbase + with sync_playwright() as p: + browser = p.chromium.connect_over_cdp( + f"wss://connect.browserbase.com?apiKey={BROWSERBASE_API_KEY}&sessionId={session_id}" + ) + ctx = browser.contexts[0] + page = ctx.pages[0] if ctx.pages else ctx.new_page() + + # Navigate to website + agent_logger.info(f"Navigating to {website_url}") + page.goto(website_url, wait_until="domcontentloaded") + + # Close any popups on initial page load + close_popups(client, session_id, agent_logger) + + # Extract menu data + menu_link = find_menu_link(client, session_id) + if menu_link == NO_MENU_LINK_FOUND: + agent_logger.warning("Could not find menu link") + else: + agent_logger.info(f"Menu link: {menu_link}") + + # Navigate to menu link + client.sessions.act( + id=session_id, + input=f"Click on: {menu_link[0] if isinstance(menu_link, list) else menu_link}", + options={"model": {"modelName": "google/gemini-2.5-flash"}}, + ) + + page.wait_for_load_state("load", timeout=20000) + + # Close any popups after navigating to menu page + close_popups(client, session_id, agent_logger) + + # 
Extract menu sections + sections_response = client.sessions.observe( + id=session_id, + instruction="Find all subsections on the current menu page, i.e. 'Lunch', 'Dinner', 'Happy Hour', etc. " + "Return them as a list of links. If none found, return the current page link only in a list. " + "Do not return duplicates if a link appears multiple times.", + ) + sections = sections_response.data.result + + # Extract menu from all sections + all_menu_sections = extract_menu_from_sections(client, session_id, page, sections) + + browser.close() + + result["status"] = "success" + end_time = datetime.now() + result["end_time"] = end_time.isoformat() + result["duration_seconds"] = (end_time - start_time).total_seconds() + agent_logger.info(f"Completed extraction in {result['duration_seconds']:.2f}s") + + # Save combined menu data to JSON file + if all_menu_sections: + save_menu_to_json( + website_url, + all_menu_sections, + agent_id=agent_id, + duration_seconds=result["duration_seconds"] + ) + + except Exception as e: + result["status"] = "error" + result["error"] = str(e) + agent_logger.error(f"Error processing {website_url}: {e}", exc_info=True) + + finally: + # End session + try: + client.sessions.end(id=session_id) + agent_logger.info("Session closed successfully") + except Exception as e: + agent_logger.error(f"Error closing session: {e}") + + return result diff --git a/python/menu-dynamic-extraction-demo/utils.py b/python/menu-dynamic-extraction-demo/utils.py new file mode 100644 index 00000000..23171436 --- /dev/null +++ b/python/menu-dynamic-extraction-demo/utils.py @@ -0,0 +1,117 @@ +# Stagehand + Browserbase: Restaurant Menu Extractor - Utilities +# See README.md for full documentation + +"""Utility functions for the restaurant scraper.""" + +import json +import time +import re +from typing import List, Dict, Any +from pathlib import Path +from datetime import datetime +from urllib.parse import urlparse +from config import WEBSITES_FILE, OUTPUT_DIR, logger + + +def 
normalize_url(url: str) -> str: + """ + Normalize URL to ensure it has a protocol. + + Args: + url: The URL to normalize + + Returns: + Normalized URL with https:// prefix + """ + url = url.strip() + if not url.startswith(("http://", "https://")): + url = "https://" + url + return url + + +def load_websites_from_file(file_path: str = WEBSITES_FILE) -> List[str]: + """ + Load website URLs from a text file. + Lines starting with # are treated as comments and ignored. + + Args: + file_path: Path to the file containing URLs + + Returns: + List of normalized URLs + """ + websites = [] + try: + with open(file_path, 'r') as f: + for line in f: + line = line.strip() + if line and not line.startswith('#'): + websites.append(normalize_url(line)) + logger.info(f"Loaded {len(websites)} websites from {file_path}") + return websites + except FileNotFoundError: + logger.error(f"File not found: {file_path}") + return [] + + +def get_website_from_user() -> str: + """ + Prompt the user to enter a restaurant website URL. + + Returns: + The URL entered by the user + """ + return input("Enter restaurant website URL: ").strip() + + +def save_menu_to_json( + website_url: str, + all_menu_sections: List[Dict[str, Any]], + agent_id: int = None, + duration_seconds: float = None +) -> str: + """ + Save combined menu data to a beautifully formatted JSON file. 
+ + Args: + website_url: The restaurant website URL + all_menu_sections: Combined list of all menu sections + agent_id: Optional agent ID for batch processing + duration_seconds: Optional duration of extraction + + Returns: + Path to the saved JSON file + """ + # Create results directory if it doesn't exist + Path(OUTPUT_DIR).mkdir(exist_ok=True) + + # Generate safe filename from URL and timestamp + parsed_url = urlparse(website_url) + safe_name = re.sub(r'[^\w\-]', '_', parsed_url.netloc or parsed_url.path) + timestamp = int(time.time()) + filename = f"{OUTPUT_DIR}/{safe_name}_{timestamp}.json" + + # Create combined output + output_data = { + "restaurant_url": website_url, + "extracted_at": timestamp, + "extracted_at_readable": datetime.fromtimestamp(timestamp).isoformat(), + "menu": { + "sections": all_menu_sections + } + } + + # Add optional fields + if agent_id is not None: + output_data["agent_id"] = agent_id + if duration_seconds is not None: + output_data["duration_seconds"] = duration_seconds + + # Write beautifully formatted JSON + with open(filename, 'w', encoding='utf-8') as f: + json.dump(output_data, f, indent=2, ensure_ascii=False) + + logger.info(f"✓ Menu saved to: {filename}") + logger.info(f"✓ Total sections extracted: {len(all_menu_sections)}") + + return filename From e8f6cdb5e71a5d8b04c0cd1ac0f9efcbcc435fac Mon Sep 17 00:00:00 2001 From: Alex Qiu Date: Tue, 17 Feb 2026 09:47:10 -0800 Subject: [PATCH 2/5] small fixes --- .../menu-dynamic-extraction-demo/.env.example | 9 +------- python/menu-dynamic-extraction-demo/README.md | 21 +++++++++++++------ python/menu-dynamic-extraction-demo/config.py | 11 +++++++++- python/menu-dynamic-extraction-demo/main.py | 7 ++++--- .../pyproject.toml | 2 +- .../menu-dynamic-extraction-demo/scraper.py | 1 + .../websites.txt.example | 15 +++++++++++++ 7 files changed, 47 insertions(+), 19 deletions(-) create mode 100644 python/menu-dynamic-extraction-demo/websites.txt.example diff --git 
a/python/menu-dynamic-extraction-demo/.env.example b/python/menu-dynamic-extraction-demo/.env.example index 79ab44e8..9afbafcc 100644 --- a/python/menu-dynamic-extraction-demo/.env.example +++ b/python/menu-dynamic-extraction-demo/.env.example @@ -3,13 +3,6 @@ BROWSERBASE_PROJECT_ID=your_browserbase_project_id BROWSERBASE_API_KEY=your_browserbase_api_key -# Google API key for Gemini model (required for Stagehand) +# Google API key (required for Stagehand with Gemini models) # Get your key from https://aistudio.google.com/apikey GOOGLE_API_KEY=your_google_api_key - -# Optional: Logging configuration -# LOG_LEVEL=INFO - -# Optional: File paths -# WEBSITES_FILE=websites.txt -# OUTPUT_DIR=results diff --git a/python/menu-dynamic-extraction-demo/README.md b/python/menu-dynamic-extraction-demo/README.md index 28ade423..6f2d7e28 100644 --- a/python/menu-dynamic-extraction-demo/README.md +++ b/python/menu-dynamic-extraction-demo/README.md @@ -4,6 +4,7 @@ - **Goal**: Automate restaurant menu extraction from websites using AI-powered browser automation to scrape menu items, prices, descriptions, and categories. - **Pattern Template**: Demonstrates web scraping with Stagehand's observe/act/extract pattern for navigating complex restaurant websites and parsing menu structures. +- **One script, many websites**: Stagehand can adapt to different webpage layouts with same core script thanks to its LLM-powered primitives. - **Workflow**: Stagehand navigates to restaurant website, finds menu links using observe, extracts structured data with Pydantic schemas, handles multi-section menus (lunch/dinner/drinks), and outputs JSON results. - **Multi-Section Support**: Automatically detects menu subsections (Lunch, Dinner, Happy Hour, etc.) and extracts each separately for comprehensive coverage. - **Production-Ready**: Includes retry logic, popup handling, logging, error recovery, and parallel processing capabilities for batch extraction. @@ -24,14 +25,14 @@ ## QUICKSTART -1. 
cd python/restaurant-demo +1. cd menu-dynamic-extraction-demo 2. Install dependencies with uv: ```bash uv pip install -e . ``` - Alternatively, use pip: + Alternatively, use pip/pip3: ```bash python -m venv venv @@ -49,6 +50,13 @@ python main.py ``` The script will prompt you for a restaurant website URL. + Some of our favorites here in SF include https://www.thetailorssonsf.com/, https://www.thegrovesf.com/, and https://www.nopalitosf.com/. + + To batch-process multiple restaurants: + ```bash + python main.py --batch + ``` + Create a `websites.txt` file with one URL per line (see websites.txt.example). ## EXPECTED OUTPUT @@ -61,7 +69,7 @@ - For each subsection: - Navigates to that section - Extracts structured menu data: sections → categories → items (name, description, price) -- All extraction results are stored in the Stagehand session (can be extended to write JSON files) +- Saves all extraction results to timestamped JSON files in the `results/` directory - Session closes cleanly after extraction completes Example log output: @@ -78,10 +86,10 @@ INFO: Session closed successfully - "ModuleNotFoundError: No module named 'stagehand'": Ensure you installed dependencies with `uv pip install -e .` or `pip install -e .` - Missing API keys: Verify .env contains BROWSERBASE_PROJECT_ID, BROWSERBASE_API_KEY, and GOOGLE_API_KEY -- "Could not find menu link after multiple attempts": The restaurant website may have an unusual structure. Try manually checking if there's a clear "Menu" link. Increase MAX_RETRIES in config if needed. +- "Could not find menu link after multiple attempts": The restaurant website may have an unusual structure. Try manually checking if there's a clear "Menu" link. Increase MAX_RETRIES in config.py if needed. - Popup/modal blocking: The script attempts to close popups automatically, but some sites have persistent overlays. Check the Browserbase live view link to debug. 
- Empty extraction results: Some restaurant sites load menus dynamically or via iframes. The script skips iframe links automatically but may need manual adjustment for special cases. -- Stagehand verbose=2 logging: Produces detailed output for debugging. Set LOG_LEVEL=WARNING in .env for quieter output. +- Detailed logging: The script logs INFO level by default. Set LOG_LEVEL=WARNING in .env for quieter output, or LOG_LEVEL=DEBUG for more verbose logging. - Find more information on your Browserbase dashboard → https://www.browserbase.com/sign-in ## USE CASES @@ -96,10 +104,11 @@ INFO: Session closed successfully ## NEXT STEPS -• **Batch processing**: Modify to accept a list of restaurant URLs from a file and process them in parallel using asyncio workers (see scraper.py for agent pattern). +• **Parallel batch processing**: Enhance batch processing to use asyncio workers for concurrent extraction across multiple restaurants (currently processes sequentially). • **Output to database**: Extend the script to save extracted menus to PostgreSQL, MongoDB, or Airtable for persistent storage and querying. • **Restaurant info extraction**: Expand to extract contact details (phone, email, hours, address) in addition to menu data. • **Incremental updates**: Track previously extracted menus and only re-scrape when website content has changed (use checksums or last-modified headers). +• **PDF menu support**: Add support for restaurants that use PDF menus instead of web pages. 
## HELPFUL RESOURCES diff --git a/python/menu-dynamic-extraction-demo/config.py b/python/menu-dynamic-extraction-demo/config.py index 819f0e83..cef7588d 100644 --- a/python/menu-dynamic-extraction-demo/config.py +++ b/python/menu-dynamic-extraction-demo/config.py @@ -12,10 +12,19 @@ load_dotenv() # API Keys -MODEL_API_KEY = os.getenv("GOOGLE_API_KEY") # Google API key for Gemini models +GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") # Google API key for Gemini models +MODEL_API_KEY = GOOGLE_API_KEY # Alias for compatibility BROWSERBASE_API_KEY = os.getenv("BROWSERBASE_API_KEY") BROWSERBASE_PROJECT_ID = os.getenv("BROWSERBASE_PROJECT_ID") +# Validate required environment variables +if not GOOGLE_API_KEY: + raise ValueError("GOOGLE_API_KEY environment variable is required. Get one at https://aistudio.google.com/apikey") +if not BROWSERBASE_API_KEY: + raise ValueError("BROWSERBASE_API_KEY environment variable is required. Get one at https://www.browserbase.com/settings") +if not BROWSERBASE_PROJECT_ID: + raise ValueError("BROWSERBASE_PROJECT_ID environment variable is required. 
Get one at https://www.browserbase.com/settings") + # File paths WEBSITES_FILE = os.getenv("WEBSITES_FILE", "websites.txt") OUTPUT_DIR = "results" diff --git a/python/menu-dynamic-extraction-demo/main.py b/python/menu-dynamic-extraction-demo/main.py index f0dadb7a..ebab1148 100644 --- a/python/menu-dynamic-extraction-demo/main.py +++ b/python/menu-dynamic-extraction-demo/main.py @@ -64,6 +64,7 @@ def main(): close_popups(client, session_id) # Locate menu link with retries + all_menu_sections = [] menu_link = find_menu_link(client, session_id) if menu_link == NO_MENU_LINK_FOUND: logger.error("Could not find menu link after multiple attempts.") @@ -91,9 +92,9 @@ def main(): # Extract menu from all sections all_menu_sections = extract_menu_from_sections(client, session_id, page, sections) - # Save combined menu data to JSON file - if all_menu_sections: - save_menu_to_json(website_url, all_menu_sections) + # Save combined menu data to JSON file + if all_menu_sections: + save_menu_to_json(website_url, all_menu_sections) browser.close() diff --git a/python/menu-dynamic-extraction-demo/pyproject.toml b/python/menu-dynamic-extraction-demo/pyproject.toml index 0e0bb560..262deef4 100644 --- a/python/menu-dynamic-extraction-demo/pyproject.toml +++ b/python/menu-dynamic-extraction-demo/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "restaurant-demo" +name = "menu-dynamic-extraction-demo" version = "0.1.0" description = "Restaurant menu extraction using Stagehand and Browserbase" readme = "README.md" diff --git a/python/menu-dynamic-extraction-demo/scraper.py b/python/menu-dynamic-extraction-demo/scraper.py index 4eaad1de..9e3cc7cb 100644 --- a/python/menu-dynamic-extraction-demo/scraper.py +++ b/python/menu-dynamic-extraction-demo/scraper.py @@ -192,6 +192,7 @@ def process_restaurant(website_url: str, agent_id: int) -> Dict[str, Any]: close_popups(client, session_id, agent_logger) # Extract menu data + all_menu_sections = [] menu_link = find_menu_link(client, session_id) if 
menu_link == NO_MENU_LINK_FOUND: agent_logger.warning("Could not find menu link") diff --git a/python/menu-dynamic-extraction-demo/websites.txt.example b/python/menu-dynamic-extraction-demo/websites.txt.example new file mode 100644 index 00000000..16953ad8 --- /dev/null +++ b/python/menu-dynamic-extraction-demo/websites.txt.example @@ -0,0 +1,15 @@ +# Restaurant Menu Extraction - Batch Processing URLs +# +# Instructions: +# 1. Copy this file to websites.txt +# 2. Add one restaurant URL per line +# 3. Lines starting with # are treated as comments and ignored +# 4. Run: python main.py --batch +# +# Example URLs: + +https://www.thetailorssonsf.com/ +https://www.thegrovesf.com/ +https://www.nopalitosf.com/ + +# Add more restaurant URLs below: From 472f395a8b4cc302b6032a90d02ac18ee57ade90 Mon Sep 17 00:00:00 2001 From: Alex Qiu Date: Tue, 17 Feb 2026 09:50:26 -0800 Subject: [PATCH 3/5] change env var to MODEL_API_KEY everywhere --- python/menu-dynamic-extraction-demo/.env.example | 6 +++--- python/menu-dynamic-extraction-demo/README.md | 4 ++-- python/menu-dynamic-extraction-demo/config.py | 7 +++---- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/python/menu-dynamic-extraction-demo/.env.example b/python/menu-dynamic-extraction-demo/.env.example index 9afbafcc..8384030d 100644 --- a/python/menu-dynamic-extraction-demo/.env.example +++ b/python/menu-dynamic-extraction-demo/.env.example @@ -3,6 +3,6 @@ BROWSERBASE_PROJECT_ID=your_browserbase_project_id BROWSERBASE_API_KEY=your_browserbase_api_key -# Google API key (required for Stagehand with Gemini models) -# Get your key from https://aistudio.google.com/apikey -GOOGLE_API_KEY=your_google_api_key +# Model API key (required for Stagehand) +# For Google Gemini models, get your key from https://aistudio.google.com/apikey +MODEL_API_KEY=your_model_api_key diff --git a/python/menu-dynamic-extraction-demo/README.md b/python/menu-dynamic-extraction-demo/README.md index 6f2d7e28..09863317 100644 --- 
a/python/menu-dynamic-extraction-demo/README.md +++ b/python/menu-dynamic-extraction-demo/README.md @@ -44,7 +44,7 @@ 4. Add required API keys to .env: - `BROWSERBASE_PROJECT_ID` - Get from https://www.browserbase.com/settings - `BROWSERBASE_API_KEY` - Get from https://www.browserbase.com/settings - - `GOOGLE_API_KEY` - Get from https://aistudio.google.com/apikey + - `MODEL_API_KEY` - Get from https://aistudio.google.com/apikey (for Google Gemini) 5. Run the script: ```bash python main.py @@ -85,7 +85,7 @@ INFO: Session closed successfully ## COMMON PITFALLS - "ModuleNotFoundError: No module named 'stagehand'": Ensure you installed dependencies with `uv pip install -e .` or `pip install -e .` -- Missing API keys: Verify .env contains BROWSERBASE_PROJECT_ID, BROWSERBASE_API_KEY, and GOOGLE_API_KEY +- Missing API keys: Verify .env contains BROWSERBASE_PROJECT_ID, BROWSERBASE_API_KEY, and MODEL_API_KEY - "Could not find menu link after multiple attempts": The restaurant website may have an unusual structure. Try manually checking if there's a clear "Menu" link. Increase MAX_RETRIES in config.py if needed. - Popup/modal blocking: The script attempts to close popups automatically, but some sites have persistent overlays. Check the Browserbase live view link to debug. - Empty extraction results: Some restaurant sites load menus dynamically or via iframes. The script skips iframe links automatically but may need manual adjustment for special cases. 
diff --git a/python/menu-dynamic-extraction-demo/config.py b/python/menu-dynamic-extraction-demo/config.py index cef7588d..6b624ac2 100644 --- a/python/menu-dynamic-extraction-demo/config.py +++ b/python/menu-dynamic-extraction-demo/config.py @@ -12,14 +12,13 @@ load_dotenv() # API Keys -GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") # Google API key for Gemini models -MODEL_API_KEY = GOOGLE_API_KEY # Alias for compatibility +MODEL_API_KEY = os.getenv("MODEL_API_KEY") # API key for LLM provider (e.g., Google Gemini) BROWSERBASE_API_KEY = os.getenv("BROWSERBASE_API_KEY") BROWSERBASE_PROJECT_ID = os.getenv("BROWSERBASE_PROJECT_ID") # Validate required environment variables -if not GOOGLE_API_KEY: - raise ValueError("GOOGLE_API_KEY environment variable is required. Get one at https://aistudio.google.com/apikey") +if not MODEL_API_KEY: + raise ValueError("MODEL_API_KEY environment variable is required. For Google Gemini, get one at https://aistudio.google.com/apikey") if not BROWSERBASE_API_KEY: raise ValueError("BROWSERBASE_API_KEY environment variable is required. 
Get one at https://www.browserbase.com/settings") if not BROWSERBASE_PROJECT_ID: From 751bd4affa57e34e02104ab08d6707a9bd89f605 Mon Sep 17 00:00:00 2001 From: Alex Qiu Date: Tue, 17 Feb 2026 09:53:46 -0800 Subject: [PATCH 4/5] stagehand use default model config, not gemini specifically --- python/menu-dynamic-extraction-demo/main.py | 1 - python/menu-dynamic-extraction-demo/scraper.py | 5 ----- 2 files changed, 6 deletions(-) diff --git a/python/menu-dynamic-extraction-demo/main.py b/python/menu-dynamic-extraction-demo/main.py index ebab1148..006fed53 100644 --- a/python/menu-dynamic-extraction-demo/main.py +++ b/python/menu-dynamic-extraction-demo/main.py @@ -75,7 +75,6 @@ def main(): client.sessions.act( id=session_id, input=f"Click on: {menu_link[0] if isinstance(menu_link, list) else menu_link}", - options={"model": {"modelName": "google/gemini-2.5-flash"}}, ) page.wait_for_load_state("load", timeout=20000) diff --git a/python/menu-dynamic-extraction-demo/scraper.py b/python/menu-dynamic-extraction-demo/scraper.py index 9e3cc7cb..e3bf9e57 100644 --- a/python/menu-dynamic-extraction-demo/scraper.py +++ b/python/menu-dynamic-extraction-demo/scraper.py @@ -38,7 +38,6 @@ def close_popups(client: Stagehand, session_id: str, log: logging.Logger = logge client.sessions.act( id=session_id, input="Close any popups, modals, or cookie notices that are blocking the page", - options={"model": {"modelName": "google/gemini-2.5-flash"}}, ) log.info("Successfully closed popups/modals") return True @@ -70,7 +69,6 @@ def find_menu_link(client: Stagehand, session_id: str, max_retries: int = MAX_RE response = client.sessions.observe( id=session_id, instruction=instruction, - options={"model": {"modelName": "google/gemini-2.5-flash"}}, ) return response.data.result except Exception as e: @@ -112,7 +110,6 @@ def extract_menu_from_sections( client.sessions.act( id=session_id, input=f"Navigate to: {section_desc}", - options={"model": {"modelName": "google/gemini-2.5-flash"}}, ) 
page.wait_for_load_state("load", timeout=20000) @@ -125,7 +122,6 @@ def extract_menu_from_sections( "For each item, extract the name, description, and price. " "Preserve price formatting exactly as written.", schema=MENU_SCHEMA, - options={"model": {"modelName": "google/gemini-2.5-flash"}}, ) logger.info(f"Menu data extracted for {section_desc}") @@ -203,7 +199,6 @@ def process_restaurant(website_url: str, agent_id: int) -> Dict[str, Any]: client.sessions.act( id=session_id, input=f"Click on: {menu_link[0] if isinstance(menu_link, list) else menu_link}", - options={"model": {"modelName": "google/gemini-2.5-flash"}}, ) page.wait_for_load_state("load", timeout=20000) From ec1d6102f79651c804386e85b9d56bf91d958abb Mon Sep 17 00:00:00 2001 From: Alex Qiu Date: Tue, 17 Feb 2026 10:41:20 -0800 Subject: [PATCH 5/5] fix browser session management --- python/menu-dynamic-extraction-demo/README.md | 2 +- python/menu-dynamic-extraction-demo/config.py | 4 - python/menu-dynamic-extraction-demo/main.py | 99 +++++++++---------- .../pyproject.toml | 4 +- .../menu-dynamic-extraction-demo/scraper.py | 93 ++++++++--------- 5 files changed, 87 insertions(+), 115 deletions(-) diff --git a/python/menu-dynamic-extraction-demo/README.md b/python/menu-dynamic-extraction-demo/README.md index 09863317..170d1053 100644 --- a/python/menu-dynamic-extraction-demo/README.md +++ b/python/menu-dynamic-extraction-demo/README.md @@ -84,7 +84,7 @@ INFO: Session closed successfully ## COMMON PITFALLS -- "ModuleNotFoundError: No module named 'stagehand'": Ensure you installed dependencies with `uv pip install -e .` or `pip install -e .` +- "ModuleNotFoundError: No module named 'stagehand'": Ensure you installed dependencies with `uv pip install -e .` or `pip install -e .`. Note: Playwright is not required as Stagehand manages the browser automatically. 
- Missing API keys: Verify .env contains BROWSERBASE_PROJECT_ID, BROWSERBASE_API_KEY, and MODEL_API_KEY - "Could not find menu link after multiple attempts": The restaurant website may have an unusual structure. Try manually checking if there's a clear "Menu" link. Increase MAX_RETRIES in config.py if needed. - Popup/modal blocking: The script attempts to close popups automatically, but some sites have persistent overlays. Check the Browserbase live view link to debug. diff --git a/python/menu-dynamic-extraction-demo/config.py b/python/menu-dynamic-extraction-demo/config.py index 6b624ac2..cd5f805a 100644 --- a/python/menu-dynamic-extraction-demo/config.py +++ b/python/menu-dynamic-extraction-demo/config.py @@ -6,7 +6,6 @@ import os import logging from dotenv import load_dotenv -from browserbase import Browserbase # Load environment variables from .env file load_dotenv() @@ -39,6 +38,3 @@ format="%(asctime)s %(levelname)s %(name)s: %(message)s", ) logger = logging.getLogger(__name__) - -# Initialize Browserbase client -bb = Browserbase(api_key=BROWSERBASE_API_KEY) diff --git a/python/menu-dynamic-extraction-demo/main.py b/python/menu-dynamic-extraction-demo/main.py index 006fed53..f017d550 100644 --- a/python/menu-dynamic-extraction-demo/main.py +++ b/python/menu-dynamic-extraction-demo/main.py @@ -15,14 +15,12 @@ python main.py --batch # Batch mode - processes URLs from websites.txt """ -from playwright.sync_api import sync_playwright from stagehand import Stagehand from config import ( BROWSERBASE_API_KEY, BROWSERBASE_PROJECT_ID, MODEL_API_KEY, NO_MENU_LINK_FOUND, - bb, logger ) from models import MENU_SCHEMA @@ -32,10 +30,6 @@ def main(): """Main function for interactive single-restaurant extraction.""" - # Create Browserbase session - session = bb.sessions.create(project_id=BROWSERBASE_PROJECT_ID) - session_id = session.id - # Initialize Stagehand client client = Stagehand( browserbase_api_key=BROWSERBASE_API_KEY, @@ -43,59 +37,56 @@ def main(): 
model_api_key=MODEL_API_KEY, ) + stagehand_session = client.sessions.start( + model_name="google/gemini-2.5-flash", + ) + session_id = stagehand_session.data.session_id logger.info(f"Session started: {session_id}") logger.info(f"Watch live: https://browserbase.com/sessions/{session_id}") try: - # Connect Playwright to Browserbase - with sync_playwright() as p: - browser = p.chromium.connect_over_cdp( - f"wss://connect.browserbase.com?apiKey={BROWSERBASE_API_KEY}&sessionId={session_id}" + # Get website URL from user + website_url = normalize_url(get_website_from_user()) + logger.info(f"Navigating to {website_url} ...") + + # Navigate to website using Stagehand + client.sessions.navigate( + id=session_id, + url=website_url, + ) + + # Close any popups + close_popups(client, session_id) + + # Locate menu link with retries + all_menu_sections = [] + menu_link = find_menu_link(client, session_id) + if menu_link == NO_MENU_LINK_FOUND: + logger.error("Could not find menu link after multiple attempts.") + else: + logger.info(f"Menu link found: {menu_link}") + + # Navigate to menu + client.sessions.act( + id=session_id, + input=f"Click on: {menu_link[0] if isinstance(menu_link, list) else menu_link}", + ) + + # Find menu subsections + sections_response = client.sessions.observe( + id=session_id, + instruction="Find all subsections on the current menu page, i.e. 'Lunch', 'Dinner', 'Happy Hour', etc. " + "Return them as a list of links. If none found, return the current page link only in a list. 
" + "Do not return duplicates if a link appears multiple times.", ) - ctx = browser.contexts[0] - page = ctx.pages[0] if ctx.pages else ctx.new_page() - - # Get website URL from user - website_url = normalize_url(get_website_from_user()) - logger.info(f"Navigating to {website_url} ...") - page.goto(website_url, wait_until="domcontentloaded") - - # Close any popups - close_popups(client, session_id) - - # Locate menu link with retries - all_menu_sections = [] - menu_link = find_menu_link(client, session_id) - if menu_link == NO_MENU_LINK_FOUND: - logger.error("Could not find menu link after multiple attempts.") - else: - logger.info(f"Menu link found: {menu_link}") - - # Navigate to menu - client.sessions.act( - id=session_id, - input=f"Click on: {menu_link[0] if isinstance(menu_link, list) else menu_link}", - ) - - page.wait_for_load_state("load", timeout=20000) - - # Find menu subsections - sections_response = client.sessions.observe( - id=session_id, - instruction="Find all subsections on the current menu page, i.e. 'Lunch', 'Dinner', 'Happy Hour', etc. " - "Return them as a list of links. If none found, return the current page link only in a list. 
" - "Do not return duplicates if a link appears multiple times.", - ) - sections = sections_response.data.result - - # Extract menu from all sections - all_menu_sections = extract_menu_from_sections(client, session_id, page, sections) - - # Save combined menu data to JSON file - if all_menu_sections: - save_menu_to_json(website_url, all_menu_sections) - - browser.close() + sections = sections_response.data.result + + # Extract menu from all sections + all_menu_sections = extract_menu_from_sections(client, session_id, sections) + + # Save combined menu data to JSON file + if all_menu_sections: + save_menu_to_json(website_url, all_menu_sections) finally: # End session diff --git a/python/menu-dynamic-extraction-demo/pyproject.toml b/python/menu-dynamic-extraction-demo/pyproject.toml index 262deef4..ebfeafba 100644 --- a/python/menu-dynamic-extraction-demo/pyproject.toml +++ b/python/menu-dynamic-extraction-demo/pyproject.toml @@ -5,11 +5,9 @@ description = "Restaurant menu extraction using Stagehand and Browserbase" readme = "README.md" requires-python = ">=3.9" dependencies = [ - "browserbase>=1.4.0", - "playwright>=1.40.0", "pydantic>=2.0.0", "python-dotenv>=1.2.1", - "stagehand>=3.0.0", # v3 API - pure API client + "stagehand>=3.0.0", # v3 API - manages Browserbase sessions internally ] [project.optional-dependencies] diff --git a/python/menu-dynamic-extraction-demo/scraper.py b/python/menu-dynamic-extraction-demo/scraper.py index e3bf9e57..8e0875a3 100644 --- a/python/menu-dynamic-extraction-demo/scraper.py +++ b/python/menu-dynamic-extraction-demo/scraper.py @@ -7,7 +7,6 @@ import logging from typing import List, Dict, Any, Optional from datetime import datetime -from playwright.sync_api import sync_playwright, Page from stagehand import Stagehand from config import ( BROWSERBASE_API_KEY, @@ -15,7 +14,6 @@ MODEL_API_KEY, NO_MENU_LINK_FOUND, MAX_RETRIES, - bb, logger ) from models import MENU_SCHEMA @@ -80,7 +78,6 @@ def find_menu_link(client: Stagehand, 
session_id: str, max_retries: int = MAX_RE def extract_menu_from_sections( client: Stagehand, session_id: str, - page: Page, sections: List[Any] ) -> List[Dict[str, Any]]: """ @@ -89,7 +86,6 @@ def extract_menu_from_sections( Args: client: Stagehand client instance session_id: Active session ID - page: Playwright page instance sections: List of menu sections to extract Returns: @@ -112,8 +108,6 @@ def extract_menu_from_sections( input=f"Navigate to: {section_desc}", ) - page.wait_for_load_state("load", timeout=20000) - # Extract menu data extract_response = client.sessions.extract( id=session_id, @@ -157,10 +151,6 @@ def process_restaurant(website_url: str, agent_id: int) -> Dict[str, Any]: "error": None, } - # Create Browserbase session - session = bb.sessions.create(project_id=BROWSERBASE_PROJECT_ID) - session_id = session.id - # Initialize Stagehand client client = Stagehand( browserbase_api_key=BROWSERBASE_API_KEY, @@ -168,57 +158,54 @@ def process_restaurant(website_url: str, agent_id: int) -> Dict[str, Any]: model_api_key=MODEL_API_KEY, ) + # Start Stagehand session + stagehand_session = client.sessions.start( + model_name="google/gemini-2.5-flash", + ) + session_id = stagehand_session.data.session_id + agent_logger.info(f"Session started: {session_id}") agent_logger.info(f"Watch live: https://browserbase.com/sessions/{session_id}") try: - # Connect Playwright to Browserbase - with sync_playwright() as p: - browser = p.chromium.connect_over_cdp( - f"wss://connect.browserbase.com?apiKey={BROWSERBASE_API_KEY}&sessionId={session_id}" - ) - ctx = browser.contexts[0] - page = ctx.pages[0] if ctx.pages else ctx.new_page() + # Navigate to website using Stagehand + agent_logger.info(f"Navigating to {website_url}") + client.sessions.act( + id=session_id, + input=f"Go to {website_url}", + ) - # Navigate to website - agent_logger.info(f"Navigating to {website_url}") - page.goto(website_url, wait_until="domcontentloaded") + # Close any popups on initial page load + 
close_popups(client, session_id, agent_logger) - # Close any popups on initial page load + # Extract menu data + all_menu_sections = [] + menu_link = find_menu_link(client, session_id) + if menu_link == NO_MENU_LINK_FOUND: + agent_logger.warning("Could not find menu link") + else: + agent_logger.info(f"Menu link: {menu_link}") + + # Navigate to menu link + client.sessions.act( + id=session_id, + input=f"Click on: {menu_link[0] if isinstance(menu_link, list) else menu_link}", + ) + + # Close any popups after navigating to menu page close_popups(client, session_id, agent_logger) - # Extract menu data - all_menu_sections = [] - menu_link = find_menu_link(client, session_id) - if menu_link == NO_MENU_LINK_FOUND: - agent_logger.warning("Could not find menu link") - else: - agent_logger.info(f"Menu link: {menu_link}") - - # Navigate to menu link - client.sessions.act( - id=session_id, - input=f"Click on: {menu_link[0] if isinstance(menu_link, list) else menu_link}", - ) - - page.wait_for_load_state("load", timeout=20000) - - # Close any popups after navigating to menu page - close_popups(client, session_id, agent_logger) - - # Extract menu sections - sections_response = client.sessions.observe( - id=session_id, - instruction="Find all subsections on the current menu page, i.e. 'Lunch', 'Dinner', 'Happy Hour', etc. " - "Return them as a list of links. If none found, return the current page link only in a list. " - "Do not return duplicates if a link appears multiple times.", - ) - sections = sections_response.data.result - - # Extract menu from all sections - all_menu_sections = extract_menu_from_sections(client, session_id, page, sections) - - browser.close() + # Extract menu sections + sections_response = client.sessions.observe( + id=session_id, + instruction="Find all subsections on the current menu page, i.e. 'Lunch', 'Dinner', 'Happy Hour', etc. " + "Return them as a list of links. If none found, return the current page link only in a list. 
" + "Do not return duplicates if a link appears multiple times.", + ) + sections = sections_response.data.result + + # Extract menu from all sections + all_menu_sections = extract_menu_from_sections(client, session_id, sections) result["status"] = "success" end_time = datetime.now()