diff --git a/python/menu-dynamic-extraction-demo/.env.example b/python/menu-dynamic-extraction-demo/.env.example new file mode 100644 index 00000000..8384030d --- /dev/null +++ b/python/menu-dynamic-extraction-demo/.env.example @@ -0,0 +1,8 @@ +# Browserbase credentials (required) +# Get these from https://www.browserbase.com/settings +BROWSERBASE_PROJECT_ID=your_browserbase_project_id +BROWSERBASE_API_KEY=your_browserbase_api_key + +# Model API key (required for Stagehand) +# For Google Gemini models, get your key from https://aistudio.google.com/apikey +MODEL_API_KEY=your_model_api_key diff --git a/python/menu-dynamic-extraction-demo/.gitignore b/python/menu-dynamic-extraction-demo/.gitignore new file mode 100644 index 00000000..a95c2797 --- /dev/null +++ b/python/menu-dynamic-extraction-demo/.gitignore @@ -0,0 +1,34 @@ +# Environment variables (CRITICAL - contains API keys) +.env + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +venv/ +ENV/ +env/ +*.egg-info/ +dist/ +build/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +.DS_Store + +# Output directories +results/ +output/ +*.json +*.csv + +# Logs +*.log + +# Jupyter +.ipynb_checkpoints/ diff --git a/python/menu-dynamic-extraction-demo/README.md b/python/menu-dynamic-extraction-demo/README.md new file mode 100644 index 00000000..170d1053 --- /dev/null +++ b/python/menu-dynamic-extraction-demo/README.md @@ -0,0 +1,121 @@ +# Stagehand + Browserbase: Restaurant Menu Extractor + +## AT A GLANCE + +- **Goal**: Automate restaurant menu extraction from websites using AI-powered browser automation to scrape menu items, prices, descriptions, and categories. +- **Pattern Template**: Demonstrates web scraping with Stagehand's observe/act/extract pattern for navigating complex restaurant websites and parsing menu structures. +- **One script, many websites**: Stagehand can adapt to different webpage layouts with same core script thanks to its LLM-powered primitives. 
+- **Workflow**: Stagehand navigates to restaurant website, finds menu links using observe, extracts structured data with Pydantic schemas, handles multi-section menus (lunch/dinner/drinks), and outputs JSON results. +- **Multi-Section Support**: Automatically detects menu subsections (Lunch, Dinner, Happy Hour, etc.) and extracts each separately for comprehensive coverage. +- **Production-Ready**: Includes retry logic, popup handling, logging, error recovery, and parallel processing capabilities for batch extraction. +- Docs → [Stagehand Act](https://docs.stagehand.dev/basics/act) | [Stagehand Observe](https://docs.stagehand.dev/basics/observe) | [Stagehand Extract](https://docs.stagehand.dev/basics/extract) + +## GLOSSARY + +- **observe**: Find and return interactive elements on the page matching a description without performing actions. Used here to locate menu links and subsections. + Docs → https://docs.stagehand.dev/basics/observe +- **act**: Perform UI actions from natural language prompts (click buttons, navigate links). Used to click menu links discovered via observe. + Docs → https://docs.stagehand.dev/basics/act +- **extract**: Pull structured data from web pages using natural language instructions and Pydantic schemas. Ensures menu data is consistently formatted. + Docs → https://docs.stagehand.dev/basics/extract +- **Pydantic schemas**: Type-safe data models that define the structure of extracted menu data (sections, categories, items, prices). + Docs → https://docs.pydantic.dev/ +- **BYOB (Bring Your Own Browser)**: Run Stagehand sessions on Browserbase's cloud infrastructure for reliability, scalability, and live debugging. + Docs → https://docs.browserbase.com + +## QUICKSTART + +1. cd menu-dynamic-extraction-demo +2. Install dependencies with uv: + + ```bash + uv pip install -e . + ``` + + Alternatively, use pip/ pip3: + + ```bash + python -m venv venv + source venv/bin/activate # On Windows: venv\Scripts\activate + pip install -e . + ``` + +3. 
cp .env.example .env +4. Add required API keys to .env: + - `BROWSERBASE_PROJECT_ID` - Get from https://www.browserbase.com/settings + - `BROWSERBASE_API_KEY` - Get from https://www.browserbase.com/settings + - `MODEL_API_KEY` - Get from https://aistudio.google.com/apikey (for Google Gemini) +5. Run the script: + ```bash + python main.py + ``` + The script will prompt you for a restaurant website URL. + Some of our favorites here in SF include https://www.thetailorssonsf.com/, https://www.thegrovesf.com/, and https://www.nopalitosf.com/. + + For batch processing multiple restaurants: + ```bash + python main.py --batch + ``` + Create a `websites.txt` file with one URL per line (see websites.txt.example). + +## EXPECTED OUTPUT + +- Prompts for restaurant website URL input +- Initializes Stagehand session with Browserbase (verbose logging shows browser actions) +- Navigates to the restaurant website and attempts to close any popups/modals +- Uses observe to find the menu link (retries up to 3 times if needed) +- Clicks the menu link and navigates to menu page +- Detects all menu subsections (Lunch, Dinner, Drinks, etc.) via observe +- For each subsection: + - Navigates to that section + - Extracts structured menu data: sections → categories → items (name, description, price) +- Saves all extraction results to timestamped JSON files in the `results/` directory +- Session closes cleanly after extraction completes + +Example log output: +``` +INFO: Navigating to https://example-restaurant.com ... +INFO: Menu link found: ['https://example-restaurant.com/menu'] +INFO: Navigating to menu section: Lunch Menu ... +INFO: Extracting menu section: Lunch Menu +INFO: Navigating to menu section: Dinner Menu ... +INFO: Session closed successfully +``` + +## COMMON PITFALLS + +- "ModuleNotFoundError: No module named 'stagehand'": Ensure you installed dependencies with `uv pip install -e .` or `pip install -e .`. 
Note: Playwright is not required as Stagehand manages the browser automatically. +- Missing API keys: Verify .env contains BROWSERBASE_PROJECT_ID, BROWSERBASE_API_KEY, and MODEL_API_KEY +- "Could not find menu link after multiple attempts": The restaurant website may have an unusual structure. Try manually checking if there's a clear "Menu" link. Increase MAX_RETRIES in config.py if needed. +- Popup/modal blocking: The script attempts to close popups automatically, but some sites have persistent overlays. Check the Browserbase live view link to debug. +- Empty extraction results: Some restaurant sites load menus dynamically or via iframes. The script skips iframe links automatically but may need manual adjustment for special cases. +- Detailed logging: The script logs INFO level by default. Set LOG_LEVEL=WARNING in .env for quieter output, or LOG_LEVEL=DEBUG for more verbose logging. +- Find more information on your Browserbase dashboard → https://www.browserbase.com/sign-in + +## USE CASES + +• **Restaurant data aggregation**: Build a database of restaurant menus across multiple locations for food delivery or review platforms. +• **Menu price comparison**: Track menu prices over time to detect price changes or compare pricing across restaurant chains. +• **Dietary restriction filtering**: Extract menu items and descriptions to identify vegan, gluten-free, or allergen-friendly options automatically. +• **Recipe inspiration**: Collect menu descriptions to analyze trending ingredients, flavor combinations, or plating techniques. + +## LIMITATIONS +• **PDF menu support**: Some restaurants use PDF menus. Enhance extraction to handle PDF downloads and OCR if needed. + +## NEXT STEPS + +• **Parallel batch processing**: Enhance batch processing to use asyncio workers for concurrent extraction across multiple restaurants (currently processes sequentially). 
+• **Output to database**: Extend the script to save extracted menus to PostgreSQL, MongoDB, or Airtable for persistent storage and querying. +• **Restaurant info extraction**: Expand to extract contact details (phone, email, hours, address) in addition to menu data. +• **Incremental updates**: Track previously extracted menus and only re-scrape when website content has changed (use checksums or last-modified headers). +• **PDF menu support**: Add support for restaurants that use PDF menus instead of web pages. + +## HELPFUL RESOURCES + +📚 Stagehand Docs: https://docs.stagehand.dev/v3/first-steps/introduction +📚 Python SDK: https://docs.stagehand.dev/v3/sdk/python +🎮 Browserbase: https://www.browserbase.com +💡 Try it out: https://www.browserbase.com/playground +🔧 Templates: https://www.browserbase.com/templates +📧 Need help? support@browserbase.com +💬 Discord: http://stagehand.dev/discord diff --git a/python/menu-dynamic-extraction-demo/config.py b/python/menu-dynamic-extraction-demo/config.py new file mode 100644 index 00000000..cd5f805a --- /dev/null +++ b/python/menu-dynamic-extraction-demo/config.py @@ -0,0 +1,40 @@ +# Stagehand + Browserbase: Restaurant Menu Extractor - Configuration +# See README.md for full documentation + +"""Configuration and environment variables for the restaurant scraper.""" + +import os +import logging +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# API Keys +MODEL_API_KEY = os.getenv("MODEL_API_KEY") # API key for LLM provider (e.g., Google Gemini) +BROWSERBASE_API_KEY = os.getenv("BROWSERBASE_API_KEY") +BROWSERBASE_PROJECT_ID = os.getenv("BROWSERBASE_PROJECT_ID") + +# Validate required environment variables +if not MODEL_API_KEY: + raise ValueError("MODEL_API_KEY environment variable is required. For Google Gemini, get one at https://aistudio.google.com/apikey") +if not BROWSERBASE_API_KEY: + raise ValueError("BROWSERBASE_API_KEY environment variable is required. 
Get one at https://www.browserbase.com/settings") +if not BROWSERBASE_PROJECT_ID: + raise ValueError("BROWSERBASE_PROJECT_ID environment variable is required. Get one at https://www.browserbase.com/settings") + +# File paths +WEBSITES_FILE = os.getenv("WEBSITES_FILE", "websites.txt") +OUTPUT_DIR = "results" + +# Scraper settings +NO_MENU_LINK_FOUND = "NO_MENU_LINK_FOUND" +MAX_RETRIES = 3 + +# Logging +LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper() +logging.basicConfig( + level=LOG_LEVEL, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", +) +logger = logging.getLogger(__name__) diff --git a/python/menu-dynamic-extraction-demo/main.py b/python/menu-dynamic-extraction-demo/main.py new file mode 100644 index 00000000..f017d550 --- /dev/null +++ b/python/menu-dynamic-extraction-demo/main.py @@ -0,0 +1,138 @@ +# Stagehand + Browserbase: Restaurant Menu Extractor +# See README.md for full documentation + +""" +Main entrypoint for restaurant menu extraction. + +This script uses Stagehand + Browserbase to automatically: +1. Navigate to restaurant websites +2. Find and click menu links +3. Extract structured menu data (sections, categories, items) +4. 
Save results to JSON files + +Usage: + python main.py # Interactive mode - prompts for URL + python main.py --batch # Batch mode - processes URLs from websites.txt +""" + +from stagehand import Stagehand +from config import ( + BROWSERBASE_API_KEY, + BROWSERBASE_PROJECT_ID, + MODEL_API_KEY, + NO_MENU_LINK_FOUND, + logger +) +from models import MENU_SCHEMA +from utils import normalize_url, get_website_from_user, load_websites_from_file, save_menu_to_json +from scraper import close_popups, find_menu_link, extract_menu_from_sections, process_restaurant + + +def main(): + """Main function for interactive single-restaurant extraction.""" + # Initialize Stagehand client + client = Stagehand( + browserbase_api_key=BROWSERBASE_API_KEY, + browserbase_project_id=BROWSERBASE_PROJECT_ID, + model_api_key=MODEL_API_KEY, + ) + + stagehand_session = client.sessions.start( + model_name="google/gemini-2.5-flash", + ) + session_id = stagehand_session.data.session_id + logger.info(f"Session started: {session_id}") + logger.info(f"Watch live: https://browserbase.com/sessions/{session_id}") + + try: + # Get website URL from user + website_url = normalize_url(get_website_from_user()) + logger.info(f"Navigating to {website_url} ...") + + # Navigate to website using Stagehand + client.sessions.navigate( + id=session_id, + url=website_url, + ) + + # Close any popups + close_popups(client, session_id) + + # Locate menu link with retries + all_menu_sections = [] + menu_link = find_menu_link(client, session_id) + if menu_link == NO_MENU_LINK_FOUND: + logger.error("Could not find menu link after multiple attempts.") + else: + logger.info(f"Menu link found: {menu_link}") + + # Navigate to menu + client.sessions.act( + id=session_id, + input=f"Click on: {menu_link[0] if isinstance(menu_link, list) else menu_link}", + ) + + # Find menu subsections + sections_response = client.sessions.observe( + id=session_id, + instruction="Find all subsections on the current menu page, i.e. 
'Lunch', 'Dinner', 'Happy Hour', etc. "
+                "Return them as a list of links. If none found, return the current page link only in a list. "
+                "Do not return duplicates if a link appears multiple times.",
+            )
+            sections = sections_response.data.result
+
+            # Extract menu from all sections
+            all_menu_sections = extract_menu_from_sections(client, session_id, sections)
+
+        # Save combined menu data to JSON file
+        if all_menu_sections:
+            save_menu_to_json(website_url, all_menu_sections)
+
+    finally:
+        # End session
+        client.sessions.end(id=session_id)
+        logger.info("Session closed successfully")
+
+
+def batch_process():
+    """
+    Process multiple restaurant websites sequentially.
+    URLs are loaded from WEBSITES_FILE (default: websites.txt).
+
+    Example usage:
+        Create websites.txt with one URL per line:
+            https://www.restaurant1.com
+            https://www.restaurant2.com
+            # This is a comment
+            https://www.restaurant3.com
+    """
+    websites = load_websites_from_file()
+    if not websites:
+        logger.error("No websites to process")
+        return
+
+    logger.info(f"Starting batch processing of {len(websites)} websites")
+
+    # Process all restaurants sequentially (sync version)
+    results = []
+    for idx, url in enumerate(websites, start=1):
+        result = process_restaurant(url, agent_id=idx)
+        results.append(result)
+
+    # Summary
+    successful = sum(1 for r in results if r["status"] == "success")
+    failed = len(results) - successful
+    logger.info(f"\n{'='*60}")
+    logger.info(f"Batch processing complete!")
+    logger.info(f"Total: {len(results)} | Success: {successful} | Failed: {failed}")
+    logger.info(f"{'='*60}\n")
+
+
+if __name__ == "__main__":
+    import sys
+
+    # Simple CLI argument handling
+    if len(sys.argv) > 1 and sys.argv[1] == "--batch":
+        batch_process()
+    else:
+        main()
diff --git a/python/menu-dynamic-extraction-demo/models.py b/python/menu-dynamic-extraction-demo/models.py
new file mode 100644
index 00000000..3eb99f98
--- /dev/null
+++ b/python/menu-dynamic-extraction-demo/models.py
@@ -0,0
+1,98 @@ +# Stagehand + Browserbase: Restaurant Menu Extractor - Data Models +# See README.md for full documentation + +"""Pydantic models and JSON schemas for menu extraction.""" + +from typing import Optional, List +from pydantic import BaseModel, Field + + +class MenuItem(BaseModel): + name: str + description: Optional[str] = None + price: Optional[str] = None + + +class MenuCategory(BaseModel): + """ + A category within a section. + e.g., "Antipasti", "Pizza", "Pasta" + """ + category_name: str + items: List[MenuItem] + + +class MenuSection(BaseModel): + """ + A full menu section, e.g., "Lunch", "Dinner", "Dessert". + Each section contains its own categories. + """ + section_name: str + categories: List[MenuCategory] + + +class Menu(BaseModel): + """ + The full restaurant menu. + Compatible with restaurants with multiple menu pages or subsections. + """ + sections: List[MenuSection] + + +# Manual JSON schema for Gemini API compatibility (avoids Pydantic's $defs) +MENU_SCHEMA = { + "type": "object", + "properties": { + "sections": { + "type": "array", + "description": "Menu sections (e.g., Lunch, Dinner, Dessert)", + "items": { + "type": "object", + "properties": { + "section_name": { + "type": "string", + "description": "Name of the menu section" + }, + "categories": { + "type": "array", + "description": "Categories within this section", + "items": { + "type": "object", + "properties": { + "category_name": { + "type": "string", + "description": "Name of the category (e.g., Appetizers, Entrees)" + }, + "items": { + "type": "array", + "description": "Menu items in this category", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Item name" + }, + "description": { + "type": "string", + "description": "Item description" + }, + "price": { + "type": "string", + "description": "Item price" + } + }, + "required": ["name"] + } + } + }, + "required": ["category_name", "items"] + } + } + }, + "required": ["section_name", 
"categories"] + } + } + }, + "required": ["sections"] +} diff --git a/python/menu-dynamic-extraction-demo/pyproject.toml b/python/menu-dynamic-extraction-demo/pyproject.toml new file mode 100644 index 00000000..ebfeafba --- /dev/null +++ b/python/menu-dynamic-extraction-demo/pyproject.toml @@ -0,0 +1,33 @@ +[project] +name = "menu-dynamic-extraction-demo" +version = "0.1.0" +description = "Restaurant menu extraction using Stagehand and Browserbase" +readme = "README.md" +requires-python = ">=3.9" +dependencies = [ + "pydantic>=2.0.0", + "python-dotenv>=1.2.1", + "stagehand>=3.0.0", # v3 API - manages Browserbase sessions internally +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "black>=23.0.0", + "ruff>=0.1.0", +] + +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.black] +line-length = 100 +target-version = ['py39', 'py310', 'py311'] + +[tool.ruff] +line-length = 100 +target-version = "py39" + +[tool.ruff.lint] +select = ["E", "F", "I", "N", "W"] diff --git a/python/menu-dynamic-extraction-demo/scraper.py b/python/menu-dynamic-extraction-demo/scraper.py new file mode 100644 index 00000000..8e0875a3 --- /dev/null +++ b/python/menu-dynamic-extraction-demo/scraper.py @@ -0,0 +1,238 @@ +# Stagehand + Browserbase: Restaurant Menu Extractor - Scraping Logic +# See README.md for full documentation + +"""Core scraping logic for restaurant menu extraction.""" + +import time +import logging +from typing import List, Dict, Any, Optional +from datetime import datetime +from stagehand import Stagehand +from config import ( + BROWSERBASE_API_KEY, + BROWSERBASE_PROJECT_ID, + MODEL_API_KEY, + NO_MENU_LINK_FOUND, + MAX_RETRIES, + logger +) +from models import MENU_SCHEMA +from utils import save_menu_to_json + + +def close_popups(client: Stagehand, session_id: str, log: logging.Logger = logger) -> bool: + """ + Attempt to close popups/modals that might be blocking the page. 
+ + Args: + client: Stagehand client instance + session_id: Active session ID + log: Logger instance + + Returns: + True if popups were closed, False otherwise + """ + try: + client.sessions.act( + id=session_id, + input="Close any popups, modals, or cookie notices that are blocking the page", + ) + log.info("Successfully closed popups/modals") + return True + except Exception as e: + log.debug(f"No popups to close or failed to close: {e}") + return False + + +def find_menu_link(client: Stagehand, session_id: str, max_retries: int = MAX_RETRIES): + """ + Attempt to locate the restaurant's menu link using Stagehand observe. + Retries up to max_retries times if it fails. + + Args: + client: Stagehand client instance + session_id: Active session ID + max_retries: Maximum number of retry attempts + + Returns: + Menu link result or NO_MENU_LINK_FOUND + """ + instruction = ( + "Find the most likely link to the restaurant's menu on this webpage. If the webpage " + "already is the menu page, return the current page URL. Return only the link URL." + ) + + for attempt in range(1, max_retries + 1): + try: + response = client.sessions.observe( + id=session_id, + instruction=instruction, + ) + return response.data.result + except Exception as e: + logger.warning(f"[Attempt {attempt}] Failed: {e}") + time.sleep(1) + return NO_MENU_LINK_FOUND + + +def extract_menu_from_sections( + client: Stagehand, + session_id: str, + sections: List[Any] +) -> List[Dict[str, Any]]: + """ + Extract menu data from all sections. 
+ + Args: + client: Stagehand client instance + session_id: Active session ID + sections: List of menu sections to extract + + Returns: + List of all extracted menu sections + """ + all_menu_sections = [] + + for section in sections: + section_desc = section.get("description", "") if isinstance(section, dict) else str(section) + logger.info(f"Navigating to menu section: {section_desc} ...") + + # Skip iframe links + if "iframe" in section_desc.lower(): + logger.info("Skipping iframe link ...") + continue + + # Navigate to section + client.sessions.act( + id=session_id, + input=f"Navigate to: {section_desc}", + ) + + # Extract menu data + extract_response = client.sessions.extract( + id=session_id, + instruction="Extract the menu organized by sections and categories. " + "Each section contains categories, and each category contains menu items. " + "For each item, extract the name, description, and price. " + "Preserve price formatting exactly as written.", + schema=MENU_SCHEMA, + ) + logger.info(f"Menu data extracted for {section_desc}") + + # Collect the extracted menu data + menu_data = extract_response.data.result + if menu_data and "sections" in menu_data: + all_menu_sections.extend(menu_data["sections"]) + + return all_menu_sections + + +def process_restaurant(website_url: str, agent_id: int) -> Dict[str, Any]: + """ + Web agent that processes a single restaurant website. + This represents a single subprocessor in a production pipeline. 
+ + Args: + website_url: The restaurant website to scrape + agent_id: Unique identifier for this agent instance + + Returns: + Dictionary containing extraction results and metadata + """ + agent_logger = logging.getLogger(f"Agent-{agent_id}") + start_time = datetime.now() + + result = { + "agent_id": agent_id, + "url": website_url, + "status": "pending", + "start_time": start_time.isoformat(), + "menu_data": [], + "error": None, + } + + # Initialize Stagehand client + client = Stagehand( + browserbase_api_key=BROWSERBASE_API_KEY, + browserbase_project_id=BROWSERBASE_PROJECT_ID, + model_api_key=MODEL_API_KEY, + ) + + # Start Stagehand session + stagehand_session = client.sessions.start( + model_name="google/gemini-2.5-flash", + ) + session_id = stagehand_session.data.session_id + + agent_logger.info(f"Session started: {session_id}") + agent_logger.info(f"Watch live: https://browserbase.com/sessions/{session_id}") + + try: + # Navigate to website using Stagehand + agent_logger.info(f"Navigating to {website_url}") + client.sessions.act( + id=session_id, + input=f"Go to {website_url}", + ) + + # Close any popups on initial page load + close_popups(client, session_id, agent_logger) + + # Extract menu data + all_menu_sections = [] + menu_link = find_menu_link(client, session_id) + if menu_link == NO_MENU_LINK_FOUND: + agent_logger.warning("Could not find menu link") + else: + agent_logger.info(f"Menu link: {menu_link}") + + # Navigate to menu link + client.sessions.act( + id=session_id, + input=f"Click on: {menu_link[0] if isinstance(menu_link, list) else menu_link}", + ) + + # Close any popups after navigating to menu page + close_popups(client, session_id, agent_logger) + + # Extract menu sections + sections_response = client.sessions.observe( + id=session_id, + instruction="Find all subsections on the current menu page, i.e. 'Lunch', 'Dinner', 'Happy Hour', etc. " + "Return them as a list of links. If none found, return the current page link only in a list. 
" + "Do not return duplicates if a link appears multiple times.", + ) + sections = sections_response.data.result + + # Extract menu from all sections + all_menu_sections = extract_menu_from_sections(client, session_id, sections) + + result["status"] = "success" + end_time = datetime.now() + result["end_time"] = end_time.isoformat() + result["duration_seconds"] = (end_time - start_time).total_seconds() + agent_logger.info(f"Completed extraction in {result['duration_seconds']:.2f}s") + + # Save combined menu data to JSON file + if all_menu_sections: + save_menu_to_json( + website_url, + all_menu_sections, + agent_id=agent_id, + duration_seconds=result["duration_seconds"] + ) + + except Exception as e: + result["status"] = "error" + result["error"] = str(e) + agent_logger.error(f"Error processing {website_url}: {e}", exc_info=True) + + finally: + # End session + try: + client.sessions.end(id=session_id) + agent_logger.info("Session closed successfully") + except Exception as e: + agent_logger.error(f"Error closing session: {e}") + + return result diff --git a/python/menu-dynamic-extraction-demo/utils.py b/python/menu-dynamic-extraction-demo/utils.py new file mode 100644 index 00000000..23171436 --- /dev/null +++ b/python/menu-dynamic-extraction-demo/utils.py @@ -0,0 +1,117 @@ +# Stagehand + Browserbase: Restaurant Menu Extractor - Utilities +# See README.md for full documentation + +"""Utility functions for the restaurant scraper.""" + +import json +import time +import re +from typing import List, Dict, Any +from pathlib import Path +from datetime import datetime +from urllib.parse import urlparse +from config import WEBSITES_FILE, OUTPUT_DIR, logger + + +def normalize_url(url: str) -> str: + """ + Normalize URL to ensure it has a protocol. 
+ + Args: + url: The URL to normalize + + Returns: + Normalized URL with https:// prefix + """ + url = url.strip() + if not url.startswith(("http://", "https://")): + url = "https://" + url + return url + + +def load_websites_from_file(file_path: str = WEBSITES_FILE) -> List[str]: + """ + Load website URLs from a text file. + Lines starting with # are treated as comments and ignored. + + Args: + file_path: Path to the file containing URLs + + Returns: + List of normalized URLs + """ + websites = [] + try: + with open(file_path, 'r') as f: + for line in f: + line = line.strip() + if line and not line.startswith('#'): + websites.append(normalize_url(line)) + logger.info(f"Loaded {len(websites)} websites from {file_path}") + return websites + except FileNotFoundError: + logger.error(f"File not found: {file_path}") + return [] + + +def get_website_from_user() -> str: + """ + Prompt the user to enter a restaurant website URL. + + Returns: + The URL entered by the user + """ + return input("Enter restaurant website URL: ").strip() + + +def save_menu_to_json( + website_url: str, + all_menu_sections: List[Dict[str, Any]], + agent_id: int = None, + duration_seconds: float = None +) -> str: + """ + Save combined menu data to a beautifully formatted JSON file. 
+
+    Args:
+        website_url: The restaurant website URL
+        all_menu_sections: Combined list of all menu sections
+        agent_id: Optional agent ID for batch processing
+        duration_seconds: Optional duration of extraction
+
+    Returns:
+        Path to the saved JSON file
+    """
+    # Create results directory if it doesn't exist
+    Path(OUTPUT_DIR).mkdir(exist_ok=True)
+
+    # Generate safe filename from URL and timestamp
+    parsed_url = urlparse(website_url)
+    safe_name = re.sub(r'[^\w\-]', '_', parsed_url.netloc or parsed_url.path)
+    timestamp = int(time.time())
+    filename = f"{OUTPUT_DIR}/{safe_name}_{timestamp}.json"
+
+    # Create combined output
+    output_data = {
+        "restaurant_url": website_url,
+        "extracted_at": timestamp,
+        "extracted_at_readable": datetime.fromtimestamp(timestamp).isoformat(),
+        "menu": {
+            "sections": all_menu_sections
+        }
+    }
+
+    # Add optional fields
+    if agent_id is not None:
+        output_data["agent_id"] = agent_id
+    if duration_seconds is not None:
+        output_data["duration_seconds"] = duration_seconds
+
+    # Write beautifully formatted JSON
+    with open(filename, 'w', encoding='utf-8') as f:
+        json.dump(output_data, f, indent=2, ensure_ascii=False)
+
+    logger.info(f"✓ Menu saved to: {filename}")
+    logger.info(f"✓ Total sections extracted: {len(all_menu_sections)}")
+
+    return filename
diff --git a/python/menu-dynamic-extraction-demo/websites.txt.example b/python/menu-dynamic-extraction-demo/websites.txt.example
new file mode 100644
index 00000000..16953ad8
--- /dev/null
+++ b/python/menu-dynamic-extraction-demo/websites.txt.example
@@ -0,0 +1,15 @@
+# Restaurant Menu Extraction - Batch Processing URLs
+#
+# Instructions:
+# 1. Copy this file to websites.txt
+# 2. Add one restaurant URL per line
+# 3. Lines starting with # are treated as comments and ignored
+# 4. Run: python main.py --batch
+#
+# Example URLs:
+
+https://www.thetailorssonsf.com/
+https://www.thegrovesf.com/
+https://www.nopalitosf.com/
+
+# Add more restaurant URLs below: