livekit-examples · tlfla · Oct 21, 2025 · Oct 21, 2025 · Oct 21, 2025 · Oct 21, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,7 @@ description = "Simple voice AI assistant built with LiveKit Agents for Python"
 requires-python = ">=3.9"
 
 dependencies = [
-    "livekit-agents[silero,turn-detector]~=1.2",
+    "livekit-agents[silero,turn-detector,openai,cartesia]~=1.2",
     "livekit-plugins-noise-cancellation~=0.2",
     "python-dotenv",
 ]

diff --git a/src/agent.py b/src/agent.py
@@ -1,6 +1,10 @@
 import logging
+import os
+import pathlib
+import asyncio
 
 from dotenv import load_dotenv
+from livekit import api
 from livekit.agents import (
     Agent,
     AgentSession,
@@ -12,21 +16,46 @@
     cli,
     metrics,
 )
-from livekit.plugins import noise_cancellation, silero
+from livekit.plugins import noise_cancellation, silero, cartesia
+
+try:
+    from livekit.plugins import openai as openai_plugin  # noqa: F401 - imported only to probe availability
+    OPENAI_PLUGIN_AVAILABLE = True
+except ImportError:
+    OPENAI_PLUGIN_AVAILABLE = False
+    logger_init = logging.getLogger("agent")
+    logger_init.warning("⚠️ OpenAI plugin not available")  # TTS is Cartesia-only; there is no Silero fallback
+
 from livekit.plugins.turn_detector.multilingual import MultilingualModel
 
 logger = logging.getLogger("agent")
 
 load_dotenv(".env.local")
 
 
+def load_system_prompt() -> str:
+    """Return the stripped text of the file at ROLEPLAY_PROMPT_PATH, or a built-in default if it cannot be read."""
+    # NOTE(review): this fallback names "Coach Ava", which the shipped prompt file forbids - confirm intent.
+    fallback = "You are Coach Ava, a helpful real estate roleplay partner. Keep responses concise and friendly."
+    prompt_path = os.getenv("ROLEPLAY_PROMPT_PATH", "src/prompt/roleplay_system_prompt.txt")
+    try:
+        content = pathlib.Path(prompt_path).read_text(encoding="utf-8").strip()  # explicit UTF-8, not locale default
+    except FileNotFoundError:
+        logger.warning(f"⚠️ Prompt file not found at {prompt_path}, using default")
+        return fallback
+    except (OSError, ValueError) as e:
+        # Unreadable or mis-encoded file: log and degrade to the default instead of crashing the worker.
+        logger.error(f"❌ Error loading system prompt: {e}")
+        return fallback
+    logger.info(f"✅ Loaded system prompt from {prompt_path}")
+    return content
+
+
 class Assistant(Agent):
     def __init__(self) -> None:
+        system_prompt = load_system_prompt()
         super().__init__(
-            instructions="""You are a helpful voice AI assistant. The user is interacting with you via voice, even if you perceive the conversation as text.
-            You eagerly assist users with their questions by providing information from your extensive knowledge.
-            Your responses are concise, to the point, and without any complex formatting or punctuation including emojis, asterisks, or other symbols.
-            You are curious, friendly, and have a sense of humor.""",
+            instructions=system_prompt,
         )
 
     # To add tools, use the @function_tool decorator.
@@ -51,30 +80,76 @@ def prewarm(proc: JobProcess):
     proc.userdata["vad"] = silero.VAD.load()
 
 
+async def dev_mode_entrypoint(ctx: JobContext):
+    """
+    DEV MODE ENTRYPOINT: print a dev banner, then delegate to ``entrypoint``.
+
+    The room is the one assigned to the job (``ctx.room``); LIVEKIT_ROOM is only
+    compared against it so a mismatch is visible during local testing.
+    """
+    print("\n🔧 DEV MODE: Starting agent in auto-join mode...\n")
+
+    expected_room = os.getenv("LIVEKIT_ROOM", "roleplay-local")
+    if ctx.room.name != expected_room:
+        print(f"⚠️ DEV MODE: LIVEKIT_ROOM is '{expected_room}' but the job room is '{ctx.room.name}'\n")
+    print(f"✅ DEV MODE: Joining room '{ctx.room.name}'\n")
+    await entrypoint(ctx)
+
+
 async def entrypoint(ctx: JobContext):
     # Logging setup
-    # Add any other context you want in all log entries here
     ctx.log_context_fields = {
         "room": ctx.room.name,
     }
 
-    # Set up a voice AI pipeline using OpenAI, Cartesia, AssemblyAI, and the LiveKit turn detector
+    logger.info(f"🤖 Agent starting in room: {ctx.room.name}")
+
+    # Verify OpenAI API key is available
+    openai_api_key = os.getenv("OPENAI_API_KEY")
+    if not openai_api_key:
+        logger.error("❌ OPENAI_API_KEY not set in environment. Agent will not function.")
+        return
+
+    # Load environment configuration
+    llm_model = os.getenv("LLM_MODEL", "gpt-4o-mini")
+    logger.info(f"📊 Using LLM model: {llm_model}")
+
+    # Set up a voice AI pipeline with OpenAI LLM and system prompt
+    system_prompt = load_system_prompt()
+
+    # Configure TTS - Cartesia with fallback logic
+    try:
+        # Try primary Cartesia voice (conversational female)
+        tts_option = cartesia.TTS(
+            voice="79a125e8-cd45-4c13-8a67-188112f4dd22",
+            model="sonic-english"
+        )
+        logger.info(f"🔊 Using TTS: Cartesia Sonic (conversational female voice)")
+    except Exception as cartesia_error:
+        logger.warning(f"⚠️ Primary Cartesia voice failed: {cartesia_error}")
+        try:
+            # Fallback to alternative Cartesia voice
+            tts_option = cartesia.TTS(
+                voice="a0e99841-438c-4a64-b679-ae501e7d6091",
+                model="sonic-english"
+            )
+            logger.info(f"🔊 Using TTS: Cartesia Sonic (friendly woman - fallback voice)")
+        except Exception as e:
+            logger.error(f"❌ All Cartesia voices failed: {e}")
+            logger.error("Please check your CARTESIA_API_KEY and plugin installation")
+            return
+
     session = AgentSession(
-        # Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
-        # See all available models at https://docs.livekit.io/agents/models/stt/
+        # Speech-to-text (STT) - convert user speech to text
         stt="assemblyai/universal-streaming:en",
-        # A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
-        # See all available models at https://docs.livekit.io/agents/models/llm/
-        llm="openai/gpt-4.1-mini",
-        # Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
-        # See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
-        tts="cartesia/sonic-2:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc",
-        # VAD and turn detection are used to determine when the user is speaking and when the agent should respond
-        # See more at https://docs.livekit.io/agents/build/turns
+        # Large Language Model - using gpt-4o-mini for fast real estate roleplay responses
+        llm=f"openai/{llm_model}",
+        # Text-to-speech - configured above with fallback logic
+        tts=tts_option,
+        # Voice Activity Detection and turn detection
         turn_detection=MultilingualModel(),
         vad=ctx.proc.userdata["vad"],
-        # allow the LLM to generate a response while waiting for the end of turn
-        # See more at https://docs.livekit.io/agents/build/audio/#preemptive-generation
+        # Allow preemptive generation while waiting for user turn end
         preemptive_generation=True,
     )
 
@@ -88,18 +163,32 @@ async def entrypoint(ctx: JobContext):
     #     llm=openai.realtime.RealtimeModel(voice="marin")
     # )
 
-    # Metrics collection, to measure pipeline performance
-    # For more information, see https://docs.livekit.io/agents/build/metrics/
+    # Metrics collection and logging hooks
     usage_collector = metrics.UsageCollector()
 
     @session.on("metrics_collected")
     def _on_metrics_collected(ev: MetricsCollectedEvent):
         metrics.log_metrics(ev.metrics)
         usage_collector.collect(ev.metrics)
 
+    @session.on("user_input_transcribed")
+    def _on_user_input_transcribed(ev):  # Agents 1.x: UserInputTranscribedEvent
+        if ev.is_final:  # skip interim STT hypotheses, log only the committed text
+            logger.info(f"🗣️ User transcript: {ev.transcript[:100]}...")
+
+    @session.on("conversation_item_added")
+    def _on_conversation_item_added(ev):  # Agents 1.x: ConversationItemAddedEvent
+        if getattr(ev.item, "role", None) == "assistant":  # user turns are logged above
+            logger.info(f"🧠 Agent reply: {(ev.item.text_content or '')[:100]}...")
+
+    @session.on("user_state_changed")
+    def _on_user_state_changed(ev):  # Agents 1.x: UserStateChangedEvent
+        if ev.old_state == "speaking":  # the user just stopped talking
+            logger.info("⏸️ User speech finished, processing...")
+
     async def log_usage():
         summary = usage_collector.get_summary()
-        logger.info(f"Usage: {summary}")
+        logger.info(f"📊 Session usage: {summary}")
 
     ctx.add_shutdown_callback(log_usage)
 
@@ -124,6 +213,40 @@ async def log_usage():
     # Join the room and connect to the user
     await ctx.connect()
 
+    # Canary: Publish a test message to verify TTS is working
+    logger.info("🔊 TTS Canary: Starting test message...")
+    try:
+        canary_text = "Hello!"
+        logger.info(f"🔊 Publishing canary: {canary_text}")
+
+        # Publish the canary audio to the room using session.say()
+        await session.say(canary_text)
+        logger.info("🔊 TTS Canary: Finished and published to room")
+    except Exception as e:
+        logger.error(f"❌ TTS Canary failed: {e}")
+        logger.info("⚠️ Continuing without canary...")
+
+    # Keep the agent alive indefinitely
+    # The session handles all voice interaction automatically
+    # We stay alive to handle multiple conversations until the room is empty
+    try:
+        logger.info("✅ Agent is now ready and waiting for user interactions...")
+        while True:
+            await asyncio.sleep(10)
+            logger.debug("Agent running - session active")
+    except asyncio.CancelledError:
+        logger.info("🔴 Agent shutting down - session cancelled")
+    except Exception as e:
+        logger.error(f"❌ Unexpected error in agent loop: {e}")
+    finally:
+        logger.info("🔌 Agent disconnecting...")
+        await session.aclose()
+
 
 if __name__ == "__main__":
-    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint, prewarm_fnc=prewarm))
+    # Honor the LIVEKIT_AGENT_MODE switch that the banner advertises; dev mode stays the default.
+    dev_mode = os.getenv("LIVEKIT_AGENT_MODE", "dev").strip().lower() != "webhook"
+    if dev_mode:
+        print("\n" + "=" * 60 + "\n🔧 DEV MODE: Agent will auto-join room on startup")
+        print("   To disable, set: LIVEKIT_AGENT_MODE=webhook\n" + "=" * 60 + "\n")
+    cli.run_app(WorkerOptions(entrypoint_fnc=dev_mode_entrypoint if dev_mode else entrypoint, prewarm_fnc=prewarm))
diff --git a/src/prompt/roleplay_system_prompt.txt b/src/prompt/roleplay_system_prompt.txt
@@ -0,0 +1,114 @@
+AVA - REAL ESTATE ROLEPLAY CLIENT SYSTEM PROMPT
+You are a realistic real estate client that agents call to practice their skills. You are ALWAYS the client/prospect being called - never the agent, never a coach, never an AI assistant.
+CORE IDENTITY:
+
+You are whoever the realtor asks you to be (FSBO, expired listing, buyer, seller, investor, renter, etc.)
+You have never spoken to this realtor before (unless the scenario specifies otherwise)
+You answer the phone naturally like a real person would
+You stay in character 100% of the time - you are NOT an AI, you are a real person with real concerns
+
+PHONE CALL BEHAVIOR:
+
+Answer with a natural greeting: "Hello?" or "Yes?" or "This is [name]"
+If there's a pause with no response, you may say "Hello?" again after a moment
+Speak conversationally in 1-3 short sentences per turn
+Use natural speech patterns with dynamic verbal cues:
+
+Hesitation: "Um...", "Well...", "I mean...", "I don't know..."
+Interest: "Oh?", "Really?", "Interesting...", "Tell me more"
+Skepticism: "Uh-huh...", "I see...", "Right..."
+Emotion: Match the persona (defensive, curious, frustrated, excited, analytical)
+
+
+Vary your responses - don't repeat the same phrases every call
+
+SCENARIO SETUP:
+When the realtor specifies a role (e.g., "You're a FSBO in your 40s"), you:
+
+Briefly internalize a quick background (don't announce it unless asked):
+
+Your name, age range, basic situation
+Why you're in this position
+Your current mindset/motivation
+
+
+Answer the phone AS THAT PERSON
+React naturally to whatever the realtor says
+
+PERSONA EMOTIONAL PROFILES:
+FSBO (For Sale By Owner):
+
+Tone: Defensive, confident, skeptical
+Behavior: Proud of DIY approach, questions agent value, needs proof
+Common objections: "We don't want to pay commission", "We can do this ourselves", "We're just testing the market", "We already have some interest"
+
+Expired Listing:
+
+Tone: Frustrated, tired, distrustful
+Behavior: Cautious, reluctant, disappointed by past experience
+Common objections: "We already tried that", "The last agent didn't do anything", "We're tired of agents calling", "We might just wait"
+
+Traditional Home Seller:
+
+Tone: Optimistic but cautious, goal-oriented
+Behavior: Wants maximum value, asks process questions
+Common concerns: "We're not sure where we'd go", "We want top dollar", "Maybe we'll wait until spring", "Our neighbor sold for more"
+
+First-Time Home Buyer:
+
+Tone: Curious, uncertain, excited but nervous
+Behavior: Asks lots of questions, needs guidance and reassurance
+Common concerns: "We're worried about rates", "We don't know where to start", "Our credit isn't great", "Can we even afford it?"
+
+Real Estate Investor:
+
+Tone: Analytical, direct, numbers-driven
+Behavior: Less emotional, wants ROI data, moves quickly on good deals
+Common questions: "What's the ARV?", "What's rent potential?", "Is this off-market?", "What's the seller's motivation?", "I need 10-12% returns"
+
+Renter (Considering Buying):
+
+Tone: Hesitant, practical, cautious
+Behavior: Comfortable with current situation, needs convincing
+Common concerns: "Rent is easier", "Buying seems complicated", "We're still saving", "We might move", "Landlord just raised rent again"
+
+OBJECTION DYNAMICS:
+
+Start with your persona's natural resistance level
+When the realtor addresses an objection well, soften slightly - but raise 1-2 follow-up objections before fully agreeing
+Use realistic pushback: "Yeah, but...", "I hear you, however...", "That makes sense, but what about..."
+If the realtor builds rapport and provides value, you can gradually warm up
+Match the realtor's skill level - if they're struggling, don't make it impossible; if they're smooth, add realistic challenges
+
+CONVERSATION FLOW:
+
+Respond to what the realtor actually says - don't script ahead
+Ask questions a real client would ask (70% reactive, 30% proactive)
+Show genuine curiosity when the realtor says something valuable
+Express authentic emotion appropriate to your persona
+If the realtor tries to end the call with next steps, respond naturally ("Okay, sure" or "I'll think about it" or "Send me that info")
+Don't artificially end conversations - let the realtor control that
+
+CRITICAL RULES:
+
+NEVER identify yourself as AI, a bot, Coach Ava, or an assistant
+NEVER offer feedback, coaching, or meta-commentary
+NEVER break character for any reason
+NEVER say "This is a roleplay" or "Let's reset"
+You ARE the client, period
+
+MEMORY & CONSISTENCY:
+
+Remember details from earlier in the same call
+If the realtor mentions their name, use it naturally later
+Stay consistent with facts you've stated about your situation
+Build on the conversation naturally as it progresses
+
+GEOGRAPHY & BRANDS:
+
+Keep locations generic or flexible - don't default to specific markets unless the realtor specifies
+Don't mention specific real estate brands, brokerages, or companies
+Keep property details realistic but not tied to actual addresses
+
+STARTING THE CALL:
+Wait for the realtor to specify your role, then immediately become that person and answer their call naturally. The conversation begins the moment they start talking to you.