437 changes: 437 additions & 0 deletions MULTI_MODEL_OPTIMIZATION_RECAP.md

Large diffs are not rendered by default.

57 changes: 57 additions & 0 deletions analyze_harmony.sh
@@ -0,0 +1,57 @@
#!/bin/bash

echo "🧪 Analyzing Harmony Format Artifacts"
echo "======================================"
echo ""

# Test 1: Weather query (tool-based)
echo "Test 1: Weather in Paris (Tool Query)"
echo "--------------------------------------"
curl -s -N http://localhost:8000/api/chat/stream \
  -H 'Content-Type: application/json' \
  -d '{"message":"What is the weather in Paris?"}' \
  -m 30 > /tmp/harmony_test1.txt 2>&1

# Extract just the response content
cat /tmp/harmony_test1.txt | grep 'data:' | grep -v 'ping' | head -1 | \
  sed 's/.*"token": "\(.*\)", "sequence".*/\1/' | \
  sed 's/\\n/\n/g' | \
  sed 's/\\"/"/g'

echo ""
echo ""
sleep 2

# Test 2: Simple creative query
echo "Test 2: Tell me a joke (Creative Query - Direct GPT-OSS)"
echo "---------------------------------------------------------"
curl -s -N http://localhost:8000/api/chat/stream \
  -H 'Content-Type: application/json' \
  -d '{"message":"Tell me a programming joke"}' \
  -m 10 > /tmp/harmony_test2.txt 2>&1

cat /tmp/harmony_test2.txt | grep 'data:' | grep -v 'ping' | head -10 | \
  sed 's/.*"token": "\(.*\)", "sequence".*/\1/' | tr -d '\n'

echo ""
echo ""
sleep 2

# Test 3: Simple knowledge query
echo "Test 3: What is Docker? (Knowledge Query - Direct GPT-OSS)"
echo "-----------------------------------------------------------"
curl -s -N http://localhost:8000/api/chat/stream \
  -H 'Content-Type: application/json' \
  -d '{"message":"What is Docker?"}' \
  -m 10 > /tmp/harmony_test3.txt 2>&1

cat /tmp/harmony_test3.txt | grep 'data:' | grep -v 'ping' | head -10 | \
  sed 's/.*"token": "\(.*\)", "sequence".*/\1/' | tr -d '\n'

echo ""
echo ""
echo "======================================"
echo "Raw files saved:"
echo " /tmp/harmony_test1.txt (Weather)"
echo " /tmp/harmony_test2.txt (Joke)"
echo " /tmp/harmony_test3.txt (Docker)"
42 changes: 42 additions & 0 deletions backend/check-download.sh
@@ -0,0 +1,42 @@
#!/bin/bash

# Monitor Qwen download progress

MODEL_FILE="/Users/alexmartinez/openq-ws/geistai/backend/inference/models/qwen2.5-coder-32b-instruct-q4_k_m.gguf"
LOG_FILE="/tmp/qwen_download.log"
EXPECTED_SIZE="18GB"

echo "🔍 Qwen 2.5 32B Download Monitor"
echo "=================================="
echo ""

if [ -f "$MODEL_FILE" ]; then
CURRENT_SIZE=$(ls -lh "$MODEL_FILE" | awk '{print $5}')
echo "✅ File exists: $CURRENT_SIZE / ~$EXPECTED_SIZE"
echo ""

# Check if complete (file should be ~18GB)
SIZE_BYTES=$(stat -f%z "$MODEL_FILE" 2>/dev/null || stat -c%s "$MODEL_FILE" 2>/dev/null)
if [ "$SIZE_BYTES" -gt 17000000000 ]; then
echo "🎉 Download complete!"
echo ""
echo "Next steps:"
echo " cd /Users/alexmartinez/openq-ws/geistai/backend"
echo " ./start-local-dev.sh"
else
echo "⏳ Still downloading..."
echo ""
echo "📊 Live progress:"
tail -3 "$LOG_FILE"
fi
else
echo "⏳ Download starting..."
if [ -f "$LOG_FILE" ]; then
echo ""
echo "📊 Progress:"
tail -3 "$LOG_FILE"
fi
fi

echo ""
echo "To monitor: watch -n 2 ./check-download.sh"
3 changes: 3 additions & 0 deletions backend/docker-compose.yml
@@ -135,6 +135,9 @@ services:
- LOG_LEVEL=DEBUG
- HARMONY_REASONING_EFFORT=low
- INFERENCE_URL=http://host.docker.internal:8080 # Connect to host inference
- INFERENCE_URL_QWEN=http://host.docker.internal:8080 # Connect to Qwen
- INFERENCE_URL_LLAMA=http://host.docker.internal:8082 # Connect to Llama
- WHISPER_SERVICE_URL=http://host.docker.internal:8004 # Connect to host Whisper STT
- EMBEDDINGS_URL=http://embeddings:8001
- SSL_ENABLED=false
# Development-specific Python settings
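
The two new URLs imply the router now picks a backend per request (Qwen on 8080, Llama on 8082). A minimal sketch of how that selection might look on the Python side; the helper name and the routing policy are assumptions for illustration, not taken from this PR:

# router_urls.py - illustrative only; reads the env vars added in docker-compose.yml
import os

INFERENCE_URLS = {
    "default": os.getenv("INFERENCE_URL", "http://host.docker.internal:8080"),
    "qwen": os.getenv("INFERENCE_URL_QWEN", "http://host.docker.internal:8080"),
    "llama": os.getenv("INFERENCE_URL_LLAMA", "http://host.docker.internal:8082"),
}

def pick_inference_url(stage: str) -> str:
    """Hypothetical policy: tool-calling goes to Qwen, answer mode goes to Llama."""
    if stage == "tools":
        return INFERENCE_URLS["qwen"]
    if stage == "answer":
        return INFERENCE_URLS["llama"]
    return INFERENCE_URLS["default"]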
144 changes: 144 additions & 0 deletions backend/router/answer_mode.py
@@ -0,0 +1,144 @@
"""
Answer Mode - Forces LLM to generate final answer without calling tools

This is a simplified implementation for MVP that wraps the existing
agent system and adds a firewall to prevent infinite tool loops.
"""

import httpx
from typing import AsyncIterator, List, Dict
import json
import asyncio # Added for async sleep


async def answer_mode_stream(
query: str,
findings: str,
inference_url: str = "http://host.docker.internal:8080"
) -> AsyncIterator[str]:
"""
Generate final answer from tool findings with firewall

Args:
query: Original user question
findings: Text summary of tool results
inference_url: Which model to use (Qwen or GPT-OSS URL)

Yields:
Content chunks to stream to user
"""

# Direct prompt for clean, concise answers
messages = [
{
"role": "user",
"content": (
f"{query}\n\n"
f"Here is relevant information:\n{findings}\n\n"
f"Please provide a brief answer (2-3 sentences) and list the source URLs."
)
}
]

client = httpx.AsyncClient(timeout=30.0)
full_response = "" # Accumulate full response for post-processing

try:
async with client.stream(
"POST",
f"{inference_url}/v1/chat/completions",
json={
"messages": messages,
"tools": [], # NO TOOLS - completely disabled
"stream": True,
"max_tokens": 120, # Optimized for fast summaries
"temperature": 0.8 # Fast sampling
}
) as response:

content_seen = False

async for line in response.aiter_lines():
if line.startswith("data: "):
if line.strip() == "data: [DONE]":
break

try:
data = json.loads(line[6:])

if "choices" in data and len(data["choices"]) > 0:
choice = data["choices"][0]
delta = choice.get("delta", {})

# FIREWALL: Drop any hallucinated tool calls
if "tool_calls" in delta:
print(f"⚠️ Answer-mode firewall: Dropped tool_call (this shouldn't happen!)")
continue

# Accumulate content
if "content" in delta and delta["content"]:
content_seen = True
full_response += delta["content"]

# Stop on finish
finish_reason = choice.get("finish_reason")
if finish_reason in ["stop", "length"]:
break

except json.JSONDecodeError:
continue

# Post-process: Clean up response and stream it token by token
# Llama should produce clean output, but let's clean just in case

import re

# Clean the response
cleaned_response = full_response

# Remove any potential Harmony markers (shouldn't be present with Llama)
if "<|channel|>" in cleaned_response:
# Extract final channel if present
if "<|channel|>final<|message|>" in cleaned_response:
parts = cleaned_response.split("<|channel|>final<|message|>")
if len(parts) > 1:
cleaned_response = parts[1].split("<|end|>")[0] if "<|end|>" in parts[1] else parts[1]
else:
# Remove all Harmony markers
cleaned_response = re.sub(r'<\|[^|]+\|>', '', cleaned_response)

# Clean up any meta-commentary (shouldn't be present with Llama)
cleaned_response = re.sub(r'We need to (answer|check|provide|browse)[^.]*\.', '', cleaned_response)
cleaned_response = re.sub(r'The user (asks|wants|needs|provided)[^.]*\.', '', cleaned_response)
cleaned_response = re.sub(r'Let\'s (open|browse|check)[^.]*\.', '', cleaned_response)
cleaned_response = re.sub(r'\s+', ' ', cleaned_response).strip()

# Stream the cleaned response token by token for better UX
if cleaned_response:
# Split into words and stream them
words = cleaned_response.split()
for i, word in enumerate(words):
if i == 0:
yield word
else:
yield " " + word
# Small delay to simulate streaming
await asyncio.sleep(0.05)
else:
# Fallback: provide simple answer from findings
fallback = f"Based on the search results: {findings[:200]}..."
words = fallback.split()
for i, word in enumerate(words):
if i == 0:
yield word
else:
yield " " + word
await asyncio.sleep(0.05)

# Fallback if no content generated
if not content_seen:
print(f"❌ Answer mode produced no content - using fallback")
yield f"\n\nBased on the search results: {findings[:200]}..."

finally:
await client.aclose()
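
For context, a sketch of how an agent loop might hand off to answer mode once tool results are collected. The findings text and the Llama URL below are made-up examples based on the function signature above, not values from this PR:

# example_answer_mode_usage.py - illustrative call site, not part of the diff
import asyncio
import os

from answer_mode import answer_mode_stream

async def main() -> None:
    # Pretend the tool phase already ran and produced a findings summary
    findings = "Paris: 18°C, light rain. Source: https://example.com/weather/paris"
    async for chunk in answer_mode_stream(
        query="What is the weather in Paris?",
        findings=findings,
        inference_url=os.getenv("INFERENCE_URL_LLAMA", "http://host.docker.internal:8082"),
    ):
        print(chunk, end="", flush=True)

if __name__ == "__main__":
    asyncio.run(main())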