437 changes: 437 additions & 0 deletions MULTI_MODEL_OPTIMIZATION_RECAP.md

Large diffs are not rendered by default.

57 changes: 57 additions & 0 deletions analyze_harmony.sh
@@ -0,0 +1,57 @@
#!/bin/bash

echo "🧪 Analyzing Harmony Format Artifacts"
echo "======================================"
echo ""

# Test 1: Weather query (tool-based)
echo "Test 1: Weather in Paris (Tool Query)"
echo "--------------------------------------"
curl -s -N http://localhost:8000/api/chat/stream \
  -H 'Content-Type: application/json' \
  -d '{"message":"What is the weather in Paris?"}' \
  -m 30 > /tmp/harmony_test1.txt 2>&1

# Extract just the response content
cat /tmp/harmony_test1.txt | grep 'data:' | grep -v 'ping' | head -1 | \
  sed 's/.*"token": "\(.*\)", "sequence".*/\1/' | \
  sed 's/\\n/\n/g' | \
  sed 's/\\"/"/g'

echo ""
echo ""
sleep 2

# Test 2: Simple creative query
echo "Test 2: Tell me a joke (Creative Query - Direct GPT-OSS)"
echo "---------------------------------------------------------"
curl -s -N http://localhost:8000/api/chat/stream \
  -H 'Content-Type: application/json' \
  -d '{"message":"Tell me a programming joke"}' \
  -m 10 > /tmp/harmony_test2.txt 2>&1

cat /tmp/harmony_test2.txt | grep 'data:' | grep -v 'ping' | head -10 | \
  sed 's/.*"token": "\(.*\)", "sequence".*/\1/' | tr -d '\n'

echo ""
echo ""
sleep 2

# Test 3: Simple knowledge query
echo "Test 3: What is Docker? (Knowledge Query - Direct GPT-OSS)"
echo "-----------------------------------------------------------"
curl -s -N http://localhost:8000/api/chat/stream \
  -H 'Content-Type: application/json' \
  -d '{"message":"What is Docker?"}' \
  -m 10 > /tmp/harmony_test3.txt 2>&1

cat /tmp/harmony_test3.txt | grep 'data:' | grep -v 'ping' | head -10 | \
  sed 's/.*"token": "\(.*\)", "sequence".*/\1/' | tr -d '\n'

echo ""
echo ""
echo "======================================"
echo "Raw files saved:"
echo " /tmp/harmony_test1.txt (Weather)"
echo " /tmp/harmony_test2.txt (Joke)"
echo " /tmp/harmony_test3.txt (Docker)"
42 changes: 42 additions & 0 deletions backend/check-download.sh
@@ -0,0 +1,42 @@
#!/bin/bash

# Monitor Qwen download progress

MODEL_FILE="/Users/alexmartinez/openq-ws/geistai/backend/inference/models/qwen2.5-coder-32b-instruct-q4_k_m.gguf"
LOG_FILE="/tmp/qwen_download.log"
EXPECTED_SIZE="18GB"

echo "🔍 Qwen 2.5 32B Download Monitor"
echo "=================================="
echo ""

if [ -f "$MODEL_FILE" ]; then
CURRENT_SIZE=$(ls -lh "$MODEL_FILE" | awk '{print $5}')
echo "✅ File exists: $CURRENT_SIZE / ~$EXPECTED_SIZE"
echo ""

# Check if complete (file should be ~18GB)
SIZE_BYTES=$(stat -f%z "$MODEL_FILE" 2>/dev/null || stat -c%s "$MODEL_FILE" 2>/dev/null)
if [ "$SIZE_BYTES" -gt 17000000000 ]; then
echo "🎉 Download complete!"
echo ""
echo "Next steps:"
echo " cd /Users/alexmartinez/openq-ws/geistai/backend"
echo " ./start-local-dev.sh"
else
echo "⏳ Still downloading..."
echo ""
echo "📊 Live progress:"
tail -3 "$LOG_FILE"
fi
else
echo "⏳ Download starting..."
if [ -f "$LOG_FILE" ]; then
echo ""
echo "📊 Progress:"
tail -3 "$LOG_FILE"
fi
fi

echo ""
echo "To monitor: watch -n 2 ./check-download.sh"
3 changes: 3 additions & 0 deletions backend/docker-compose.yml
@@ -135,6 +135,9 @@ services:
- LOG_LEVEL=DEBUG
- HARMONY_REASONING_EFFORT=low
- INFERENCE_URL=http://host.docker.internal:8080 # Connect to host inference
- INFERENCE_URL_QWEN=http://host.docker.internal:8080 # Connect to Qwen
- INFERENCE_URL_LLAMA=http://host.docker.internal:8082 # Connect to Llama
- WHISPER_SERVICE_URL=http://host.docker.internal:8004 # Connect to host Whisper STT
- EMBEDDINGS_URL=http://embeddings:8001
- SSL_ENABLED=false
# Development-specific Python settings
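
The two new URLs imply the router now picks a backend per request (Qwen on 8080, Llama on 8082). A minimal sketch of how that selection might look on the Python side; the helper name and the routing policy are assumptions for illustration, not taken from this PR:

# router_urls.py - illustrative only; reads the env vars added in docker-compose.yml
import os

INFERENCE_URLS = {
    "default": os.getenv("INFERENCE_URL", "http://host.docker.internal:8080"),
    "qwen": os.getenv("INFERENCE_URL_QWEN", "http://host.docker.internal:8080"),
    "llama": os.getenv("INFERENCE_URL_LLAMA", "http://host.docker.internal:8082"),
}

def pick_inference_url(stage: str) -> str:
    """Hypothetical policy: tool-calling goes to Qwen, answer mode goes to Llama."""
    if stage == "tools":
        return INFERENCE_URLS["qwen"]
    if stage == "answer":
        return INFERENCE_URLS["llama"]
    return INFERENCE_URLS["default"]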
144 changes: 144 additions & 0 deletions backend/router/answer_mode.py
@@ -0,0 +1,144 @@
"""
Answer Mode - Forces LLM to generate final answer without calling tools

This is a simplified implementation for MVP that wraps the existing
agent system and adds a firewall to prevent infinite tool loops.
"""

import httpx
from typing import AsyncIterator, List, Dict
import json
import asyncio # Added for async sleep


async def answer_mode_stream(
query: str,
findings: str,
inference_url: str = "http://host.docker.internal:8080"
) -> AsyncIterator[str]:
"""
Generate final answer from tool findings with firewall

Args:
query: Original user question
findings: Text summary of tool results
inference_url: Which model to use (Qwen or GPT-OSS URL)

Yields:
Content chunks to stream to user
"""

# Direct prompt for clean, concise answers
messages = [
{
"role": "user",
"content": (
f"{query}\n\n"
f"Here is relevant information:\n{findings}\n\n"
f"Please provide a brief answer (2-3 sentences) and list the source URLs."
)
}
]

client = httpx.AsyncClient(timeout=30.0)
full_response = "" # Accumulate full response for post-processing

try:
async with client.stream(
"POST",
f"{inference_url}/v1/chat/completions",
json={
"messages": messages,
"tools": [], # NO TOOLS - completely disabled
"stream": True,
"max_tokens": 120, # Optimized for fast summaries
"temperature": 0.8 # Fast sampling
}
) as response:

content_seen = False

async for line in response.aiter_lines():
if line.startswith("data: "):
if line.strip() == "data: [DONE]":
break

try:
data = json.loads(line[6:])

if "choices" in data and len(data["choices"]) > 0:
choice = data["choices"][0]
delta = choice.get("delta", {})

# FIREWALL: Drop any hallucinated tool calls
if "tool_calls" in delta:
print(f"⚠️ Answer-mode firewall: Dropped tool_call (this shouldn't happen!)")
continue

# Accumulate content
if "content" in delta and delta["content"]:
content_seen = True
full_response += delta["content"]

# Stop on finish
finish_reason = choice.get("finish_reason")
if finish_reason in ["stop", "length"]:
break

except json.JSONDecodeError:
continue

# Post-process: Clean up response and stream it token by token
# Llama should produce clean output, but let's clean just in case

import re

# Clean the response
cleaned_response = full_response

# Remove any potential Harmony markers (shouldn't be present with Llama)
if "<|channel|>" in cleaned_response:
# Extract final channel if present
if "<|channel|>final<|message|>" in cleaned_response:
parts = cleaned_response.split("<|channel|>final<|message|>")
if len(parts) > 1:
cleaned_response = parts[1].split("<|end|>")[0] if "<|end|>" in parts[1] else parts[1]
else:
# Remove all Harmony markers
cleaned_response = re.sub(r'<\|[^|]+\|>', '', cleaned_response)

# Clean up any meta-commentary (shouldn't be present with Llama)
cleaned_response = re.sub(r'We need to (answer|check|provide|browse)[^.]*\.', '', cleaned_response)
cleaned_response = re.sub(r'The user (asks|wants|needs|provided)[^.]*\.', '', cleaned_response)
cleaned_response = re.sub(r'Let\'s (open|browse|check)[^.]*\.', '', cleaned_response)
cleaned_response = re.sub(r'\s+', ' ', cleaned_response).strip()

# Stream the cleaned response token by token for better UX
if cleaned_response:
# Split into words and stream them
words = cleaned_response.split()
for i, word in enumerate(words):
if i == 0:
yield word
else:
yield " " + word
# Small delay to simulate streaming
await asyncio.sleep(0.05)
else:
# Fallback: provide simple answer from findings
fallback = f"Based on the search results: {findings[:200]}..."
words = fallback.split()
for i, word in enumerate(words):
if i == 0:
yield word
else:
yield " " + word
await asyncio.sleep(0.05)

# Fallback if no content generated
if not content_seen:
print(f"❌ Answer mode produced no content - using fallback")
yield f"\n\nBased on the search results: {findings[:200]}..."

finally:
await client.aclose()
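
For context, a sketch of how an agent loop might hand off to answer mode once tool results are collected. The findings text and the Llama URL below are made-up examples based on the function signature above, not values from this PR:

# example_answer_mode_usage.py - illustrative call site, not part of the diff
import asyncio
import os

from answer_mode import answer_mode_stream

async def main() -> None:
    # Pretend the tool phase already ran and produced a findings summary
    findings = "Paris: 18°C, light rain. Source: https://example.com/weather/paris"
    async for chunk in answer_mode_stream(
        query="What is the weather in Paris?",
        findings=findings,
        inference_url=os.getenv("INFERENCE_URL_LLAMA", "http://host.docker.internal:8082"),
    ):
        print(chunk, end="", flush=True)

if __name__ == "__main__":
    asyncio.run(main())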