diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..2b0ff1c --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,24 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/python +{ + "name": "RAG Chat API", + // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile + "image": "mcr.microsoft.com/devcontainers/python:1-3.12-bookworm", + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + // Use 'forwardPorts' to make a list of ports inside the container available locally. + "forwardPorts": [ + 80, + 8000 + ], + // Use 'postCreateCommand' to run commands after the container is created. + // "postCreateCommand": "pip3 install --user -r requirements.txt", + // Configure tool-specific properties. + "customizations": { + // "jetbrains" : { + // "backend" : "PyCharm" + // } + } + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
+ // "remoteUser": "root" +} \ No newline at end of file diff --git a/.dockerignore b/.dockerignore index a4a472e..2668f14 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,14 +1,14 @@ -.git -.gitignore -.github -.chat-env -__pycache__ -*.pyc -*.pyo -*.pyd -.Python -env/ -venv/ -.venv/ -.env +.git +.gitignore +.github +.chat-env +__pycache__ +*.pyc +*.pyo +*.pyd +.Python +env/ +venv/ +.venv/ +.env *.log \ No newline at end of file diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 81a846f..1c0330a 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -1,23 +1,23 @@ -name: Pylint - -on: [push] - -jobs: - build: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.8", "3.9", "3.10"] - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pylint - - name: Analysing the code with pylint - run: | - pylint api.py +name: Pylint + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10"] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pylint + - name: Analysing the code with pylint + run: | + pylint api.py diff --git a/.gitignore b/.gitignore index 59ba37a..dbe2eb2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,20 +1,25 @@ -.DS_Store -*.pyc -__pycache__/ -# .chat-env/ -*.env -*.env.* -*.ipynb_checkpoints -*.log -*.sqlite3 -*.db -*.db-journal -*.db-wal -*.db-shm -*.egg-info/ -*.egg -*.egg-info/ -responses/ -db/knowledge_base/* -db/chroma/* -*.md \ No newline at end of file +.DS_Store +*.pyc +__pycache__/ 
+.chat_env/ +.chat-env/ +*.env +*.env.* +*.ipynb_checkpoints +*.log +*.sqlite3 +*.db +*.db-journal +*.db-wal +*.db-shm +*.egg-info/ +*.egg +*.egg-info/ +responses/ +db/knowledge_base/* +db/chroma/* +*.md +.secrets/ +.idea/ +fine-tuning.ipynb + diff --git a/Dockerfile b/Dockerfile index e6c23ef..68dca25 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,44 +1,44 @@ -FROM python:3.12-slim - -WORKDIR /app - -# Set environment variables -ENV APP_NAME=rag_chat_api -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PYTHONUNBUFFERED=1 - -# Install system dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - libmagic1 \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -# Create necessary directories with proper permissions -RUN mkdir -p db/chroma && \ - chmod -R 755 db - -# IMPORTANT: Copy and install requirements first -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -# Copy pre-built ChromaDB (if it exists locally) -COPY db/chroma /app/db/chroma - -# Set proper permissions for the ChromaDB -RUN chmod -R 755 /app/db/chroma - -# Copy configuration files -COPY .env.prod ./.env.prod -COPY docker-entrypoint.sh ./ -RUN chmod +x ./docker-entrypoint.sh - -# Copy source code -COPY embeddings.py api.py ./ - -# Expose the API port -EXPOSE 8000 - -# Command to run the application -ENTRYPOINT ["./docker-entrypoint.sh"] +FROM python:3.12-slim + +WORKDIR /app + +# Set environment variables +ENV APP_NAME=rag_chat_api +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + libmagic1 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Create necessary directories with proper permissions +RUN mkdir -p db/chroma && \ + chmod -R 755 db + +# IMPORTANT: Copy and install requirements first +COPY requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt + +# Copy pre-built ChromaDB (if it exists locally) +COPY db/chroma /app/db/chroma + +# Set proper permissions for the ChromaDB +RUN chmod -R 755 /app/db/chroma + +# Copy configuration files +COPY .env.prod ./.env +#COPY docker-entrypoint.sh ./app +#RUN chmod +x ./app/docker-entrypoint.sh + +# Copy source code +COPY embeddings.py api.py ./ + +# Expose the API port +EXPOSE 8000 + +# Command to run the application +#ENTRYPOINT ["./app/docker-entrypoint.sh"] CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/README.md b/README.md index 555cf5f..5c94603 100644 --- a/README.md +++ b/README.md @@ -1,133 +1,133 @@ -# RAG Chat API Starter Kit - -A RAG (Retrieval Augmented Generation) system that uses Google's Gemini models to provide contextual responses based on your documentation. - -## Overview - -RAG Chat API enables you to create a chatbot that answers questions based on your technical documentation. It uses: - -- **Google Gemini models** for advanced embeddings and text generation -- **ChromaDB** for vector storage and similarity search -- **LangChain** for document loading, chunking, and prompt management - -This system creates embeddings of your documentation, stores them in a vector database, and then retrieves relevant context when answering user queries. - -## Features - -- ✅ Load and process documents (.txt, .md, .pdf (expected to be introduced in v2.0.0)) -- ✅ Split documents into manageable chunks with overlap -- ✅ Generate and store embeddings with Google's text-embedding-004 model -- ✅ Semantic search for relevant document chunks -- ✅ Contextual answers using Gemini 1.5 Pro -- ✅ Customizable chunk size and overlap -- ✅ Organized response storage - -## Installation - -1. Clone this repository: - ```bash - git clone - cd devdocs-chat-api - ``` - -2. 
Create and activate a virtual environment: - ```bash - python -m venv .chat-env - source .chat-env/bin/activate # On Windows: .chat-env\Scripts\activate - ``` - -3. Install dependencies: - ```bash - pip install -r requirements.txt - ``` - -4. Create a .env file with your configuration: - ``` - CHROMA_PATH="db/chroma" - GOOGLE_API_KEY="your-google-api-key-here" - DATA_STORE_PATH="db/knowledge_base" - ``` - -## Usage - -### 1. Add your documentation - -Place your documentation files in the directory specified by `DATA_STORE_PATH` (default: knowledge_base): - -```bash -mkdir -p db/knowledge_base -cp your-docs/*.md db/knowledge_base/ -``` - -### 2. Generate embeddings - -Run the embeddings script to process your documentation and create vector embeddings: - -```bash -python embeddings.py -``` - -This will: -- Load all documents from your knowledge base directory -- Split them into chunks of text -- Generate embeddings using Google's text-embedding-004 model -- Store the embeddings in ChromaDB - -### 3. 
Query your documentation - -Run the main script to interact with your documentation: - -```bash -python main.py -``` - -By default, this will: -- Ensure your knowledge base and ChromaDB directories exist -- Check if there are documents to process -- Generate embeddings if needed -- Run a sample query ("Why do we need to use embeddings?") -- Find relevant document chunks based on the query -- Generate a contextual response using Gemini 1.5 Pro -- Save the response to the responses directory - -## Customization - -You can customize the behavior by modifying: - -- **Query**: Change the example query in main.py -- **Chunking parameters**: Adjust `CHUNK_SIZE` and `CHUNK_OVERLAP` in embeddings.py -- **Model**: Change the embedding or LLM model in the respective files -- **Prompt template**: Modify the `PROMPT_TEMPLATE` in main.py - -## Project Structure - -- main.py: Main script for querying the documentation -- embeddings.py: Script for generating and storing embeddings -- .env: Environment variables -- db: Directory for databases - - `chroma/`: ChromaDB vector database - - `knowledge_base/`: Your documentation files -- responses: Generated responses - -## Requirements - -- Python 3.9+ -- Google API key with access to Gemini models -- Dependencies listed in requirements.txt - -## Troubleshooting - -- **Empty ChromaDB**: Make sure you have documents in your knowledge base directory and run embeddings.py -- **No responses**: Check that your Google API key is valid and has access to the required models -- **Missing directories**: The system will create required directories automatically - -## License - -[Your License Here] - ---- - -## Contributing - -Contributions are welcome! Please fork the repository and submit a pull request with your changes. +# RAG Chat API Starter Kit + +A RAG (Retrieval Augmented Generation) system that uses Google's Gemini models to provide contextual responses based on your documentation. 
+ +## Overview + +RAG Chat API enables you to create a chatbot that answers questions based on your technical documentation. It uses: + +- **Google Gemini models** for advanced embeddings and text generation +- **ChromaDB** for vector storage and similarity search +- **LangChain** for document loading, chunking, and prompt management + +This system creates embeddings of your documentation, stores them in a vector database, and then retrieves relevant context when answering user queries. + +## Features + +- ✅ Load and process documents (.txt, .md, .pdf (expected to be introduced in v2.0.0)) +- ✅ Split documents into manageable chunks with overlap +- ✅ Generate and store embeddings with Google's text-embedding-004 model +- ✅ Semantic search for relevant document chunks +- ✅ Contextual answers using Gemini 1.5 Pro +- ✅ Customizable chunk size and overlap +- ✅ Organized response storage + +## Installation + +1. Clone this repository: + ```bash + git clone + cd devdocs-chat-api + ``` + +2. Create and activate a virtual environment: + ```bash + python -m venv .chat-env + source .chat-env/bin/activate # On Windows: .chat-env\Scripts\activate + ``` + +3. Install dependencies: + ```bash + pip install -r requirements.txt + ``` + +4. Create a .env file with your configuration: + ``` + CHROMA_PATH="db/chroma" + GOOGLE_API_KEY="your-google-api-key-here" + DATA_STORE_PATH="db/knowledge_base" + ``` + +## Usage + +### 1. Add your documentation + +Place your documentation files in the directory specified by `DATA_STORE_PATH` (default: knowledge_base): + +```bash +mkdir -p db/knowledge_base +cp your-docs/*.md db/knowledge_base/ +``` + +### 2. 
Generate embeddings + +Run the embeddings script to process your documentation and create vector embeddings: + +```bash +python embeddings.py +``` + +This will: +- Load all documents from your knowledge base directory +- Split them into chunks of text +- Generate embeddings using Google's text-embedding-004 model +- Store the embeddings in ChromaDB + +### 3. Query your documentation + +Run the main script to interact with your documentation: + +```bash +python main.py +``` + +By default, this will: +- Ensure your knowledge base and ChromaDB directories exist +- Check if there are documents to process +- Generate embeddings if needed +- Run a sample query ("Why do we need to use embeddings?") +- Find relevant document chunks based on the query +- Generate a contextual response using Gemini 1.5 Pro +- Save the response to the responses directory + +## Customization + +You can customize the behavior by modifying: + +- **Query**: Change the example query in main.py +- **Chunking parameters**: Adjust `CHUNK_SIZE` and `CHUNK_OVERLAP` in embeddings.py +- **Model**: Change the embedding or LLM model in the respective files +- **Prompt template**: Modify the `PROMPT_TEMPLATE` in main.py + +## Project Structure + +- main.py: Main script for querying the documentation +- embeddings.py: Script for generating and storing embeddings +- .env: Environment variables +- db: Directory for databases + - `chroma/`: ChromaDB vector database + - `knowledge_base/`: Your documentation files +- responses: Generated responses + +## Requirements + +- Python 3.9+ +- Google API key with access to Gemini models +- Dependencies listed in requirements.txt + +## Troubleshooting + +- **Empty ChromaDB**: Make sure you have documents in your knowledge base directory and run embeddings.py +- **No responses**: Check that your Google API key is valid and has access to the required models +- **Missing directories**: The system will create required directories automatically + +## License + +[Your License 
Here] + +--- + +## Contributing + +Contributions are welcome! Please fork the repository and submit a pull request with your changes. Make sure to follow the code style and include tests for new features. \ No newline at end of file diff --git a/api.py b/api.py index 5930106..91a20ca 100644 --- a/api.py +++ b/api.py @@ -1,399 +1,647 @@ -"""# api.py""" - -import os -import logging -import shutil -from typing import Optional -from fastapi import FastAPI, HTTPException -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, Field -from dotenv import load_dotenv -from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI -from langchain_chroma import Chroma -from langchain.prompts import ChatPromptTemplate - -from embeddings import load_documents, split_text - -# Configure logging -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) -logger = logging.getLogger(__name__) - -# Load environment variables -load_dotenv() -CHROMA_DB_PATH = os.getenv("CHROMA_PATH") -LLM_MODEL_NAME = "gemini-1.5-pro" -DATA_STORE_PATH = os.getenv("DATA_STORE_PATH") -EMBEDDING_MODEL_NAME = "models/text-embedding-004" -CHUNK_SIZE = 1000 -CHUNK_OVERLAP = 500 - -# Initialize FastAPI app -app = FastAPI( - title="DevDocs Chat API", - description="Query your documentation with natural language", - version="1.0.0", -) - - -# Create request model -class QueryRequest(BaseModel): - """Request model for querying documentation.""" - - query: str = Field(..., description="The question to ask about your documentation") - max_tokens: Optional[int] = Field(1024, description="Maximum tokens in response") - relevance_threshold: Optional[float] = Field( - 0.4, description="Minimum relevance score threshold" - ) - k: Optional[int] = Field(5, description="Number of documents to retrieve") - - -# Create response model -class QueryResponse(BaseModel): - """Response model for querying documentation.""" - - answer: str 
- sources: list[str] = [] - relevance_scores: list[float] = [] - - -# Template for prompts -# Replace the current PROMPT_TEMPLATE with this enhanced version - -PROMPT_TEMPLATE = """ -You are a specialized technical documentation assistant for software developers. - -## CONTEXT INFORMATION -{context} - -## QUESTION -{question} - -## INSTRUCTIONS -1. Answer ONLY based on the provided context above. -2. If the context contains the complete answer, provide a detailed and thorough response. -3. If the context contains partial information, answer with what's available and clearly indicate what information is missing. -4. If the answer isn't in the context at all, respond with: "Based on the available documentation, I don't have information about this specific topic." -5. Include relevant code examples from the context when applicable. -6. Format your answer for clarity: - - Use markdown formatting for headings and lists - - Format code in appropriate code blocks with language specification - - Break complex concepts into smaller sections -7. Do not reference external knowledge or make assumptions beyond what's provided in the context. -8. If technical steps are involved, present them as numbered steps. -9. If there are warnings or important notes in the context, highlight them clearly. -10. If the user interacts with you by greetings or thanks, respond politely but keep the focus on the documentation. - -## ANSWER: -""" - -# Add CORS middleware - ADD THIS CODE BLOCK -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], # Allows all origins - allow_credentials=True, - allow_methods=["*"], # Allows all methods including OPTIONS - allow_headers=["*"], # Allows all headers -) - - -def detect_conversation_type(query: str) -> tuple[str, bool]: - """ - Detect if the query is a greeting, farewell, or regular question. 
- - Args: - query: The user's query string - - Returns: - tuple: (message_type, needs_rag) - - message_type: 'greeting', 'farewell', or 'question' - - needs_rag: Whether RAG search is needed - """ - # Normalize query - query_lower = query.lower().strip() - - # Common greetings - greetings = [ - "hello", - "hi", - "hey", - "greetings", - "good morning", - "good afternoon", - "good evening", - "howdy", - "what's up", - "how are you", - "nice to meet you", - "hi there", - "hello there", - ] - - # Common farewells - farewells = [ - "bye", - "goodbye", - "see you", - "later", - "take care", - "farewell", - "have a good day", - "have a nice day", - "until next time", - "thanks", - "thank you", - "thanks a lot", - "appreciate it", - "cya", - ] - - # Check if query is just a greeting - for greeting in greetings: - if query_lower == greeting or query_lower.startswith(greeting + " "): - return "greeting", False - - # Check if query is just a farewell - for farewell in farewells: - if query_lower == farewell or query_lower.startswith(farewell + " "): - return "farewell", False - - # Otherwise it's a question that needs RAG - return "question", True - - -# Initialize embedding model and ChromaDB -@app.on_event("startup") -async def startup_db_client(): - """Initialize the embedding model and ChromaDB on startup.""" - app.embedding_function = GoogleGenerativeAIEmbeddings( - model="models/text-embedding-004" - ) - - try: - app.chroma_db = Chroma( - embedding_function=app.embedding_function, - persist_directory=CHROMA_DB_PATH, - collection_name="knowledge_base", - ) - collection_stats = app.chroma_db._collection.count() - logger.info("ChromaDB initialized with %s documents", collection_stats) - - if collection_stats == 0: - logger.warning( - "ChromaDB is empty. Please ensure documents are loaded correctly." 
- ) - except Exception as e: - logger.error("Error initializing ChromaDB: %s", e) - raise e - - -@app.get("/") -async def root(): - """Root endpoint to check if the API is running.""" - return { - "message": "Welcome to DevDocs Chat API. Use /query endpoint to ask questions." - } - - -@app.post("/query", response_model=QueryResponse) -async def query_docs(request: QueryRequest): - """Endpoint to query the documentation.""" - try: - # Get the query from the request - query = request.query - logger.info("Received query: %s", query) - - # Check if the query is a greeting or farewell - message_type, needs_rag = detect_conversation_type(query) - - # Handle greeting - if message_type == "greeting": - return QueryResponse( - answer="👋 Hello! I'm your technical documentation assistant. How can I help you with your development questions today?", - sources=[], - relevance_scores=[], - ) - - # Handle farewell - if message_type == "farewell": - return QueryResponse( - answer="Thanks for using the documentation assistant. If you have more questions later, feel free to ask!", - sources=[], - relevance_scores=[], - ) - - # Get the relevant documents with relevance scores - results = app.chroma_db.similarity_search_with_relevance_scores( - query=query, k=request.k - ) - - if not results: - logger.warning("No relevant documents found in ChromaDB.") - - # Check if query looks like a question about the documentation - doc_related_keywords = [ - "documentation", - "docs", - "manual", - "guide", - "tutorial", - "api", - "reference", - ] - - if any(keyword in query.lower() for keyword in doc_related_keywords): - return QueryResponse( - answer="I don't have enough information about that in the documentation. 
You can try rephrasing your question, or check if your question is related to the available documentation topics.", - sources=[], - relevance_scores=[], - ) - - # More general fallback - return QueryResponse( - answer="I'm a technical documentation assistant focused on helping with questions about the documented topics. I don't have information about that specific topic in my knowledge base. Please ask a question related to the documentation content.", - sources=[], - relevance_scores=[], - ) - - # Filter documents by relevance score - relevant_documents = [] - relevant_scores = [] - sources = [] - - for doc, score in results: - if score > request.relevance_threshold: - relevant_documents.append(doc) - relevant_scores.append(score) - # Extract source information - if doc.metadata and "source" in doc.metadata: - sources.append(os.path.basename(doc.metadata["source"])) - else: - sources.append("unknown") - else: - logger.warning( - "Document with score %s is below threshold and will not be included.", - score, - ) - - if not relevant_documents: - logger.warning("No relevant documents found after filtering by score.") - return QueryResponse( - answer="I don't have enough information about that in the documentation.", - sources=[], - relevance_scores=[], - ) - - # Format the context for the prompt - context_text = "\n\n---\n\n".join( - [doc.page_content for doc in relevant_documents] - ) - - # Create the prompt - prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE) - prompt = prompt_template.format(context=context_text, question=query) - - # Send the prompt to the Google Generative AI API - model = ChatGoogleGenerativeAI( - model=LLM_MODEL_NAME, max_output_tokens=request.max_tokens - ) - response = model.invoke(prompt) - - return QueryResponse( - answer=response.content, sources=sources, relevance_scores=relevant_scores - ) - - except Exception as e: - logger.error("Error processing query: %s", e) - raise HTTPException(status_code=500, detail=str(e)) 
from e - - -@app.post("/reload") -async def reload_chroma(): - """Endpoint to reload ChromaDB.""" - try: - # First check if we can write to the directory - logging.info("Checking ChromaDB directory permissions: %s", CHROMA_DB_PATH) - - # return {"message": "Reloading ChromaDB..."} - - # Create parent directory if it doesn't exist - os.makedirs(os.path.dirname(CHROMA_DB_PATH), exist_ok=True) - - # Attempt to create a test file to verify write permissions - test_file = os.path.join(os.path.dirname(CHROMA_DB_PATH), "test_write.txt") - try: - with open(test_file, "w") as f: - f.write("Testing write permissions") - os.remove(test_file) - logging.info("Write permissions confirmed for ChromaDB directory") - except (PermissionError, IOError) as e: - logging.error("No write permissions for ChromaDB directory: %s", e) - return { - "error": "Permission denied", - "message": f"""Cannot write to {CHROMA_DB_PATH}. -Please check permissions or use a different directory.""", - } - - # Now proceed with the reload - if os.path.exists(CHROMA_DB_PATH): - try: - logging.warning("Deleting existing ChromaDB at: %s", CHROMA_DB_PATH) - shutil.rmtree(CHROMA_DB_PATH) - except PermissionError as e: - logging.error("Permission error deleting ChromaDB: %s", e) - return { - "error": "Permission denied", - "message": f"Cannot delete existing database at {CHROMA_DB_PATH}." 
- "Try running: sudo chmod -R 755 {CHROMA_DB_PATH}", - } - - # Create the directory with proper permissions - os.makedirs(CHROMA_DB_PATH, exist_ok=True) - - # Load and process documents - documents = load_documents(DATA_STORE_PATH) - chunks = split_text(documents, CHUNK_SIZE, CHUNK_OVERLAP) - - logger.info( - "Loaded %s documents and split into %s chunks.", len(documents), len(chunks) - ) - - embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL_NAME) - app.chroma_db = Chroma.from_documents( - chunks, - embeddings, - persist_directory=CHROMA_DB_PATH, - collection_name="knowledge_base", - ) - - collection_stats = app.chroma_db._collection.count() - logger.info("ChromaDB reloaded with %s documents", collection_stats) - - return { - "message": f"ChromaDB reloaded successfully with {collection_stats} documents" - } - - except Exception as e: - logger.error("Error reloading ChromaDB: %s", e) - - # Provide helpful error message for common issues - error_msg = str(e).lower() - if "readonly database" in error_msg: - return { - "error": "Read-only database", - "message": "The database is read-only. 
Try running these commands:", - "commands": [ - f"sudo chown -R $USER {CHROMA_DB_PATH}", - f"chmod -R 755 {CHROMA_DB_PATH}", - f"rm -f {CHROMA_DB_PATH}/*.lock", - ], - } - - raise HTTPException(status_code=500, detail=str(e)) from e - - -if __name__ == "__main__": - import uvicorn - - uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True) +"""# api.py""" + +import os +import logging +from pathlib import Path +import shutil +from typing import Optional +import uuid +from fastapi import FastAPI, HTTPException, UploadFile, File, Form, BackgroundTasks +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, Field +from dotenv import load_dotenv +from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI +from langchain_chroma import Chroma +from langchain.prompts import ChatPromptTemplate + +from embeddings import load_documents, split_text + +# Configure logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +# Load environment variables +load_dotenv() +CHROMA_DB_PATH = os.getenv("CHROMA_PATH") +LLM_MODEL_NAME = "gemini-1.5-pro" +DATA_STORE_PATH = os.getenv("DATA_STORE_PATH") +EMBEDDING_MODEL_NAME = "models/text-embedding-004" +CHUNK_SIZE = 1000 +CHUNK_OVERLAP = 500 + +# Initialize FastAPI app +app = FastAPI( + title="DevDocs Chat API", + description="Query your documentation with natural language", + version="1.0.0", +) + + +# Create request model +class QueryRequest(BaseModel): + """Request model for querying documentation.""" + + query: str = Field(..., + description="The question to ask about your documentation") + max_tokens: Optional[int] = Field( + 1024, description="Maximum tokens in response") + relevance_threshold: Optional[float] = Field( + 0.4, description="Minimum relevance score threshold" + ) + k: Optional[int] = Field(5, description="Number of documents to retrieve") + + +# Create response model 
+class QueryResponse(BaseModel): + """Response model for querying documentation.""" + + answer: str + sources: list[dict] = [] + relevance_scores: list[float] = [] + + +# Template for prompts +# Replace the current PROMPT_TEMPLATE with this enhanced version + +PROMPT_TEMPLATE = """ +You are a specialized historian specializing in Indian History. + +## CONTEXT INFORMATION +{context} + +## QUESTION +{question} + +## INSTRUCTIONS +1. Answer ONLY based on the provided context above. +2. If the context contains the complete answer, provide a detailed and thorough response. + +## ANSWER: +""" + + +forDocs = """ +3. Never say phrases like "based on the documentation," "according to the context," or "the information provided." +4. If the answer isn't in the context at all, respond with: "Based on the available documentation, I don't have information about this specific topic." +5. Include relevant code examples from the context when applicable. +6. Format your answer for clarity: + - Use markdown formatting for headings and lists + - Format code in appropriate code blocks with language specification + - Break complex concepts into smaller sections +7. Do not reference external knowledge or make assumptions beyond what's provided in the context. +8. If technical steps are involved, present them as numbered steps. +9. If there are warnings or important notes in the context, highlight them clearly. +10. If the user interacts with you by greetings or thanks, respond politely but keep the focus on the documentation. +11. Answer directly as if this information is your own knowledge, not as if you're referencing documentation. +12. If you don't have enough information to answer confidently, suggest the user check specific relevant documentation pages (use the URLs in the context). 
+""" + +# Add CORS middleware - ADD THIS CODE BLOCK +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # Allows all origins + allow_credentials=True, + allow_methods=["*"], # Allows all methods including OPTIONS + allow_headers=["*"], # Allows all headers +) + + +def detect_conversation_type(query: str) -> tuple[str, bool]: + """ + Detect if the query is a greeting, farewell, or regular question. + + Args: + query: The user's query string + + Returns: + tuple: (message_type, needs_rag) + - message_type: 'greeting', 'farewell', or 'question' + - needs_rag: Whether RAG search is needed + """ + # Normalize query + query_lower = query.lower().strip() + + # Common greetings + greetings = [ + "hello", + "hi", + "hey", + "greetings", + "good morning", + "good afternoon", + "good evening", + "howdy", + "what's up", + "how are you", + "nice to meet you", + "hi there", + "hello there", + ] + + # Common farewells + farewells = [ + "bye", + "goodbye", + "see you", + "later", + "take care", + "farewell", + "have a good day", + "have a nice day", + "until next time", + "thanks", + "thank you", + "thanks a lot", + "appreciate it", + "cya", + ] + + # Check if query is just a greeting + for greeting in greetings: + if query_lower == greeting or query_lower.startswith(greeting + " "): + return "greeting", False + + # Check if query is just a farewell + for farewell in farewells: + if query_lower == farewell or query_lower.startswith(farewell + " "): + return "farewell", False + + # Otherwise it's a question that needs RAG + return "question", True + + +# Initialize embedding model and ChromaDB +@app.on_event("startup") +async def startup_db_client(): + """Initialize the embedding model and ChromaDB on startup.""" + app.embedding_function = GoogleGenerativeAIEmbeddings( + model="models/text-embedding-004" + ) + + # Log Secrets + logger.debug(""" + Google API key: %s, + Github Token: %s, + """, os.getenv("GOOGLE_API_KEY"), os.getenv("GITHUB_TOKEN")) + + try: + app.chroma_db 
def process_document_background(file_path: str, job_id: str):
    """Background task: embed an uploaded PDF into its own Chroma collection.

    Loads the document at ``file_path``, splits it into chunks, embeds the
    chunks with the configured Google embedding model, and persists them in
    a per-upload ChromaDB collection named ``uploaded_<job_id>`` under
    ``CHROMA_DB_PATH/<job_id>``. Progress is reported through the shared
    ``processing_jobs`` dict keyed by ``job_id``.

    Args:
        file_path: Path to the uploaded file saved by the /upload endpoint.
        job_id: Unique identifier for this processing job.

    Note:
        Never raises — failures are recorded in ``processing_jobs`` so the
        /upload/status endpoint can surface them.
    """
    try:
        processing_jobs[job_id] = {
            "status": "processing",
            "message": "Processing document and creating embeddings...",
        }

        # BUG FIX: the original created UPLOADS_DIR/<job_id> here but never
        # used it, leaving an orphan empty directory per upload. Removed.

        # Load the document and split it into overlapping chunks.
        documents = load_documents(file_path)
        chunks = split_text(documents, CHUNK_SIZE, CHUNK_OVERLAP)

        # Embed into a dedicated collection for this upload so queries can
        # target exactly this document (see /query/{job_id}).
        embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL_NAME)
        collection_name = f"uploaded_{job_id}"
        Chroma.from_documents(
            chunks,
            embeddings,
            persist_directory=os.path.join(CHROMA_DB_PATH, job_id),
            collection_name=collection_name,
        )

        processing_jobs[job_id] = {
            "status": "completed",
            "message": f"Successfully processed document with {len(chunks)} chunks",
            "collection": collection_name,
        }

    except Exception as e:
        # Lazy %-style logging, consistent with the rest of this module.
        logger.error("Error processing document: %s", e)
        processing_jobs[job_id] = {
            "status": "failed", "message": f"Error: {str(e)}"}
@app.post("/query/{job_id}", response_model=QueryResponse)
async def query_specific_document(job_id: str, request: QueryRequest):
    """Query a specific uploaded document by its job ID.

    Args:
        job_id: Identifier returned by the /upload endpoint.
        request: Query parameters (query text, k, relevance_threshold,
            max_tokens).

    Returns:
        QueryResponse with the generated answer, source metadata, and the
        relevance scores of the chunks used.

    Raises:
        HTTPException: 404 if the job is unknown, 400 if processing has not
            completed, 500 on any retrieval/generation failure.
    """
    # Validate the job before touching the vector store.
    if job_id not in processing_jobs:
        raise HTTPException(status_code=404, detail=f"Document with ID {job_id} not found")

    job_info = processing_jobs[job_id]
    if job_info["status"] != "completed":
        raise HTTPException(
            status_code=400,
            detail=f"Document processing is not complete. Current status: {job_info['status']}"
        )

    try:
        # Open the per-upload Chroma collection created during processing.
        specific_db = Chroma(
            embedding_function=app.embedding_function,
            persist_directory=os.path.join(CHROMA_DB_PATH, job_id),
            collection_name=job_info.get("collection", f"uploaded_{job_id}")
        )

        query = request.query
        # Lazy %-style logging, consistent with the rest of this module.
        logger.info("Received query for document %s: %s", job_id, query)

        # Short-circuit greetings/farewells — no retrieval needed.
        message_type, _needs_rag = detect_conversation_type(query)
        if message_type in ["greeting", "farewell"]:
            return QueryResponse(
                answer=("👋 Hello! I can answer questions about your uploaded document."
                        if message_type == "greeting" else
                        "Thanks for using the document assistant!"),
                sources=[],
                relevance_scores=[]
            )

        # Retrieve candidate chunks with their relevance scores.
        results = specific_db.similarity_search_with_relevance_scores(
            query=query, k=request.k
        )

        if not results:
            return QueryResponse(
                answer="I couldn't find relevant information in your uploaded document.",
                sources=[],
                relevance_scores=[]
            )

        # Keep only chunks above the caller-supplied relevance threshold.
        relevant_documents = []
        relevant_scores = []
        sources = []

        for doc, score in results:
            if score > request.relevance_threshold:
                relevant_documents.append(doc)
                relevant_scores.append(score)
                if doc.metadata and "source" in doc.metadata:
                    sources.append({
                        "source": doc.metadata["source"],
                        "title": doc.metadata.get("title", "unknown"),
                    })
                else:
                    sources.append({"source": "unknown", "title": "unknown"})

        if not relevant_documents:
            return QueryResponse(
                answer="I couldn't find sufficiently relevant information in your uploaded document.",
                sources=[],
                relevance_scores=[],
            )

        # Assemble the retrieved chunks into the prompt context.
        context_text = "\n\n---\n\n".join(
            [doc.page_content for doc in relevant_documents]
        )

        prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
        prompt = prompt_template.format(context=context_text, question=query)

        # Generate the final answer with the LLM.
        model = ChatGoogleGenerativeAI(
            model=LLM_MODEL_NAME, max_output_tokens=request.max_tokens
        )
        response = model.invoke(prompt)

        return QueryResponse(
            answer=response.content,
            sources=sources,
            relevance_scores=relevant_scores
        )

    except Exception as e:
        logger.error("Error querying document %s: %s", job_id, e)
        # Chain the cause, matching the style used elsewhere in this module.
        raise HTTPException(status_code=500, detail=str(e)) from e
request.query +# logger.info("Received query: %s", query) + +# # Check if the query is a greeting or farewell +# message_type, needs_rag = detect_conversation_type(query) + +# # Handle greeting +# if message_type == "greeting": +# return QueryResponse( +# answer="👋 Hello! I'm your technical documentation assistant. How can I help you with your development questions today?", +# sources=[], +# relevance_scores=[], +# ) + +# # Handle farewell +# if message_type == "farewell": +# return QueryResponse( +# answer="Thanks for using the documentation assistant. If you have more questions later, feel free to ask!", +# sources=[], +# relevance_scores=[], +# ) + +# # Get the relevant documents with relevance scores +# results = app.chroma_db.similarity_search_with_relevance_scores( +# query=query, k=request.k +# ) + +# if not results: +# logger.warning("No relevant documents found in ChromaDB.") + +# # Check if query looks like a question about the documentation +# doc_related_keywords = [ +# "documentation", +# "docs", +# "manual", +# "guide", +# "tutorial", +# "api", +# "reference", +# ] + +# if any(keyword in query.lower() for keyword in doc_related_keywords): +# return QueryResponse( +# answer="I don't have enough information about that in the documentation. You can try rephrasing your question, or check if your question is related to the available documentation topics.", +# sources=[], +# relevance_scores=[], +# ) + +# # More general fallback +# return QueryResponse( +# answer="I'm a technical documentation assistant focused on helping with questions about the documented topics. I don't have information about that specific topic in my knowledge base. 
@app.post("/reload")
async def reload_chroma():
    """Endpoint to rebuild the main ChromaDB knowledge base from scratch.

    Verifies write permission on the database directory, deletes any
    existing database, re-loads the documents from DATA_STORE_PATH,
    re-embeds them, and replaces ``app.chroma_db``.

    Returns:
        dict: a success message, or a structured error payload for common
        permission problems.

    Raises:
        HTTPException: 500 for unrecognized failures.
    """
    try:
        # Use the module-level `logger` consistently (the original mixed
        # the root `logging` module with `logger` in the same function).
        logger.info(
            "Checking ChromaDB directory permissions: %s", CHROMA_DB_PATH)

        # Create parent directory if it doesn't exist.
        os.makedirs(os.path.dirname(CHROMA_DB_PATH), exist_ok=True)

        # Attempt to create a test file to verify write permissions.
        test_file = os.path.join(os.path.dirname(
            CHROMA_DB_PATH), "test_write.txt")
        try:
            with open(test_file, "w") as f:
                f.write("Testing write permissions")
            os.remove(test_file)
            logger.info("Write permissions confirmed for ChromaDB directory")
        except (PermissionError, IOError) as e:
            logger.error("No write permissions for ChromaDB directory: %s", e)
            return {
                "error": "Permission denied",
                "message": f"""Cannot write to {CHROMA_DB_PATH}.
Please check permissions or use a different directory.""",
            }

        # Now proceed with the reload: wipe the old database first.
        if os.path.exists(CHROMA_DB_PATH):
            try:
                logger.warning(
                    "Deleting existing ChromaDB at: %s", CHROMA_DB_PATH)
                shutil.rmtree(CHROMA_DB_PATH)
            except PermissionError as e:
                logger.error("Permission error deleting ChromaDB: %s", e)
                # BUG FIX: the second half of this message was a plain
                # string, so the literal text "{CHROMA_DB_PATH}" (and no
                # separating space) reached the client. Both halves are
                # f-strings now.
                return {
                    "error": "Permission denied",
                    "message": f"Cannot delete existing database at {CHROMA_DB_PATH}. "
                               f"Try running: sudo chmod -R 755 {CHROMA_DB_PATH}",
                }

        # Create the directory with proper permissions.
        os.makedirs(CHROMA_DB_PATH, exist_ok=True)

        # Load and process documents.
        documents = load_documents(DATA_STORE_PATH)
        chunks = split_text(documents, CHUNK_SIZE, CHUNK_OVERLAP)

        logger.info(
            "Loaded %s documents and split into %s chunks.", len(
                documents), len(chunks)
        )

        embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL_NAME)
        app.chroma_db = Chroma.from_documents(
            chunks,
            embeddings,
            persist_directory=CHROMA_DB_PATH,
            collection_name="knowledge_base",
        )

        collection_stats = app.chroma_db._collection.count()
        logger.info("ChromaDB reloaded with %s documents", collection_stats)

        return {
            "message": f"ChromaDB reloaded successfully with {collection_stats} documents"
        }

    except Exception as e:
        logger.error("Error reloading ChromaDB: %s", e)

        # Provide helpful error message for common issues.
        error_msg = str(e).lower()
        if "readonly database" in error_msg:
            return {
                "error": "Read-only database",
                "message": "The database is read-only. Try running these commands:",
                "commands": [
                    f"sudo chown -R $USER {CHROMA_DB_PATH}",
                    f"chmod -R 755 {CHROMA_DB_PATH}",
                    f"rm -f {CHROMA_DB_PATH}/*.lock",
                ],
            }

        raise HTTPException(status_code=500, detail=str(e)) from e
"metadata"], - ) - - -def main(): - try: - load_dotenv() - - # Load environment variables - chroma_db_path = get_env_variable("CHROMA_PATH") - knowledge_base_dir = get_env_variable("DATA_STORE_PATH") - - # Initialize embeddings - embeddings = initialize_embeddings() - logging.info("Embeddings initialized successfully.") - - # Load evaluator - evaluator = load_evaluator("langchain/eval/embeddings/embedding_similarity") - if not evaluator: - raise ValueError("Evaluator not found.") - logging.info("Evaluator loaded successfully.") - - # Load vector store - vector_store = load_vector_store(embeddings, chroma_db_path) - logging.info("Vector store loaded successfully.") - - # Embed query - query = "What do we mean by 'knowledge base'?" - logging.info("Embedding query...") - query_embedding = embeddings.embed_query(query) - if not query_embedding: - raise ValueError("Query embedding failed.") - logging.info("Query embedded successfully.") - - # Perform similarity search - logging.info("Performing similarity search...") - results = perform_similarity_search(vector_store, query_embedding) - if not results: - raise ValueError("Similarity search failed.") - logging.info("Similarity search completed successfully.") - - # Print results - logging.info("Results:") - for result in results: - logging.info( - f"Distance: {result['distance']}, Metadata: {result['metadata']}" - ) - - logging.info("Data store generated successfully.") - - except Exception as e: - logging.error(f"An error occurred: {e}") - - -if __name__ == "__main__": - main() - logging.info("Main function executed successfully.") +import os +import logging +from typing import List, Dict, Any + +from langchain_google_genai import GoogleGenerativeAIEmbeddings +from langchain_community.vectorstores import Chroma +from langchain.evaluation import load_evaluator +from dotenv import load_dotenv +from google import genai + +# Configure logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - 
%(message)s" +) + + +def get_env_variable(var_name: str) -> str: + """Retrieve an environment variable or raise an error if not set.""" + value = os.getenv(var_name) + if not value: + raise ValueError(f"{var_name} environment variable is not set.") + return value + + +def initialize_embeddings() -> GoogleGenerativeAIEmbeddings: + """Initialize the Google Generative AI Embeddings.""" + return GoogleGenerativeAIEmbeddings( + model=genai.Model.GEMINI_1_5, + temperature=0.2, + max_output_tokens=256, + top_k=40, + top_p=0.95, + ) + + +def load_vector_store(embeddings: GoogleGenerativeAIEmbeddings, db_path: str) -> Chroma: + """Load the vector store.""" + return Chroma( + embedding_function=embeddings, + persist_directory=db_path, + collection_name="knowledge_base", + ) + + +def perform_similarity_search( + vector_store: Chroma, query_embedding: List[float], k: int = 5 +) -> List[Dict[str, Any]]: + """Perform a similarity search and return the results.""" + return vector_store.similarity_search_by_vector( + query_embedding, + k=k, + filter=None, + include=["distance", "metadata"], + ) + + +def main(): + try: + load_dotenv() + + # Load environment variables + chroma_db_path = get_env_variable("CHROMA_PATH") + knowledge_base_dir = get_env_variable("DATA_STORE_PATH") + + # Initialize embeddings + embeddings = initialize_embeddings() + logging.info("Embeddings initialized successfully.") + + # Load evaluator + evaluator = load_evaluator("langchain/eval/embeddings/embedding_similarity") + if not evaluator: + raise ValueError("Evaluator not found.") + logging.info("Evaluator loaded successfully.") + + # Load vector store + vector_store = load_vector_store(embeddings, chroma_db_path) + logging.info("Vector store loaded successfully.") + + # Embed query + query = "What do we mean by 'knowledge base'?" 
+ logging.info("Embedding query...") + query_embedding = embeddings.embed_query(query) + if not query_embedding: + raise ValueError("Query embedding failed.") + logging.info("Query embedded successfully.") + + # Perform similarity search + logging.info("Performing similarity search...") + results = perform_similarity_search(vector_store, query_embedding) + if not results: + raise ValueError("Similarity search failed.") + logging.info("Similarity search completed successfully.") + + # Print results + logging.info("Results:") + for result in results: + logging.info( + f"Distance: {result['distance']}, Metadata: {result['metadata']}" + ) + + logging.info("Data store generated successfully.") + + except Exception as e: + logging.error(f"An error occurred: {e}") + + +if __name__ == "__main__": + main() + logging.info("Main function executed successfully.") diff --git a/data_sources/independence.pdf b/data_sources/independence.pdf new file mode 100644 index 0000000..f97ed62 Binary files /dev/null and b/data_sources/independence.pdf differ diff --git a/deploy.sh b/deploy.sh index 53db861..d7890c4 100755 --- a/deploy.sh +++ b/deploy.sh @@ -1,11 +1,11 @@ -#!/bin/bash -set -e - -# Prepare the database for inclusion in Docker image -./prepare-db.sh - -# Build and start the Docker containers -docker-compose up -d - -echo "API is running at http://localhost:8000" +#!/bin/bash +set -e + +# Prepare the database for inclusion in Docker image +./prepare-db.sh + +# Build and start the Docker containers +docker-compose up -d + +echo "API is running at http://localhost:8000" echo "To test it, try: curl -X POST http://localhost:8000/query -H 'Content-Type: application/json' -d '{\"query\":\"How to install rcg?\"}'" \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml index 07855b5..9f319f4 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,18 +1,18 @@ -services: - api: - build: . 
- container_name: api - ports: - "8000:8000" - volumes: - # Use a named volume for persistent ChromaDB storage - - chroma_data:/app/db/chroma - environment: - - CHROMA_PATH=/app/db/chroma - - DATA_STORE_PATH=/app/db/knowledge_base - - GOOGLE_API_KEY=${GOOGLE_API_KEY} - restart: unless-stopped - -volumes: - chroma_data: - driver: local +services: + api: + build: . + container_name: api + ports: + - "8000:8000" + volumes: + # Use a named volume for persistent ChromaDB storage + - chroma_data:/app/db/chroma + environment: + - CHROMA_PATH=/app/db/chroma + - DATA_STORE_PATH=/app/db/knowledge_base + # SECURITY: never commit a literal API key here — the previously + # committed key must be revoked; interpolate from the host env instead. + - GOOGLE_API_KEY=${GOOGLE_API_KEY} + restart: unless-stopped + +volumes: + chroma_data: + driver: local diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index dcb5137..ce6e174 100644 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -1,28 +1,28 @@ -#!/bin/bash -set -e - -# Print Python and environment information -python --version -echo "Starting API in environment: $APP_NAME" - -# Check for required environment variables -if [ -z "$GOOGLE_API_KEY" ]; then - echo "WARNING: GOOGLE_API_KEY is not set" -fi - -# Print Python and environment information -python --version -echo "Starting API in environment: $APP_NAME" - -# Check for the existence of ChromaDB -if [ -d "/app/db/chroma" ] && [ "$(ls -A /app/db/chroma)" ]; then - echo "Found existing ChromaDB" -else - echo "WARNING: ChromaDB not found. The API may need to generate embeddings on first use."
-fi - -# Wait for any dependent services (if added later) -# sleep 5 - -# Execute the CMD +#!/bin/bash +set -e + +# Print Python and environment information +python --version +echo "Starting API in environment: $APP_NAME" + +# Check for required environment variables +if [ -z "$GOOGLE_API_KEY" ]; then + echo "WARNING: GOOGLE_API_KEY is not set" +fi + +# Print Python and environment information +python --version +echo "Starting API in environment: $APP_NAME" + +# Check for the existence of ChromaDB +if [ -d "/app/db/chroma" ] && [ "$(ls -A /app/db/chroma)" ]; then + echo "Found existing ChromaDB" +else + echo "WARNING: ChromaDB not found. The API may need to generate embeddings on first use." +fi + +# Wait for any dependent services (if added later) +# sleep 5 + +# Execute the CMD exec "$@" \ No newline at end of file diff --git a/embeddings.py b/embeddings.py index 18d29b4..f30d760 100644 --- a/embeddings.py +++ b/embeddings.py @@ -1,130 +1,219 @@ -import shutil -import os -import logging -from dotenv import load_dotenv -from langchain_community.document_loaders import DirectoryLoader -from langchain_community.document_loaders import GithubFileLoader -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain.schema import Document -from langchain_google_genai import GoogleGenerativeAIEmbeddings -from langchain_community.vectorstores import Chroma - -# Configure logging -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) - -load_dotenv() - -CHROMA_DB_PATH = os.getenv("CHROMA_PATH") -DATA_STORE_PATH = os.getenv("DATA_STORE_PATH") -GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") -EMBEDDING_MODEL_NAME = "models/text-embedding-004" -CHUNK_SIZE = 1000 -CHUNK_OVERLAP = 500 - - -def _validate_environment_variables(): - """Validates that required environment variables are set.""" - if not CHROMA_DB_PATH: - raise ValueError("CHROMA_PATH environment variable is not set.") - if not DATA_STORE_PATH: - raise 
ValueError("DATA_STORE_PATH environment variable is not set.") - logging.info("Environment variables validated.") - - -def load_documents(directory: str) -> list[Document]: - """Loads documents from the specified directory.""" - logging.info(f"Loading documents from: {directory}") - if not os.path.exists(directory): - raise FileNotFoundError(f"Directory not found: {directory}") - # loader = DirectoryLoader(directory, glob="**/*.md") - loader = GithubFileLoader( - repo="nudgenow/nudge-devdocs", - branch="prod_main", - file_filter=lambda file_path: file_path.endswith(".md") and file_path.startswith("docs/"), - directory=["docs/"], - access_token=GITHUB_TOKEN, - ) - documents = loader.load() - logging.info(f"Loaded {len(documents)} documents from: {directory}") - return documents - - -def split_text( - documents: list[Document], chunk_size: int, chunk_overlap: int -) -> list[Document]: - """Splits documents into smaller chunks.""" - logging.info( - f"Splitting {len(documents)} documents into chunks of size {chunk_size} with overlap {chunk_overlap}." 
- ) - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - length_function=len, - add_start_index=True, - ) - chunks = text_splitter.split_documents(documents) - logging.info(f"Split documents into {len(chunks)} chunks.") - return chunks - - -def create_chroma_db( - chunks: list[Document], persist_directory: str, embedding_model: str -) -> Chroma: - """Creates and persists a ChromaDB from the document chunks.""" - logging.info(f"Creating ChromaDB at: {persist_directory}") - if os.path.exists(persist_directory): - logging.warning(f"Deleting existing ChromaDB at: {persist_directory}") - shutil.rmtree(persist_directory) - - embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model) - db = Chroma.from_documents( - chunks, - embeddings, - persist_directory=persist_directory, - collection_name="knowledge_base", - ) - - logging.info(f"ChromaDB created and persisted to: {persist_directory}") - - # Debug logging - collection_stats = db._collection.count() - logging.debug(f"ChromaDB collection contains {collection_stats} documents") - - if collection_stats == 0: - logging.warning( - "ChromaDB is empty after creation. Please check the document loading and splitting process." 
- ) - else: - logging.info(f"ChromaDB contains {collection_stats} documents after creation.") - - return db - - -def generate_data_store( - data_store_path: str, - chroma_db_path: str, - embedding_model_name: str, - chunk_size: int, - chunk_overlap: int, -): - """Orchestrates the process of loading, splitting, and saving data to ChromaDB.""" - logging.info("Starting data store generation.") - try: - documents = load_documents(data_store_path) - chunks = split_text(documents, chunk_size, chunk_overlap) - create_chroma_db(chunks, chroma_db_path, embedding_model_name) - logging.info("Data store generation complete successfully.") - except Exception as e: - logging.error( - f"An error occurred during data store generation: {e}", exc_info=True - ) - - -if __name__ == "__main__": - _validate_environment_variables() - generate_data_store( - DATA_STORE_PATH, CHROMA_DB_PATH, EMBEDDING_MODEL_NAME, CHUNK_SIZE, CHUNK_OVERLAP - ) - logging.info("Data store generation script finished.") +import sys +import shutil +import os +import logging +from dotenv import load_dotenv +from langchain_community.document_loaders import GithubFileLoader, PyPDFLoader, DirectoryLoader +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain.schema import Document +from langchain_google_genai import GoogleGenerativeAIEmbeddings +from langchain_community.vectorstores import Chroma + + +# Configure logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) + +load_dotenv() + +CHROMA_DB_PATH = os.getenv("CHROMA_PATH") +DATA_STORE_PATH = os.getenv("DATA_STORE_PATH") +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") +EMBEDDING_MODEL_NAME = "models/text-embedding-004" +CHUNK_SIZE = 1000 +CHUNK_OVERLAP = 500 +BASE_URL = "https://docs.nudgenow.com/" + + +def _validate_environment_variables(): + """Validates that required environment variables are set.""" + if not CHROMA_DB_PATH: + raise ValueError("CHROMA_PATH environment variable is not 
def load_documents(repository: str) -> list[Document]:
    """Load PDF documents from a single file or from a directory tree.

    Args:
        repository: Path to a single ``.pdf`` file, or a directory that is
            scanned recursively for ``**/*.pdf``.

    Returns:
        A list of Documents (one per PDF page). Each document gets
        ``title`` metadata derived from its filename, and — when BASE_URL
        is set — a ``url`` pointing at the docs site. Pages without a
        ``source`` get synthetic ``title``/``source`` values.

    Raises:
        FileNotFoundError: If ``repository`` does not exist.
    """
    logging.info(f"Loading documents from repository: {repository}")

    if not os.path.exists(repository):
        raise FileNotFoundError(
            f"Repository directory not found: {repository}")

    # Check if it's a single PDF file.
    if os.path.isfile(repository) and repository.lower().endswith('.pdf'):
        logging.info(f"Loading single PDF file: {repository}")
        loader = PyPDFLoader(repository)
        documents = loader.load()
        logging.info(f"Loaded 1 PDF document with {len(documents)} pages")
    else:
        # Load all PDFs from the directory recursively.
        logging.info(f"Loading all PDFs from directory: {repository}")
        loader = DirectoryLoader(
            repository,
            glob="**/*.pdf",
            loader_cls=PyPDFLoader
        )
        documents = loader.load()
        logging.info(f"Loaded {len(documents)} PDF pages from directory")

    # BUG FIX: the original called `loader.load()` a second time here,
    # re-parsing every PDF and discarding the first pass. Removed.

    # Process the metadata to create titles and reference URLs.
    for i, doc in enumerate(documents):
        if "source" in doc.metadata:
            filename = os.path.basename(doc.metadata["source"])
            base_name = os.path.splitext(filename)[0]
            clean_title = base_name.replace('-', ' ').replace('_', ' ').title()

            # Keep an existing title if the loader supplied one.
            doc.metadata['title'] = doc.metadata.get('title', clean_title)

            if BASE_URL:
                # BUG FIX: BASE_URL already ends with '/', so the original
                # f"{BASE_URL}/{base_name}" produced a double slash.
                doc.metadata['url'] = f"{BASE_URL.rstrip('/')}/{base_name}"
        else:
            # No source in metadata: fall back to positional identifiers.
            doc.metadata['title'] = f"Document {i+1}"
            doc.metadata['source'] = f"page_{i+1}"

    logging.info(f"Processed metadata for {len(documents)} documents")

    logging.info(
        f"Loaded {len(documents)} documents from repository: {repository}\n\n"
    )

    return documents
+ ) + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + length_function=len, + add_start_index=True, + ) + chunks = text_splitter.split_documents(documents) + logging.info(f"Split documents into {len(chunks)} chunks.") + return chunks + + +def create_chroma_db( + chunks: list[Document], persist_directory: str, embedding_model: str +) -> Chroma: + """Creates and persists a ChromaDB from the document chunks.""" + logging.info(f"Creating ChromaDB at: {persist_directory}") + if os.path.exists(persist_directory): + logging.warning(f"Deleting existing ChromaDB at: {persist_directory}") + shutil.rmtree(persist_directory) + + embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model) + + db = Chroma.from_documents( + chunks, + embeddings, + persist_directory=persist_directory, + collection_name="knowledge_base", + ) + + logging.info(f"ChromaDB created and persisted to: {persist_directory}") + + # Debug logging + collection_stats = db._collection.count() + logging.debug(f"ChromaDB collection contains {collection_stats} documents") + + if collection_stats == 0: + logging.warning( + "ChromaDB is empty after creation. Please check the document loading and splitting process." 
+ ) + else: + logging.info( + f"ChromaDB contains {collection_stats} documents after creation.") + + return db + + +def generate_data_store( + data_store_path: str, + chroma_db_path: str, + embedding_model_name: str, + chunk_size: int, + chunk_overlap: int, +): + """Orchestrates the process of loading, splitting, and saving data to ChromaDB.""" + logging.info("Starting data store generation.") + try: + documents = load_documents(data_store_path) + chunks = split_text(documents, chunk_size, chunk_overlap) + create_chroma_db(chunks, chroma_db_path, embedding_model_name) + logging.info("Data store generation complete successfully.") + except Exception as e: + logging.error( + f"An error occurred during data store generation: {e}", exc_info=True + ) + + +if __name__ == "__main__": + _validate_environment_variables() + generate_data_store( + DATA_STORE_PATH, CHROMA_DB_PATH, EMBEDDING_MODEL_NAME, CHUNK_SIZE, CHUNK_OVERLAP + ) + logging.info("Data store generation script finished.") diff --git a/main.py b/main.py index 9fe4947..c2ebb75 100644 --- a/main.py +++ b/main.py @@ -1,178 +1,178 @@ -import os -import logging - -from dotenv import load_dotenv -from langchain_google_genai import GoogleGenerativeAIEmbeddings -from langchain_chroma import Chroma -from langchain.prompts import ChatPromptTemplate -from langchain_google_genai import ChatGoogleGenerativeAI -from langchain_core.documents import Document - -load_dotenv() - -# Configure logging -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) - -CHROMA_DB_PATH = os.getenv("CHROMA_PATH") -DATA_STORE_PATH = os.getenv("DATA_STORE_PATH") -RESPONSE_FILE_PATH = os.path.join("", "response.md") - -if not CHROMA_DB_PATH: - raise ValueError("CHROMA_PATH environment variable is not set.") -if not DATA_STORE_PATH: - raise ValueError("DATA_STORE_PATH environment variable is not set.") - -# Set default LLM model -LLM_MODEL_NAME = "gemini-1.5-pro" # Use gemini-pro as default model - 
-PROMPT_TEMPLATE = """ - Answer the question as detailed as possible and with appropriate code examples from the provided context, - make sure to provide all the details, if the answer is not in the provided context just say, "Sorry, can't help you with that". - Strictly follow the context and do not add any extra information. Do not repeat the question in your answer. - - Context: {context} \n - Question: {question} \n - - Answer: - """ - - -def ensure_directories_exist(): - """Ensure that required directories exist.""" - os.makedirs(CHROMA_DB_PATH, exist_ok=True) - os.makedirs(DATA_STORE_PATH, exist_ok=True) - - -def main(): - # Ensure directories exist - ensure_directories_exist() - - # Check if the data directory has any files - has_files = False - if os.path.exists(DATA_STORE_PATH): - for root, dirs, files in os.walk(DATA_STORE_PATH): - if any(file.endswith((".txt", ".md", ".pdf")) for file in files): - has_files = True - break - - if not has_files: - logging.warning( - f"No documents found in {DATA_STORE_PATH}. Please add some documents first." - ) - return - - # Check if embeddings need to be generated - if not os.path.exists(CHROMA_DB_PATH) or not os.listdir(CHROMA_DB_PATH): - logging.info("No ChromaDB found. 
Running embedding generation...") - # Import here to avoid circular imports - from embeddings import ( - generate_data_store, - EMBEDDING_MODEL_NAME, - CHUNK_SIZE, - CHUNK_OVERLAP, - ) - - generate_data_store( - DATA_STORE_PATH, - CHROMA_DB_PATH, - EMBEDDING_MODEL_NAME, - CHUNK_SIZE, - CHUNK_OVERLAP, - ) - - embedding_function = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") - - try: - chroma_db = Chroma( - embedding_function=embedding_function, - persist_directory=CHROMA_DB_PATH, - collection_name="knowledge_base", - ) - - # Debug: Check if ChromaDB has documents - collection_stats = chroma_db._collection.count() - logging.info(f"ChromaDB collection contains {collection_stats} documents") - - if collection_stats == 0: - logging.warning( - "ChromaDB is empty. Please ensure documents are loaded correctly." - ) - return - - # Example query - query = "What do we mean by 'knowledge base'?" - - # Get the relevant documents with relevance scores - results = chroma_db.similarity_search_with_relevance_scores(query=query, k=5) - - # Log the raw response - logging.info( - f"Raw ChromaDB Response: {len(results)} results found. With scores: {[result[1] for result in results]}" - ) - - if not results: - logging.warning("No relevant documents found in ChromaDB.") - return - - # Extract the relevant documents and scores but only allow documents with score > 0.4 - relevant_documents = [] - relevant_scores = [] - - for doc, score in results: - if score > 0.4: - relevant_documents.append(doc) - relevant_scores.append(score) - else: - logging.warning( - f"Document with score {score} is below threshold and will not be included." 
- ) - continue - - if not relevant_documents: - logging.warning("No relevant documents found after filtering by score.") - return - - # Log the extracted documents and scores - for i, doc in enumerate(relevant_documents): - logging.info( - f"Retrieved Document {i+1}:\nContent: {doc.page_content}\nScore: {relevant_scores[i]}" - ) - - # Format the context for the prompt - context_text = "\n\n---\n\n".join( - [doc.page_content for doc in relevant_documents] - ) - - # Create the prompt - prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE) - prompt = prompt_template.format(context=context_text, question=query) - - logging.info(f"Formatted Prompt:\n{prompt}") - - # Send the prompt to the Google Generative AI API with proper model parameter - try: - # Initialize the model with required parameters - model = ChatGoogleGenerativeAI(model=LLM_MODEL_NAME) - response = model.invoke(prompt) - logging.info(f"Response from Google Generative AI: {response}") - - # Log the response to response file - response_file_path = os.path.join("responses", f"response-{response.id}.md") - with open(response_file_path, "w") as response_file: - response_file.write(response.content) - logging.info(f"Response logged to {response_file_path}") - except ValueError as ve: - logging.error(f"ValueError: {ve}") - raise - except Exception as e: - logging.error(f"Error calling Google Generative AI: {e}") - raise - - except Exception as e: - logging.error(f"Error in main: {e}, traceback: {e.__traceback__}") - - -if __name__ == "__main__": - main() +import os +import logging + +from dotenv import load_dotenv +from langchain_google_genai import GoogleGenerativeAIEmbeddings +from langchain_chroma import Chroma +from langchain.prompts import ChatPromptTemplate +from langchain_google_genai import ChatGoogleGenerativeAI +from langchain_core.documents import Document + +load_dotenv() + +# Configure logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - 
%(message)s" +) + +CHROMA_DB_PATH = os.getenv("CHROMA_PATH") +DATA_STORE_PATH = os.getenv("DATA_STORE_PATH") +RESPONSE_FILE_PATH = os.path.join("", "response.md") + +if not CHROMA_DB_PATH: + raise ValueError("CHROMA_PATH environment variable is not set.") +if not DATA_STORE_PATH: + raise ValueError("DATA_STORE_PATH environment variable is not set.") + +# Set default LLM model +LLM_MODEL_NAME = "gemini-1.5-pro" # Use gemini-pro as default model + +PROMPT_TEMPLATE = """ + Answer the question as detailed as possible and with appropriate code examples from the provided context, + make sure to provide all the details, if the answer is not in the provided context just say, "Sorry, can't help you with that". + Strictly follow the context and do not add any extra information. Do not repeat the question in your answer. + + Context: {context} \n + Question: {question} \n + + Answer: + """ + + +def ensure_directories_exist(): + """Ensure that required directories exist.""" + os.makedirs(CHROMA_DB_PATH, exist_ok=True) + os.makedirs(DATA_STORE_PATH, exist_ok=True) + + +def main(): + # Ensure directories exist + ensure_directories_exist() + + # Check if the data directory has any files + has_files = False + if os.path.exists(DATA_STORE_PATH): + for root, dirs, files in os.walk(DATA_STORE_PATH): + if any(file.endswith((".txt", ".md", ".pdf")) for file in files): + has_files = True + break + + if not has_files: + logging.warning( + f"No documents found in {DATA_STORE_PATH}. Please add some documents first." + ) + return + + # Check if embeddings need to be generated + if not os.path.exists(CHROMA_DB_PATH) or not os.listdir(CHROMA_DB_PATH): + logging.info("No ChromaDB found. 
Running embedding generation...") + # Import here to avoid circular imports + from embeddings import ( + generate_data_store, + EMBEDDING_MODEL_NAME, + CHUNK_SIZE, + CHUNK_OVERLAP, + ) + + generate_data_store( + DATA_STORE_PATH, + CHROMA_DB_PATH, + EMBEDDING_MODEL_NAME, + CHUNK_SIZE, + CHUNK_OVERLAP, + ) + + embedding_function = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") + + try: + chroma_db = Chroma( + embedding_function=embedding_function, + persist_directory=CHROMA_DB_PATH, + collection_name="knowledge_base", + ) + + # Debug: Check if ChromaDB has documents + collection_stats = chroma_db._collection.count() + logging.info(f"ChromaDB collection contains {collection_stats} documents") + + if collection_stats == 0: + logging.warning( + "ChromaDB is empty. Please ensure documents are loaded correctly." + ) + return + + # Example query + query = "What do we mean by 'knowledge base'?" + + # Get the relevant documents with relevance scores + results = chroma_db.similarity_search_with_relevance_scores(query=query, k=5) + + # Log the raw response + logging.info( + f"Raw ChromaDB Response: {len(results)} results found. With scores: {[result[1] for result in results]}" + ) + + if not results: + logging.warning("No relevant documents found in ChromaDB.") + return + + # Extract the relevant documents and scores but only allow documents with score > 0.4 + relevant_documents = [] + relevant_scores = [] + + for doc, score in results: + if score > 0.4: + relevant_documents.append(doc) + relevant_scores.append(score) + else: + logging.warning( + f"Document with score {score} is below threshold and will not be included." 
+ ) + continue + + if not relevant_documents: + logging.warning("No relevant documents found after filtering by score.") + return + + # Log the extracted documents and scores + for i, doc in enumerate(relevant_documents): + logging.info( + f"Retrieved Document {i+1}:\nContent: {doc.page_content}\nScore: {relevant_scores[i]}" + ) + + # Format the context for the prompt + context_text = "\n\n---\n\n".join( + [doc.page_content for doc in relevant_documents] + ) + + # Create the prompt + prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE) + prompt = prompt_template.format(context=context_text, question=query) + + logging.info(f"Formatted Prompt:\n{prompt}") + + # Send the prompt to the Google Generative AI API with proper model parameter + try: + # Initialize the model with required parameters + model = ChatGoogleGenerativeAI(model=LLM_MODEL_NAME) + response = model.invoke(prompt) + logging.info(f"Response from Google Generative AI: {response}") + + # Log the response to response file + response_file_path = os.path.join("responses", f"response-{response.id}.md") + with open(response_file_path, "w") as response_file: + response_file.write(response.content) + logging.info(f"Response logged to {response_file_path}") + except ValueError as ve: + logging.error(f"ValueError: {ve}") + raise + except Exception as e: + logging.error(f"Error calling Google Generative AI: {e}") + raise + + except Exception as e: + logging.error(f"Error in main: {e}, traceback: {e.__traceback__}") + + +if __name__ == "__main__": + main() diff --git a/prepare-db.sh b/prepare-db.sh index 3bb2d58..0cb3eed 100755 --- a/prepare-db.sh +++ b/prepare-db.sh @@ -1,19 +1,19 @@ -#!/bin/bash -set -e - -# Fetch the latest docs from source - - -# Ensure the ChromaDB exists locally -if [ ! -d "db/chroma" ] || [ ! "$(ls -A db/chroma)" ]; then - echo "No ChromaDB found. Generating embeddings..." 
- # Run the Python script to generate embeddings - python embeddings.py -else - echo "Using existing ChromaDB from db/chroma" -fi - -# Make sure permissions are correct for Docker -chmod -R 755 db/chroma - +#!/bin/bash +set -e + +# Fetch the latest docs from source + + +# Ensure the ChromaDB exists locally +if [ ! -d "db/chroma" ] || [ ! "$(ls -A db/chroma)" ]; then + echo "No ChromaDB found. Generating embeddings..." + # Run the Python script to generate embeddings + python embeddings.py +else + echo "Using existing ChromaDB from db/chroma" +fi + +# Make sure permissions are correct for Docker +chmod -R 755 db/chroma + echo "Database prepared for Docker build" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 1d5aad4..97f3698 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,162 +1,162 @@ -## The following requirements were added by pip freeze: -aiofiles==24.1.0 -aiohappyeyeballs==2.6.1 -aiohttp==3.11.16 -aiosignal==1.3.2 -annotated-types==0.7.0 -anyio==4.9.0 -asgiref==3.8.1 -astroid==3.3.9 -attrs==25.3.0 -backoff==2.2.1 -bcrypt==4.3.0 -beautifulsoup4==4.13.3 -black==25.1.0 -build==1.2.2.post1 -cachetools==5.5.2 -certifi==2025.1.31 -cffi==1.17.1 -chardet==5.2.0 -charset-normalizer==3.4.1 -chroma-hnswlib==0.7.6 -chromadb==0.6.3 -click==8.1.8 -coloredlogs==15.0.1 -cryptography==44.0.2 -dataclasses-json==0.6.7 -Deprecated==1.2.18 -dill==0.3.9 -distro==1.9.0 -durationpy==0.9 -emoji==2.14.1 -eval_type_backport==0.2.2 -fastapi==0.115.9 -filelock==3.18.0 -filetype==1.2.0 -flatbuffers==25.2.10 -frozenlist==1.5.0 -fsspec==2025.3.2 -google-ai-generativelanguage==0.6.17 -google-api-core==2.24.2 -google-auth==2.38.0 -google-genai==1.10.0 -googleapis-common-protos==1.69.2 -grpcio==1.71.0 -grpcio-status==1.71.0 -h11==0.14.0 -html5lib==1.1 -httpcore==1.0.7 -httptools==0.6.4 -httpx==0.28.1 -httpx-sse==0.4.0 -huggingface-hub==0.30.2 -humanfriendly==10.0 -idna==3.10 -importlib_metadata==8.6.1 -importlib_resources==6.5.2 -isort==6.0.1 
-joblib==1.4.2 -jsonpatch==1.33 -jsonpointer==3.0.0 -jsonschema==4.23.0 -jsonschema-specifications==2024.10.1 -kubernetes==32.0.1 -langchain==0.3.23 -langchain-chroma==0.2.2 -langchain-community==0.3.21 -langchain-core==0.3.51 -langchain-google-genai==2.1.2 -langchain-text-splitters==0.3.8 -langdetect==1.0.9 -langsmith==0.3.27 -lxml==5.3.2 -Markdown==3.7 -markdown-it-py==3.0.0 -marshmallow==3.26.1 -mccabe==0.7.0 -mdurl==0.1.2 -mmh3==5.1.0 -monotonic==1.6 -mpmath==1.3.0 -multidict==6.2.0 -mypy-extensions==1.0.0 -nest-asyncio==1.6.0 -nltk==3.9.1 -numpy==1.26.4 -oauthlib==3.2.2 -olefile==0.47 -onnxruntime==1.19.0 -opentelemetry-api==1.31.1 -opentelemetry-exporter-otlp-proto-common==1.31.1 -opentelemetry-exporter-otlp-proto-grpc==1.31.1 -opentelemetry-instrumentation==0.52b1 -opentelemetry-instrumentation-asgi==0.52b1 -opentelemetry-instrumentation-fastapi==0.52b1 -opentelemetry-proto==1.31.1 -opentelemetry-sdk==1.31.1 -opentelemetry-semantic-conventions==0.52b1 -opentelemetry-util-http==0.52b1 -orjson==3.10.16 -overrides==7.7.0 -packaging==24.2 -pathspec==0.12.1 -platformdirs==4.3.7 -posthog==3.23.0 -propcache==0.3.1 -proto-plus==1.26.1 -protobuf==5.29.4 -psutil==7.0.0 -pyasn1==0.6.1 -pyasn1_modules==0.4.2 -pycparser==2.22 -pydantic==2.11.3 -pydantic-settings==2.8.1 -pydantic_core==2.33.1 -Pygments==2.19.1 -pylint==3.3.6 -pypdf==5.4.0 -PyPika==0.48.9 -pyproject_hooks==1.2.0 -python-dateutil==2.9.0.post0 -python-dotenv==1.1.0 -python-iso639==2025.2.18 -python-magic==0.4.27 -python-oxmsg==0.0.2 -PyYAML==6.0.2 -RapidFuzz==3.13.0 -referencing==0.36.2 -regex==2024.11.6 -requests==2.32.3 -requests-oauthlib==2.0.0 -requests-toolbelt==1.0.0 -rich==14.0.0 -rpds-py==0.24.0 -rsa==4.9 -shellingham==1.5.4 -six==1.17.0 -sniffio==1.3.1 -soupsieve==2.6 -SQLAlchemy==2.0.40 -starlette==0.45.3 -sympy==1.13.3 -tenacity==9.1.2 -tokenizers==0.21.1 -tomlkit==0.13.2 -tqdm==4.67.1 -typer==0.15.2 -typing-inspect==0.9.0 -typing-inspection==0.4.0 -typing_extensions==4.13.1 -unstructured==0.17.2 
-unstructured-client==0.32.3 -urllib3==2.3.0 -uvicorn==0.34.0 -uvloop==0.21.0 -watchfiles==1.0.5 -webencodings==0.5.1 -websocket-client==1.8.0 -websockets==15.0.1 -wrapt==1.17.2 -yarl==1.19.0 -zipp==3.21.0 -zstandard==0.23.0 +## The following requirements were added by pip freeze: +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.16 +aiosignal==1.3.2 +annotated-types==0.7.0 +anyio==4.9.0 +asgiref==3.8.1 +astroid==3.3.9 +attrs==25.3.0 +backoff==2.2.1 +bcrypt==4.3.0 +beautifulsoup4==4.13.3 +black==25.1.0 +build==1.2.2.post1 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +chardet==5.2.0 +charset-normalizer==3.4.1 +chroma-hnswlib==0.7.6 +chromadb==0.6.3 +click==8.1.8 +coloredlogs==15.0.1 +cryptography==44.0.2 +dataclasses-json==0.6.7 +Deprecated==1.2.18 +dill==0.3.9 +distro==1.9.0 +durationpy==0.9 +emoji==2.14.1 +eval_type_backport==0.2.2 +fastapi==0.115.9 +filelock==3.18.0 +filetype==1.2.0 +flatbuffers==25.2.10 +frozenlist==1.5.0 +fsspec==2025.3.2 +google-ai-generativelanguage==0.6.17 +google-api-core==2.24.2 +google-auth==2.38.0 +google-genai==1.10.0 +googleapis-common-protos==1.69.2 +grpcio==1.71.0 +grpcio-status==1.71.0 +h11==0.14.0 +html5lib==1.1 +httpcore==1.0.7 +httptools==0.6.4 +httpx==0.28.1 +httpx-sse==0.4.0 +huggingface-hub==0.30.2 +humanfriendly==10.0 +idna==3.10 +importlib_metadata==8.6.1 +importlib_resources==6.5.2 +isort==6.0.1 +joblib==1.4.2 +jsonpatch==1.33 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kubernetes==32.0.1 +langchain==0.3.23 +langchain-chroma==0.2.2 +langchain-community==0.3.21 +langchain-core==0.3.51 +langchain-google-genai==2.1.2 +langchain-text-splitters==0.3.8 +langdetect==1.0.9 +langsmith==0.3.27 +lxml==5.3.2 +Markdown==3.7 +markdown-it-py==3.0.0 +marshmallow==3.26.1 +mccabe==0.7.0 +mdurl==0.1.2 +mmh3==5.1.0 +monotonic==1.6 +mpmath==1.3.0 +multidict==6.2.0 +mypy-extensions==1.0.0 +nest-asyncio==1.6.0 +nltk==3.9.1 +numpy==1.26.4 +oauthlib==3.2.2 +olefile==0.47 +onnxruntime==1.19.0 
+opentelemetry-api==1.31.1 +opentelemetry-exporter-otlp-proto-common==1.31.1 +opentelemetry-exporter-otlp-proto-grpc==1.31.1 +opentelemetry-instrumentation==0.52b1 +opentelemetry-instrumentation-asgi==0.52b1 +opentelemetry-instrumentation-fastapi==0.52b1 +opentelemetry-proto==1.31.1 +opentelemetry-sdk==1.31.1 +opentelemetry-semantic-conventions==0.52b1 +opentelemetry-util-http==0.52b1 +orjson==3.10.16 +overrides==7.7.0 +packaging==24.2 +pathspec==0.12.1 +platformdirs==4.3.7 +posthog==3.23.0 +propcache==0.3.1 +proto-plus==1.26.1 +protobuf==5.29.4 +psutil==7.0.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 +pycparser==2.22 +pydantic==2.11.3 +pydantic-settings==2.8.1 +pydantic_core==2.33.1 +Pygments==2.19.1 +pylint==3.3.6 +pypdf==5.4.0 +PyPika==0.48.9 +pyproject_hooks==1.2.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-iso639==2025.2.18 +python-magic==0.4.27 +python-oxmsg==0.0.2 +PyYAML==6.0.2 +RapidFuzz==3.13.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +requests-oauthlib==2.0.0 +requests-toolbelt==1.0.0 +rich==14.0.0 +rpds-py==0.24.0 +rsa==4.9 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +soupsieve==2.6 +SQLAlchemy==2.0.40 +starlette==0.45.3 +sympy==1.13.3 +tenacity==9.1.2 +tokenizers==0.21.1 +tomlkit==0.13.2 +tqdm==4.67.1 +typer==0.15.2 +typing-inspect==0.9.0 +typing-inspection==0.4.0 +typing_extensions==4.13.1 +unstructured==0.17.2 +unstructured-client==0.32.3 +urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +watchfiles==1.0.5 +webencodings==0.5.1 +websocket-client==1.8.0 +websockets==15.0.1 +wrapt==1.17.2 +yarl==1.19.0 +zipp==3.21.0 +zstandard==0.23.0 diff --git a/uploads/5ede19ca-74ca-4fe6-977f-657d1f8575ea.pdf b/uploads/5ede19ca-74ca-4fe6-977f-657d1f8575ea.pdf new file mode 100644 index 0000000..c0a829c Binary files /dev/null and b/uploads/5ede19ca-74ca-4fe6-977f-657d1f8575ea.pdf differ diff --git a/uploads/607f2de3-52db-46d5-89e3-468eeee1f416.pdf b/uploads/607f2de3-52db-46d5-89e3-468eeee1f416.pdf new file mode 100644 index 
0000000..27cb960 Binary files /dev/null and b/uploads/607f2de3-52db-46d5-89e3-468eeee1f416.pdf differ diff --git a/uploads/8f1b49eb-f9b7-4f82-8e88-86e41f7d82f2.pdf b/uploads/8f1b49eb-f9b7-4f82-8e88-86e41f7d82f2.pdf new file mode 100644 index 0000000..c0a829c Binary files /dev/null and b/uploads/8f1b49eb-f9b7-4f82-8e88-86e41f7d82f2.pdf differ diff --git a/uploads/9fb8c345-1296-42fe-9257-acc85d8e1fdf.pdf b/uploads/9fb8c345-1296-42fe-9257-acc85d8e1fdf.pdf new file mode 100644 index 0000000..f97ed62 Binary files /dev/null and b/uploads/9fb8c345-1296-42fe-9257-acc85d8e1fdf.pdf differ diff --git a/weav_client.py b/weav_client.py new file mode 100644 index 0000000..cc4fec2 --- /dev/null +++ b/weav_client.py @@ -0,0 +1,34 @@ +import weaviate +from weaviate.classes.init import Auth +from dotenv import load_dotenv +import os + +load_dotenv() + +# Best practice: store your credentials in environment variables +weaviate_url = os.environ["WEAVIATE_REST_ENDPOINT"] +weaviate_api_key = os.environ["WEAVIATE_ADMIN_API_KEY"] + +# Connect to Weaviate Cloud +client = weaviate.connect_to_weaviate_cloud( + cluster_url=weaviate_url, + auth_credentials=Auth.api_key(weaviate_api_key), +) + +print(client.is_ready()) # Should print: `True` + +print(client.collections.list_all()) # Should print: `['GitBookChunk']` + +try: + chunks = client.collections.get("GitBookChunk") + + results = chunks.query.near_text( + query="What is the history of git?", + limit=3, + ) + +except weaviate.exceptions.UnexpectedStatusCodeException as e: + print(f"An error occurred: {e}") +finally: + client.close() + print("Connection closed.")