From 06e3e1a8bf3263ef59c64c129d71f3b450f53956 Mon Sep 17 00:00:00 2001 From: mainuddinMAins Date: Wed, 25 Mar 2026 11:18:33 -0500 Subject: [PATCH 1/3] docs: reorganize README with better structure for setup and running instructions --- README.md | 409 +++++++++++++++++++++++------------------------------- 1 file changed, 176 insertions(+), 233 deletions(-) diff --git a/README.md b/README.md index 523ade83..956c569c 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,31 @@ BAIO (Bioinformatics AI for Open-set detection) is a web-based metagenomic analy --- +## ⚡ Quick Start (2 Minutes) + +### Option A: Docker (Easiest) +```bash +docker compose up +# Frontend: http://localhost:4173 +# Backend: http://localhost:8080 +``` + +### Option B: Local Development +```bash +# Terminal 1 - Backend +source .venv/bin/activate +uvicorn api.main:app --reload --port 8080 + +# Terminal 2 - Frontend +cd frontend && npm install && npm run dev +# Frontend: http://localhost:5173 +# API: http://localhost:8080 +``` + +⚠️ **First time?** Make sure `.env` has `GOOGLE_API_KEY` set. [Get API Key](https://makersuite.google.com/app/apikey) + +--- + ## Features - **Sequence Classification**: Classifies DNA sequences into Virus or Host categories @@ -19,61 +44,6 @@ BAIO (Bioinformatics AI for Open-set detection) is a web-based metagenomic analy --- -## Tech Stack - -| Layer | Technology | -|-------|------------| -| Frontend | React + Vite + TypeScript + Tailwind CSS | -| Backend | FastAPI + Python 3.12 | -| ML | Scikit-learn (SVM, RandomForest) + optional research dependencies for embedding experiments | -| AI | Google Gemini API | -| DevOps | Docker, GitHub Actions, pytest | - ---- - -## Code File Guide - -This guide explains what each main file does - easy to understand for developers. 
- -### Backend (API) - -| File | What it does | -|------|--------------| -| `api/main.py` | Main FastAPI server - handles all API requests like `/classify`, `/chat`, `/health` | -| `api/llm_client.py` | Connects to Google Gemini AI for the chat assistant | - -### Frontend (React UI) - -| File | What it does | -|------|--------------| -| `frontend/src/App.tsx` | Main React component - ties everything together | -| `frontend/src/components/Header.tsx` | Top navigation bar with AI Assistant, dark mode, health status | -| `frontend/src/components/SequenceInput.tsx` | Input form for pasting DNA sequences or uploading FASTA files | -| `frontend/src/components/ConfigPanel.tsx` | Settings panel (threshold, model selection, OOD toggle) | -| `frontend/src/components/ResultsDashboard.tsx` | Shows classification results, tables, charts, export options | -| `frontend/src/components/ChatWidget.tsx` | AI Assistant floating chat window (legacy - now in Header) | -| `frontend/src/api.ts` | Functions to call backend API endpoints | - -### Machine Learning - -| File | What it does | -|------|--------------| -| `binary_classifiers/predict_class.py` | Core classification logic - takes DNA sequence, returns Virus/Host prediction | -| `binary_classifiers/transformers/kmers_transformer.py` | Converts raw DNA into overlapping 6-mers | -| `binary_classifiers/evaluation.py` | Loads labeled data and computes classifier metrics | -| `retrain_model.py` | Retrains saved SVM and RandomForest artifacts from local FASTA files | -| `metaseq/train.py` | Separate configurable training pipeline for experiments and future consolidation | - -### Root Scripts - -| File | What it does | -|------|--------------| -| `retrain_model.py` | Standalone script to retrain the ML model | -| `predict_class.py` | Quick prediction script for testing | -| `scripts/evaluate_binary_classifier.py` | Evaluates the deployed model on labeled host/virus FASTA or FASTQ data | - ---- - ## Project Structure ``` @@ -125,7 +95,9 @@ 
baio/ --- -## How The ML Pipeline Works +## 🔬 Advanced Features & Workflows + +### How The ML Pipeline Works 1. **Validate input DNA** The API checks sequence length, allowed nucleotide characters, GC/AT extremes, and ambiguous-base ratio before classification. @@ -199,176 +171,145 @@ To use Evo 2 embeddings instead of k-mer features: --- -## Current Limitations +## 📚 Technical Documentation + +### Tech Stack + +| Layer | Technology | +|-------|------------| +| Frontend | React + Vite + TypeScript + Tailwind CSS | +| Backend | FastAPI + Python 3.12 | +| ML | Scikit-learn (SVM, RandomForest) + optional research dependencies for embedding experiments | +| AI | Google Gemini API | +| DevOps | Docker, GitHub Actions, pytest | + +--- + +### Code File Guide + +This guide explains what each main file does - easy to understand for developers. + +#### Backend (API) + +| File | What it does | +|------|--------------| +| `api/main.py` | Main FastAPI server - handles all API requests like `/classify`, `/chat`, `/health` | +| `api/llm_client.py` | Connects to Google Gemini AI for the chat assistant | -- The default demo retraining data in `data/` is very small: 5 virus reads and 5 host reads. -- The novelty score is heuristic, so "Novel" should be treated as "needs further validation," not proof of a new pathogen. 
+#### Frontend (React UI) + +| File | What it does | +|------|--------------| +| `frontend/src/App.tsx` | Main React component - ties everything together | +| `frontend/src/components/Header.tsx` | Top navigation bar with AI Assistant, dark mode, health status | +| `frontend/src/components/SequenceInput.tsx` | Input form for pasting DNA sequences or uploading FASTA files | +| `frontend/src/components/ConfigPanel.tsx` | Settings panel (threshold, model selection, OOD toggle) | +| `frontend/src/components/ResultsDashboard.tsx` | Shows classification results, tables, charts, export options | +| `frontend/src/components/ChatWidget.tsx` | AI Assistant floating chat window (legacy - now in Header) | +| `frontend/src/api.ts` | Functions to call backend API endpoints | + +#### Machine Learning + +| File | What it does | +|------|--------------| +| `binary_classifiers/predict_class.py` | Core classification logic - takes DNA sequence, returns Virus/Host prediction | +| `binary_classifiers/transformers/kmers_transformer.py` | Converts raw DNA into overlapping 6-mers | +| `binary_classifiers/evaluation.py` | Loads labeled data and computes classifier metrics | +| `retrain_model.py` | Retrains saved SVM and RandomForest artifacts from local FASTA files | +| `metaseq/train.py` | Separate configurable training pipeline for experiments and future consolidation | + +#### Root Scripts + +| File | What it does | +|------|--------------| +| `retrain_model.py` | Standalone script to retrain the ML model | +| `predict_class.py` | Quick prediction script for testing | +| `scripts/evaluate_binary_classifier.py` | Evaluates the deployed model on labeled host/virus FASTA or FASTQ data | --- -## Prerequisites +## Installation & Setup + +### Prerequisites | Requirement | Version | Notes | |-------------|---------|-------| | Python | 3.10+ (3.12 recommended) | Required for backend | | Node.js | 18+ | Required for frontend | | Git | Any recent version | For cloning | -| Conda | Optional | 
Recommended for Python env | | Docker | Optional | For containerized deployment | ---- - -## Dependencies - -### Python Dependencies (Backend) - -**Core:** -- python-dotenv>=1.0 -- numpy>=2.2 -- pandas>=2.2 -- scikit-learn>=1.5 -- matplotlib>=3.9 -- seaborn>=0.13 -- plotly>=5.20 -- tqdm>=4.67 -- pyyaml>=6.0 -- requests>=2.32 - -**Bioinformatics:** -- biopython>=1.85 -- hdbscan>=0.8.39 -- umap-learn==0.5.7 - -**ML/AI:** -- torch>=2.8.0 -- transformers==4.56.1 -- tokenizers==0.22.0 -- accelerate>=0.30 -- datasets>=2.19 -- joblib>=1.3 - -**API:** -- fastapi>=0.115.0 -- uvicorn - -**Testing/Dev:** -- pytest -- pytest-cov -- black -- flake8 -- mypy - -### Node.js Dependencies (Frontend) - -**Dependencies:** -- react^18.3.1 -- react-dom^18.3.1 -- lucide-react^0.562.0 -- clsx^2.1.1 -- tailwind-merge^3.4.0 -- jspdf^4.2.0 - -**Dev Dependencies:** -- vite^6.0.5 -- typescript^5.7.3 -- tailwindcss^3.4.17 -- postcss^8.4.49 -- autoprefixer^10.4.20 -- eslint^9.17.0 -- @vitejs/plugin-react^4.3.3 - ---- - -## Installation - -### Option 1: Conda (Recommended) +### Step 1: Clone Repository ```bash -# 1. Clone the repository git clone https://github.com/oss-slu/baio.git cd baio +``` + +### Step 2: Set Up Environment -# 2. Create conda environment +**Option A: Conda (Recommended)** +```bash conda env create -f environment.yml conda activate baio - -# 3. Verify installation -python --version -conda list | head -20 ``` -### Option 2: Virtual Environment - +**Option B: Python venv** ```bash -# 1. Clone the repository -git clone https://github.com/oss-slu/baio.git -cd baio - -# 2. Create virtual environment -python3 -m venv baio-env - -# 3. Activate -# macOS/Linux: -source baio-env/bin/activate -# Windows: -baio-env\Scripts\activate - -# 4. Install dependencies -pip install --upgrade pip +python3 -m venv .venv +source .venv/bin/activate # macOS/Linux +# .venv\Scripts\activate # Windows pip install -r requirements.txt -pip install fastapi uvicorn - -# 5. 
Verify -python --version -pip list ``` -### Option 3: Docker +### Step 3: Configure API Keys -```bash -# Build and run -docker compose up --build +Create `.env` in project root: +``` +GOOGLE_API_KEY=your_google_api_key_here ``` ---- +[Get Google API Key](https://makersuite.google.com/app/apikey) -## Environment Setup +--- -### 1. Create .env File +## 🚀 Running the Project -Create a `.env` file in the project root: +### Method 1: Docker (Production-Like) ```bash -# .env -GOOGLE_API_KEY=your_google_api_key_here -``` - -### 2. Get Google API Key +# Build and run both frontend and backend +docker compose up --build -1. Go to [Google AI Studio](https://makersuite.google.com/app/apikey) -2. Create a new API key -3. Copy it to your `.env` file +# Run in background +docker compose up -d --build ---- +# Stop +docker compose down +``` -## Running the Project +**Access:** +- 🌐 Frontend: http://localhost:4173 +- 📊 Backend API: http://localhost:8080 +- 📖 API Docs: http://localhost:8080/docs -### Development Mode (Recommended) +--- -**Terminal 1 - Backend:** +### Method 2: Local Development (Recommended for Development) +#### Terminal 1: Start Backend ```bash # Activate environment conda activate baio -# OR (venv): -source baio-env/bin/activate +# OR: source .venv/bin/activate -# Start FastAPI server -python -m uvicorn api.main:app --reload --port 8080 +# Start API server +uvicorn api.main:app --reload --port 8080 ``` -**Terminal 2 - Frontend:** +**Backend ready at:** http://localhost:8080 +#### Terminal 2: Start Frontend ```bash cd frontend @@ -379,25 +320,20 @@ npm install npm run dev ``` -**Access:** -- Frontend: http://localhost:5173 -- API: http://localhost:8080 -- API Docs: http://localhost:8080/docs +**Frontend ready at:** http://localhost:5173 -### Production Mode (Docker) +--- + +### Verify Everything Works ```bash -# Build and start -docker compose up --build +# Check backend health +curl http://localhost:8080/health -# Or run in background -docker compose up -d 
--build +# Frontend should load automatically +# Open: http://localhost:5173 ``` -**Access:** -- Frontend: http://localhost:4173 -- API: http://localhost:8080 - --- ## Usage Guide @@ -432,8 +368,40 @@ docker compose up -d --build --- -## Testing +### Model Training & Retraining + +If you need to retrain the model: +```bash +python retrain_model.py +``` + +This will: +1. Load training data from `data/` +2. Extract k-mer features +3. Train both RandomForest and SVM classifiers +4. Save model and vectorizer artifacts under `binary_classifiers/` + +The default training data in `data/covid_reads5.fasta` and `data/human_reads5.fasta` is only a tiny demo dataset. It is useful for development, but not enough for a robust biological classifier. + +**To evaluate the current saved models on labeled files:** + +```bash +python scripts/evaluate_binary_classifier.py --model RandomForest +python scripts/evaluate_binary_classifier.py --model SVM --output runs/metrics/svm_eval.json +``` + +The evaluation script reports: +- accuracy, precision, recall, F1, and ROC-AUC +- confusion matrix +- per-class report +- misclassified sequence IDs with confidence and virus probability + +--- + +### Testing & Code Quality + +**Run Tests:** ```bash # Activate environment first conda activate baio @@ -448,10 +416,7 @@ pytest tests/test_api_classification.py pytest --cov=. tests/ ``` ---- - -## Code Quality - +**Code Quality:** ```bash # Format code black . @@ -468,37 +433,6 @@ black . && ruff check . && mypy . --- -## Model Training - -If you need to retrain the model: - -```bash -python retrain_model.py -``` - -This will: -1. Load training data from `data/` -2. Extract k-mer features -3. Train both RandomForest and SVM classifiers -4. Save model and vectorizer artifacts under `binary_classifiers/` - -The default training data in `data/covid_reads5.fasta` and `data/human_reads5.fasta` is only a tiny demo dataset. It is useful for development, but not enough for a robust biological classifier. 
- To evaluate the current saved models on labeled files: - -```bash -python scripts/evaluate_binary_classifier.py --model RandomForest -python scripts/evaluate_binary_classifier.py --model SVM --output runs/metrics/svm_eval.json -``` - -The evaluation script reports: -- accuracy, precision, recall, F1, and ROC-AUC -- confusion matrix -- per-class report -- misclassified sequence IDs with confidence and virus probability - ---- - ## Common Issues & Solutions | Issue | Cause | Solution | @@ -514,14 +448,23 @@ The evaluation script reports: --- -## Project URLs +## ⚠️ Current Limitations + +- **Small Demo Dataset**: The default demo retraining data in `data/` is very small: 5 virus reads and 5 host reads. +- **Heuristic Novelty Score**: The novelty score is heuristic-based, so "Novel" should be treated as "needs further validation," not proof of a new pathogen. +- **Limited Scope**: Current models distinguish only Virus vs Host. Multi-class classification is planned for future versions. + +--- + +## Project URLs & Resources | Service | URL | |---------|-----| -| Frontend | http://localhost:5173 | -| API | http://localhost:8080 | +| Frontend (Dev) | http://localhost:5173 | +| Frontend (Docker) | http://localhost:4173 | | API Docs | http://localhost:8080/docs | -| Health Check | http://localhost:8080/health | +| API Health | http://localhost:8080/health | +| GitHub Repository | https://github.com/oss-slu/baio | --- From 419dbb63525c3a7395ff5dd2cf6bacdc0af54c93 Mon Sep 17 00:00:00 2001 From: mainuddinMAins Date: Wed, 25 Mar 2026 11:24:19 -0500 Subject: [PATCH 2/3] chore: update frontend components --- frontend/src/App.tsx | 4 +- frontend/src/components/LandingPage.tsx | 57 +++++++++---------------- frontend/src/index.css | 2 +- 3 files changed, 25 insertions(+), 38 deletions(-) diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index 89442509..c5e907d4 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -91,8 +91,10 @@ function App() { useEffect(() => { if
(darkMode) { document.documentElement.classList.add('dark') + document.body.classList.add('dark') } else { document.documentElement.classList.remove('dark') + document.body.classList.remove('dark') } localStorage.setItem('darkMode', String(darkMode)) }, [darkMode]) @@ -167,7 +169,7 @@ function App() { if (showLanding) { return (
- + setDarkMode(!darkMode)} />
) } diff --git a/frontend/src/components/LandingPage.tsx b/frontend/src/components/LandingPage.tsx index 8f258c0a..479c83bf 100644 --- a/frontend/src/components/LandingPage.tsx +++ b/frontend/src/components/LandingPage.tsx @@ -1,13 +1,15 @@ import { useState } from 'react' -import { Dna, Shield, Zap, FileText, Bot, BarChart3, GitBranch, Code, Database, Brain, ChevronRight, ExternalLink, Github, Play, CheckCircle, AlertTriangle, Users, BookOpen, Activity, FlaskConical, GitPullRequest } from 'lucide-react' +import { Dna, Shield, Zap, FileText, Bot, BarChart3, GitBranch, Code, Database, Brain, ChevronRight, ExternalLink, Github, Play, CheckCircle, AlertTriangle, Activity, FlaskConical, GitPullRequest, Moon, Sun } from 'lucide-react' import { cn } from '../lib/utils' import ArchitectureDiagram from './ArchitectureDiagram' type LandingPageProps = { onGetStarted: () => void + darkMode: boolean + toggleDarkMode: () => void } -export default function LandingPage({ onGetStarted }: LandingPageProps) { +export default function LandingPage({ onGetStarted, darkMode, toggleDarkMode }: LandingPageProps) { const [showArchitecture, setShowArchitecture] = useState(false) if (showArchitecture) { @@ -86,7 +88,23 @@ export default function LandingPage({ onGetStarted }: LandingPageProps) { ] return ( -
+
+ {/* Dark Mode Toggle */} +
+ +
+ {/* Hero Section */}
@@ -332,39 +350,6 @@ export default function LandingPage({ onGetStarted }: LandingPageProps) {
- {/* Contributors Section */} -
-
-

- Our Team -

-

- Open-source project from Saint Louis University -

- -
- {[ - { name: 'Mainuddin', role: 'Tech Lead', icon: Users }, - { name: 'Luis Palmejar', role: 'Developer', icon: Code }, - { name: 'Kevin Yang', role: 'Developer', icon: BookOpen }, - ].map((contributor, idx) => ( -
-
- -
-

{contributor.name}

-

{contributor.role}

-
- ))} -
-
-
{/* CTA Section */}
diff --git a/frontend/src/index.css b/frontend/src/index.css index 3e6c4ee9..32c1d8f1 100644 --- a/frontend/src/index.css +++ b/frontend/src/index.css @@ -21,7 +21,7 @@ body:not(.dark) { @apply text-slate-900; } -body.dark, .dark { +body.dark, .dark, html.dark body { background: #0f172a; @apply text-slate-100; } From 85128c18decd10a84e3beed538e2d29cdf5b55fd Mon Sep 17 00:00:00 2001 From: mainuddinMAins Date: Wed, 25 Mar 2026 11:29:32 -0500 Subject: [PATCH 3/3] refactor: update frontend components and architecture diagram --- frontend/src/components/ArchitectureDiagram.tsx | 2 +- frontend/src/components/LandingPage.tsx | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/frontend/src/components/ArchitectureDiagram.tsx b/frontend/src/components/ArchitectureDiagram.tsx index 0ed4604c..eacd4286 100644 --- a/frontend/src/components/ArchitectureDiagram.tsx +++ b/frontend/src/components/ArchitectureDiagram.tsx @@ -3,7 +3,7 @@ import { cn } from '../lib/utils' export default function ArchitectureDiagram() { return ( -
+

diff --git a/frontend/src/components/LandingPage.tsx b/frontend/src/components/LandingPage.tsx index 479c83bf..ad29b820 100644 --- a/frontend/src/components/LandingPage.tsx +++ b/frontend/src/components/LandingPage.tsx @@ -14,7 +14,7 @@ export default function LandingPage({ onGetStarted, darkMode, toggleDarkMode }: if (showArchitecture) { return ( -
+
+
)