diff --git a/.gitignore b/.gitignore index 710958d..555b42f 100644 --- a/.gitignore +++ b/.gitignore @@ -234,3 +234,9 @@ mlflow_tracking_uri.txt # Large trained model files (can be regenerated with train_and_save_models.py) models/stack_*.pkl # Exclude large stack models but keep ensemble model + +# Remove Mac file +*.DS_Store + +# Do not include pre-commit-output.txt +*.pre-commit-output.txt diff --git a/Makefile b/Makefile index d741b2e..ed048aa 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ # Author: AI Assistant # Date: 2025-07-14 -.PHONY: help install format lint test run train-models dash stop-dash +.PHONY: help install format lint typecheck security check-all test run train-models dash stop-dash # Default target help: @@ -12,7 +12,10 @@ help: @echo "Available targets:" @echo " install - Install dependencies using uv" @echo " format - Format code with ruff" - @echo " lint - Lint code with ruff" + @echo " lint - Lint code with ruff (includes format check)" + @echo " typecheck - Type check with mypy" + @echo " security - Security check with bandit" + @echo " check-all - Run all code quality checks (lint, typecheck, security)" @echo " test - Run tests" @echo " run - Run the modular pipeline" @echo " train-models - Train and save ML models" @@ -32,7 +35,22 @@ format: lint: @echo "๐Ÿ” Linting code with ruff..." - uv run ruff check src/ dash_app/ tests/ scripts/ --output-format=github + uv run ruff check . --fix + uv run ruff format --check . + +# Type checking +typecheck: + @echo "๐Ÿ”Ž Type checking with mypy..." + uv run mypy src/ --ignore-missing-imports + +# Security checking +security: + @echo "๐Ÿ”’ Security checking with bandit..." + uv run bandit -r src/ -f json + +# Run all quality checks +check-all: lint typecheck security + @echo "โœ… All code quality checks completed!" 
# Testing test: diff --git a/README.md b/README.md index 33b31d0..961bf75 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,13 @@ # Six-Stack Personality Classification Pipeline -A state-of-the-art, production-ready machine learning pipeline for personality classification leveraging ensemble learning, advanced data augmentation, and automated hyperparameter optimization. Features a fully modular, maintainable architecture with interactive dashboard. +Production-ready machine learning pipeline for personality classification using ensemble learning, data augmentation, and automated hyperparameter optimization. Modular, maintainable, and includes an interactive dashboard. -## ๐Ÿ”ง Technology Stack +## Technology Stack -**Core ML**: scikit-learn, XGBoost, LightGBM, CatBoost, Optuna -**Data Science**: pandas, numpy, scipy, SDV (synthetic data) -**Dashboard**: Dash, Plotly, Bootstrap components -**DevOps**: Docker, GitHub Actions, pre-commit hooks -**Tools**: uv (package manager), Ruff (linting), mypy (types), Bandit (security) +**ML**: scikit-learn, XGBoost, LightGBM, CatBoost, Optuna +**Data**: pandas, numpy, scipy, SDV +**Dashboard**: Dash, Plotly +**DevOps**: Docker, GitHub Actions, pre-commit, uv, Ruff, mypy, Bandit [![Python](https://img.shields.io/badge/Python-3.11+-blue.svg)](https://python.org) [![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](LICENSE) @@ -16,110 +15,49 @@ A state-of-the-art, production-ready machine learning pipeline for personality c [![Dashboard](https://img.shields.io/badge/Dashboard-Dash-red.svg)](https://plotly.com/dash/) [![Architecture](https://img.shields.io/badge/Architecture-Modular-purple.svg)](#-architecture) -## ๐Ÿ“ฑ Dashboard Preview +## Dashboard Preview
- Dashboard Interface +
- Main dashboard interface with personality feature sliders and input controls -

- Prediction Results -
- Prediction results with confidence visualization and detailed personality insights + Watch a live demo of the Personality Classification Dashboard in action
-## ๐Ÿš€ Quick Start +## Quick Start ```bash -# Clone and setup git clone cd Personality-classification - -# Install dependencies (using uv - modern Python package manager) uv sync - -# Train models (required for dashboard) -make train-models - -# Launch interactive dashboard -make dash - -# Or run the production pipeline -uv run python src/main_modular.py - -# Or explore examples -uv run python examples/main_final.py # Lightweight version -uv run python examples/main_demo.py # Demo with dummy models -uv run python examples/minimal_test.py # Installation verification +make train-models # Train models +make dash # Launch dashboard +uv run python src/main_modular.py # Run pipeline ``` -## ๐Ÿ“‹ Table of Contents - -- [Dashboard Preview](#-dashboard-preview) -- [Features](#-features) -- [Architecture](#-architecture) -- [Installation](#-installation) -- [Usage](#-usage) -- [Dashboard](#-dashboard) -- [Configuration](#-configuration) -- [Model Stacks](#-model-stacks) -- [Performance](#-performance) -- [Documentation](#-documentation) -- [Contributing](#-contributing) - -## ๐ŸŽฏ Features - -### **๐Ÿ—๏ธ Modern Modular Architecture** - -- **8 specialized modules** with single responsibility principle -- **Clean separation of concerns** for maximum maintainability -- **Independent testing** and validation of each component -- **Thread-safe configuration** management - -### **๐Ÿค– Advanced Machine Learning Pipeline** - -- **6 specialized ensemble stacks** (A-F) with complementary algorithms -- **Automated hyperparameter optimization** using Optuna -- **Intelligent ensemble blending** with optimized weights -- **Advanced data augmentation** with quality filtering and diversity control -- **Adaptive augmentation strategies** based on dataset characteristics +## Table of Contents -### **๐Ÿญ Production-Ready Infrastructure** +**Contents:** +- Dashboard Preview +- Quick Start +- Features +- Usage +- Documentation -- **Interactive Dashboard**: Modern Dash-based web interface 
for model inference and exploration -- **Model Training Pipeline**: Automated training and saving of ensemble models with metadata -- **Docker Support**: Complete containerization for easy deployment and scaling -- **Comprehensive Testing**: Full pytest coverage for all components with CI/CD integration -- **Modular Architecture**: Clean separation of concerns for maintainability and extensibility +## Features -### **๐Ÿ“Š Data Science Excellence** +- Modular architecture: 8 specialized modules +- 6 ensemble stacks (A-F) with complementary ML algorithms +- Automated hyperparameter optimization (Optuna) +- Advanced data augmentation (SDV Copula) +- Interactive Dash dashboard +- Dockerized deployment +- Full test coverage (pytest) -- **External data integration** using advanced merge strategy -- **Sophisticated preprocessing** with correlation-based imputation -- **Quality-controlled synthetic data** generation using SDV Copula -- **Cross-validation** with stratified folds for robust evaluation -- **Label noise injection** for improved generalization - -### **๐Ÿ› ๏ธ Modern Development Tools** - -- **uv Package Manager**: Lightning-fast dependency resolution and virtual environment management -- **Ruff Integration**: Ultra-fast Python linting and formatting (replaces Black, isort, flake8) -- **Type Safety**: Comprehensive mypy type checking with strict configuration -- **Security Scanning**: Bandit integration for security vulnerability detection -- **Pre-commit Hooks**: Automated code quality checks on every commit -- **GitHub Actions CI/CD**: Automated testing, linting, and validation on push -- **Make Automation**: Simple Makefile for common development tasks - -### **๐Ÿš€ Production Features** - -- **Professional logging** with structured output and configurable levels -- **Comprehensive error handling** and timeout protection for robust operation -- **Model persistence** with metadata for reproducibility and version control -- **Configurable settings** via 
centralized configuration management -- **Health monitoring** with dashboard health checks and status endpoints -- **Container support** with Docker and docker-compose for easy deployment - -## ๐Ÿ—๏ธ Architecture +## Architecture ``` src/ @@ -136,7 +74,7 @@ src/ โ”‚ โ””โ”€โ”€ utils.py # ๐Ÿ› ๏ธ Utility functions dash_app/ # ๐Ÿ–ฅ๏ธ Interactive Dashboard -โ”œโ”€โ”€ src/ # Application source +โ”œโ”€โ”€ dashboard/ # Application source โ”‚ โ”œโ”€โ”€ app.py # Main Dash application โ”‚ โ”œโ”€โ”€ layout.py # UI layout components โ”‚ โ”œโ”€โ”€ callbacks.py # Interactive callbacks @@ -154,16 +92,7 @@ models/ # ๐Ÿค– Trained Models scripts/ # ๐Ÿ› ๏ธ Utility Scripts โ””โ”€โ”€ train_and_save_models.py # Model training and persistence -examples/ # ๐Ÿ“š Usage examples -โ”œโ”€โ”€ main_final.py # โšก Lightweight production -โ”œโ”€โ”€ main_demo.py # ๐ŸŽช Demonstration -โ””โ”€โ”€ minimal_test.py # โœ… Installation check - data/ # ๐Ÿ“Š Datasets -โ”œโ”€โ”€ train.csv # Training data -โ”œโ”€โ”€ test.csv # Test data -โ”œโ”€โ”€ sample_submission.csv # Submission template -โ””โ”€โ”€ personality_datasert.csv # External data docs/ # ๐Ÿ“ Documentation โ””โ”€โ”€ [Generated documentation] # Technical guides @@ -172,7 +101,7 @@ best_params/ # ๐Ÿ’พ Optimized parameters โ””โ”€โ”€ stack_*_best_params.json # Per-stack best parameters ``` -## ๐Ÿ’ป Installation +## Installation ### Prerequisites @@ -200,164 +129,33 @@ uv run python examples/minimal_test.py pip install -r requirements.txt # Generated from pyproject.toml ``` -## ๐Ÿ“– Usage - -### ๐ŸŽฏ Production Pipeline +## Usage ```bash -# Full six-stack ensemble (recommended) +# Run production pipeline uv run python src/main_modular.py -``` -### ๐Ÿ–ฅ๏ธ Interactive Dashboard - -```bash -# Train models (one-time setup) +# Launch dashboard (after training models) make train-models - -# Launch dashboard make dash # Stop dashboard make stop-dash ``` -### โšก Quick Examples +## Dashboard -```bash -# Lightweight version -uv run python 
examples/main_final.py +## Dashboard -# Demo with dummy models (educational) -uv run python examples/main_demo.py - -# Test individual modules -uv run python examples/test_modules.py -``` - -### ๐Ÿ› ๏ธ Development Commands - -Available Makefile targets for streamlined development: - -```bash -make install # Install all dependencies -make format # Format code with Ruff -make lint # Run linting checks -make test # Run test suite -make train-models # Train and save production models -make dash # Launch dashboard -make stop-dash # Stop dashboard -make help # Show all available targets -``` - -### ๐Ÿ”ง Development +See the video demo above for the latest dashboard interface and features. To launch the dashboard: ```bash -# Run linting -uv run ruff check src/ - -# Auto-fix issues -uv run ruff check --fix src/ - -# Format code -uv run ruff format src/ - -# Run tests -make test - -# Train models make train-models -``` - -## ๐Ÿ–ฅ๏ธ Dashboard - -The project includes a modern, interactive Dash web application for real-time personality classification and model exploration. 
- -### Visual Demo - -![Dashboard Interface](docs/images/Dash_example1.png) -*Main dashboard interface with personality feature sliders and input controls* - -![Prediction Results](docs/images/Dash_example2.png) -*Prediction results with confidence visualization and detailed personality insights* - -### Features - -- **Real-time Predictions**: Input personality features and get instant predictions -- **Confidence Visualization**: Interactive probability bars for all personality types -- **Model Insights**: Detailed personality descriptions and confidence scores -- **Professional UI**: Clean, responsive design with modern styling -- **Production Ready**: Dockerized deployment with health checks - -### Quick Start - -```bash -# Ensure models are trained -make train-models - -# Launch dashboard (locally) make dash - -# Dashboard will be available at http://localhost:8050 -``` - -### Live Demo - -Experience the dashboard yourself in just a few commands: - -```bash -git clone && cd Personality-classification -uv sync && make train-models && make dash -# Then open http://localhost:8050 in your browser -``` - -The dashboard features: -- ๐ŸŽ›๏ธ **Interactive Sliders** for all personality dimensions -- ๐Ÿ“Š **Real-time Predictions** with confidence visualization -- ๐ŸŽจ **Professional UI** with responsive design -- ๐Ÿ“ˆ **Probability Bars** showing prediction confidence -- ๐Ÿ“ **Personality Insights** with detailed descriptions - -### Docker Deployment - -```bash -# Build and run with Docker Compose -cd dash_app -docker-compose up --build - -# Or run individual Docker container -docker build -t personality-dashboard . -docker run -p 8050:8050 personality-dashboard -``` - -### Dashboard Usage - -1. **Access the Dashboard**: Navigate to `http://localhost:8050` -2. **Input Features**: Use the sliders to set personality feature values: - - Gender, Age, openness, neuroticism, conscientiousness - - extraversion, agreeableness, Text_length, punctuation -3. 
**Get Predictions**: Click "Predict Personality" to see results -4. **Analyze Results**: View confidence scores and personality descriptions - -### API Endpoints - -The dashboard exposes a simple prediction API: - -- **Health Check**: `GET /health` - Service status -- **Predictions**: Handled through Dash callbacks (internal) - -### Stopping the Dashboard - -```bash -# Stop local dashboard -make stop-dash - -# Stop Docker containers -cd dash_app -docker-compose down +# Dashboard available at http://localhost:8050 ``` -## โš™๏ธ Configuration +## Configuration The pipeline is highly configurable through `src/modules/config.py`: @@ -407,7 +205,7 @@ TESTING_SAMPLE_SIZE = 1000 # Samples in testing mode LOG_LEVEL = "INFO" # DEBUG, INFO, WARNING, ERROR ``` -## ๐Ÿค– Model Stacks +## Model Stacks The pipeline employs six specialized ensemble stacks, each optimized for different aspects of the problem: @@ -427,7 +225,7 @@ The pipeline employs six specialized ensemble stacks, each optimized for differe - **Meta-learning approach** with Logistic Regression as final combiner - **Stratified cross-validation** ensures robust evaluation -## ๐Ÿ“Š Performance Metrics +## Performance Metrics ### Target Performance @@ -449,22 +247,7 @@ The pipeline is designed to achieve high accuracy through ensemble learning and โ””โ”€โ”€ Reproducibility: โœ… Fixed random seeds ``` -### Stack Configuration - -The pipeline employs six specialized ensemble stacks optimized for different aspects: - -| Stack | Focus | Algorithms | Hyperparameter Space | Training Approach | -| ----- | ----------------------- | --------------------------------------------------------------- | ---------------------------- | --------------------------- | -| **A** | Traditional ML (Narrow) | Random Forest, Logistic Regression, XGBoost, LightGBM, CatBoost | Conservative search space | Stable baseline performance | -| **B** | Traditional ML (Wide) | Same as Stack A | Extended search space | Broader exploration | -| **C** | 
Gradient Boosting | XGBoost, CatBoost | Gradient boosting focused | Tree-based specialists | -| **D** | Sklearn Ensemble | Extra Trees, Hist Gradient Boosting, SVM, Gaussian NB | Sklearn-native models | Diverse algorithm mix | -| **E** | Neural Networks | MLPClassifier, Deep architectures | Neural network tuning | Non-linear pattern capture | -| **F** | Noise-Robust Training | Same as Stack A | Standard space + label noise | Improved generalization | - -> **Note**: To see actual performance metrics, run the pipeline with your data. Use `make train-models` to train models and generate real performance reports. - -## ๐Ÿงช Testing & Validation +## Testing & Validation ### Quick Validation @@ -480,8 +263,6 @@ uv run python examples/test_modules.py ``` ### Development Testing - -```bash # Enable testing mode (faster execution) # Edit src/modules/config.py: TESTING_MODE = True @@ -491,7 +272,7 @@ TESTING_SAMPLE_SIZE = 1000 uv run python src/main_modular.py ``` -## ๐Ÿ”ง Troubleshooting +## Troubleshooting ### Common Issues @@ -530,7 +311,16 @@ uv sync # Reinstall dependencies uv run python -c "import sklearn, pandas, numpy, dash; print('OK')" ``` -#### Performance Issues + +Key folders: +- src/: Main pipeline and modules +- dash_app/: Dashboard app and Docker config +- models/: Trained models and metadata +- scripts/: Model training scripts +- examples/: Usage examples +- data/: Datasets +- docs/: Documentation +- best_params/: Optimized parameters ```bash # Optimize for your system @@ -558,106 +348,29 @@ LOG_LEVEL = "DEBUG" uv run python src/main_modular.py 2>&1 | tee debug.log ``` -## ๐Ÿ“š Documentation - -Comprehensive documentation is available in the `docs/` directory: - -- **[Technical Guide](docs/technical-guide.md)** - Deep dive into architecture, algorithms, and dashboard -- **[API Reference](docs/api-reference.md)** - Detailed module and function documentation -- **[MLOps Infrastructure](docs/mlops-infrastructure.md)** - Production deployment and monitoring -- 
**[Data Augmentation](docs/data-augmentation.md)** - Advanced synthetic data generation strategies -- **[Configuration Guide](docs/configuration.md)** - Complete configuration reference -- **[Performance Tuning](docs/performance-tuning.md)** - Optimization strategies and best practices -- **[Deployment Guide](docs/deployment.md)** - Production deployment instructions - -### Quick References - -- [`src/modules/README.md`](src/modules/README.md) - Module overview -- [`examples/README.md`](examples/README.md) - Usage examples -- [Architecture Diagram](docs/architecture.md) - Visual system overview - -## ๐Ÿ‘จโ€๐Ÿ’ป Lead Developer & Maintainer +## Documentation -**[Jeremy Vachier](https://github.com/jvachier)** - Lead Developer & Maintainer +See the `docs/` directory for: +- Technical Guide +- API Reference +- Data Augmentation +- Configuration Guide +- Performance Tuning +- Deployment Guide -For questions, suggestions, or collaboration opportunities: +## Lead Developer & Maintainer -- ๐Ÿ› **Issues & Bug Reports**: [Open an issue](https://github.com/jvachier/Personality-classification/issues) -- ๐Ÿ’ก **Feature Requests**: [Create a feature request](https://github.com/jvachier/Personality-classification/issues/new) -- ๐Ÿ“ง **Direct Contact**: Contact the maintainer through GitHub -- ๐Ÿ’ฌ **Discussions**: Use GitHub Discussions for general questions +**Lead Developer:** [Jeremy Vachier](https://github.com/jvachier) +For issues, feature requests, or questions, use GitHub Issues or Discussions. -## ๐Ÿค Contributing +## Contributing -We welcome contributions! Please follow these guidelines: - -### Development Setup - -```bash -# Clone and setup development environment -git clone -cd Personality-classification -uv sync --dev - -# Install pre-commit hooks -uv run pre-commit install -``` +Contributions welcome! Fork the repo, create a feature branch, implement and test your changes, then submit a pull request. 
-### Code Standards +## License -- **Code Quality**: Use Ruff for linting and formatting -- **Type Hints**: Required for all public functions -- **Documentation**: Docstrings for all modules and functions -- **Testing**: Add tests for new features +Licensed under the Apache License 2.0. See [LICENSE](LICENSE). -### Contribution Process +## Project Status -1. **Fork** the repository -2. **Create** a feature branch: `git checkout -b feature/amazing-feature` -3. **Implement** changes with proper testing -4. **Lint** code: `uv run ruff check --fix src/` -5. **Test** thoroughly: `uv run python examples/test_modules.py` -6. **Commit** with descriptive messages -7. **Submit** a pull request - -### Areas for Contribution - -- ๐Ÿง  **New model architectures** in Stack builders -- ๐Ÿ“Š **Additional data augmentation** methods -- โšก **Performance optimizations** -- ๐Ÿ“ **Documentation improvements** -- ๐Ÿงช **Test coverage expansion** -- ๐Ÿ”ง **Configuration enhancements** - -## ๐Ÿ“„ License - -This project is licensed under the **Apache License 2.0** - see the [LICENSE](LICENSE) file for details. 
- -## ๐Ÿ™ Acknowledgments - -- **Optuna Team** - For excellent hyperparameter optimization framework -- **scikit-learn Community** - For robust machine learning foundations -- **SDV Team** - For advanced synthetic data generation -- **uv/Ruff Teams** - For modern Python tooling -- **Dash/Plotly Team** - For powerful visualization and dashboarding - -## ๐Ÿ“ˆ Project Status - -| Component | Status | Version | Last Updated | -| ------------------------ | -------------------- | ------- | ------------ | -| ๐Ÿ—๏ธ **Architecture** | โœ… **Production** | v2.0 | 2025-07-14 | -| ๐Ÿค– **ML Pipeline** | โœ… **Production** | v2.0 | 2025-07-14 | -| ๐Ÿ–ฅ๏ธ **Dashboard** | โœ… **Production** | v1.0 | 2025-07-14 | -| ๐Ÿ“Š **Data Augmentation** | โœ… **Advanced** | v1.5 | 2025-07-14 | -| ๐Ÿ”ง **Configuration** | โœ… **Centralized** | v1.0 | 2025-07-14 | -| ๐Ÿ“ **Documentation** | โœ… **Comprehensive** | v1.0 | 2025-07-14 | -| ๐Ÿงช **Testing** | โœ… **CI/CD Ready** | v1.0 | 2025-07-14 | -| ๐Ÿ› ๏ธ **DevOps** | โœ… **Automated** | v1.0 | 2025-07-14 | - ---- - -
- -**๐ŸŽฏ Production Ready** | **๏ธ Interactive Dashboard** | **๐Ÿ—๏ธ Fully Modular** | **๐Ÿ“š Well Documented** - -
+**Status:** Production Ready | Interactive Dashboard | Modular | Well Documented diff --git a/dash_app/src/app.py b/dash_app/dashboard/app.py similarity index 65% rename from dash_app/src/app.py rename to dash_app/dashboard/app.py index 131d2a0..72df9f2 100644 --- a/dash_app/src/app.py +++ b/dash_app/dashboard/app.py @@ -6,6 +6,7 @@ from typing import Any import dash +import dash_bootstrap_components as dbc from .callbacks import register_callbacks from .layout import create_layout @@ -46,8 +47,49 @@ def __init__( __name__, title=f"Personality Classifier - {model_name}", suppress_callback_exceptions=True, + external_stylesheets=[dbc.themes.BOOTSTRAP, dbc.icons.FONT_AWESOME], ) + # Add custom CSS to ensure white background + self.app.index_string = """ + + + + {%metas%} + {%title%} + {%favicon%} + {%css%} + + + + {%app_entry%} +
+ {%config%} + {%scripts%} + {%renderer%} +
+ + + """ + # Load model self.model_loader = ModelLoader(model_name, model_version, model_stage) diff --git a/dash_app/dashboard/assets/enhanced_styles.css b/dash_app/dashboard/assets/enhanced_styles.css new file mode 100644 index 0000000..edd1c26 --- /dev/null +++ b/dash_app/dashboard/assets/enhanced_styles.css @@ -0,0 +1,466 @@ +/* Enhanced UI/UX Styles for Personality Dashboard */ + +/* CSS Variables for consistent theming */ +:root { + /* Personality colors */ + --intro-color: #3498db; + --extro-color: #e74c3c; + --neutral-color: #95a5a6; + + /* Brand colors */ + --primary: #2c3e50; + --secondary: #34495e; + --success: #27ae60; + --warning: #f39c12; + --info: #3498db; + --light: #ecf0f1; + --dark: #2c3e50; + + /* Spacing */ + --spacing-xs: 0.25rem; + --spacing-sm: 0.5rem; + --spacing-md: 1rem; + --spacing-lg: 1.5rem; + --spacing-xl: 2rem; + + /* Border radius */ + --border-radius: 0.5rem; + --border-radius-lg: 1rem; + + /* Shadows */ + --shadow-sm: 0 0.125rem 0.25rem rgba(0, 0, 0, 0.075); + --shadow-md: 0 0.5rem 1rem rgba(0, 0, 0, 0.15); + --shadow-lg: 0 1rem 3rem rgba(0, 0, 0, 0.175); + + /* Transitions */ + --transition-fast: 0.15s ease-in-out; + --transition-normal: 0.3s ease-in-out; + --transition-slow: 0.5s ease-in-out; +} + +/* Global styles */ +.personality-dashboard { + font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; + background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); + min-height: 100vh; + padding: var(--spacing-lg); +} + +/* Header styles */ +.personality-dashboard h1 { + background: linear-gradient(45deg, var(--intro-color), var(--extro-color)); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; + font-weight: 700; + font-size: 2.5rem; +} + +/* Card enhancements */ +.input-panel, +.feedback-panel, +.results-panel { + background: rgba(255, 255, 255, 0.95); + backdrop-filter: blur(10px); + border: none; + border-radius: var(--border-radius-lg); + box-shadow: var(--shadow-lg); + 
transition: transform var(--transition-normal), box-shadow var(--transition-normal); +} + +.input-panel:hover, +.feedback-panel:hover, +.results-panel:hover { + transform: translateY(-2px); + box-shadow: 0 1rem 4rem rgba(0, 0, 0, 0.2); +} + +/* Section titles */ +.section-title { + color: var(--primary); + font-weight: 600; + margin-bottom: var(--spacing-lg); + padding-bottom: var(--spacing-sm); + border-bottom: 2px solid var(--light); +} + +/* Enhanced sliders */ +.personality-slider { + margin: var(--spacing-lg) 0; +} + +.personality-slider .rc-slider-track { + background: linear-gradient(90deg, var(--intro-color), var(--extro-color)); + height: 8px; + border-radius: 4px; +} + +.personality-slider .rc-slider-handle { + width: 20px; + height: 20px; + border: 3px solid #fff; + box-shadow: var(--shadow-md); + background: var(--primary); + transition: all var(--transition-fast); +} + +.personality-slider .rc-slider-handle:hover, +.personality-slider .rc-slider-handle:focus { + transform: scale(1.2); + box-shadow: var(--shadow-lg); +} + +.personality-slider .rc-slider-rail { + background: var(--light); + height: 8px; + border-radius: 4px; +} + +/* Slider containers with category styling */ +.slider-social .rc-slider-track { + background: linear-gradient(90deg, #e74c3c, #c0392b); +} + +.slider-lifestyle .rc-slider-track { + background: linear-gradient(90deg, #27ae60, #229954); +} + +.slider-digital .rc-slider-track { + background: linear-gradient(90deg, #9b59b6, #8e44ad); +} + +/* Slider labels and help text */ +.slider-label { + color: var(--primary); + margin-bottom: var(--spacing-sm); + display: block; +} + +.slider-help { + font-style: italic; + margin-top: var(--spacing-xs); + display: block; +} + +.slider-container { + background: rgba(52, 73, 94, 0.05); + padding: var(--spacing-lg); + border-radius: var(--border-radius); + transition: background var(--transition-normal); +} + +.slider-container:hover { + background: rgba(52, 73, 94, 0.1); +} + +/* Enhanced 
dropdowns */ +.personality-dropdown .Select-control { + border: 2px solid var(--light); + border-radius: var(--border-radius); + transition: all var(--transition-fast); + min-height: 45px; +} + +.personality-dropdown .Select-control:hover { + border-color: var(--info); +} + +.personality-dropdown .Select-control.is-focused { + border-color: var(--primary); + box-shadow: 0 0 0 3px rgba(52, 73, 94, 0.1); +} + +.dropdown-label { + color: var(--primary); + margin-bottom: var(--spacing-sm); + display: block; +} + +.dropdown-container { + background: rgba(52, 73, 94, 0.05); + padding: var(--spacing-lg); + border-radius: var(--border-radius); + transition: background var(--transition-normal); +} + +.dropdown-container:hover { + background: rgba(52, 73, 94, 0.1); +} + +/* Predict button enhancement */ +.predict-button { + background: linear-gradient(45deg, var(--intro-color), var(--extro-color)); + border: none; + border-radius: 25px; + padding: var(--spacing-md) var(--spacing-xl); + font-weight: 600; + font-size: 1.1rem; + text-transform: uppercase; + letter-spacing: 0.5px; + transition: all var(--transition-normal); + position: relative; + overflow: hidden; +} + +.predict-button:hover { + transform: translateY(-2px); + box-shadow: var(--shadow-lg); +} + +.predict-button:active { + transform: translateY(0); +} + +.predict-button::before { + content: ''; + position: absolute; + top: 0; + left: -100%; + width: 100%; + height: 100%; + background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent); + transition: left var(--transition-slow); +} + +.predict-button:hover::before { + left: 100%; +} + +/* Feedback panel styles */ +.meter-container { + height: 20px; + background: var(--light); + border-radius: 10px; + position: relative; + overflow: hidden; + margin: var(--spacing-md) 0; +} + +.meter-container::after { + content: ''; + position: absolute; + top: 0; + left: 0; + height: 100%; + width: 50%; /* This would be dynamic based on current input */ + 
background: linear-gradient(90deg, var(--intro-color), var(--extro-color)); + border-radius: 10px; + transition: width var(--transition-normal); +} + +.meter-label { + font-size: 0.85rem; + font-weight: 500; +} + +.meter-label.intro { + color: var(--intro-color); +} + +.meter-label.extro { + color: var(--extro-color); +} + +.insights-container { + background: rgba(52, 73, 94, 0.05); + padding: var(--spacing-md); + border-radius: var(--border-radius); + border-left: 4px solid var(--info); +} + +/* Results panel styles */ +.personality-result { + font-size: 3rem; + font-weight: 700; + background: linear-gradient(45deg, var(--intro-color), var(--extro-color)); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; + margin-bottom: var(--spacing-md); +} + +.confidence-score { + font-size: 1.2rem; + color: var(--secondary); + margin-bottom: var(--spacing-lg); +} + +.confidence-row { + display: flex; + align-items: center; + gap: var(--spacing-md); +} + +.personality-label { + flex: 0 0 100px; + font-weight: 500; + color: var(--primary); +} + +.confidence-bar { + flex: 1; + height: 25px; + border-radius: 12px; +} + +.confidence-text { + flex: 0 0 50px; + text-align: right; + font-weight: 600; + color: var(--primary); +} + +/* Personality insights */ +.insights-list { + list-style: none; + padding: 0; +} + +.insight-item { + background: rgba(52, 152, 219, 0.1); + margin: var(--spacing-sm) 0; + padding: var(--spacing-md); + border-radius: var(--border-radius); + border-left: 4px solid var(--info); + transition: all var(--transition-fast); +} + +.insight-item:hover { + background: rgba(52, 152, 219, 0.15); + transform: translateX(5px); +} + +/* Radar chart container */ +.personality-radar { + background: rgba(255, 255, 255, 0.9); + border-radius: var(--border-radius); + padding: var(--spacing-md); +} + +/* Responsive design */ +@media (max-width: 768px) { + .personality-dashboard { + padding: var(--spacing-md); + } + + 
.personality-dashboard h1 { + font-size: 2rem; + } + + .personality-result { + font-size: 2rem; + } + + .slider-container, + .dropdown-container { + padding: var(--spacing-md); + } + + .confidence-row { + flex-direction: column; + gap: var(--spacing-sm); + } + + .personality-label, + .confidence-text { + flex: none; + text-align: center; + } +} + +/* Animation keyframes */ +@keyframes pulse { + 0%, 100% { + opacity: 1; + } + 50% { + opacity: 0.7; + } +} + +@keyframes slideIn { + from { + transform: translateY(20px); + opacity: 0; + } + to { + transform: translateY(0); + opacity: 1; + } +} + +@keyframes fadeIn { + from { + opacity: 0; + } + to { + opacity: 1; + } +} + +/* Loading states */ +.loading { + animation: pulse 1.5s ease-in-out infinite; +} + +.slide-in { + animation: slideIn 0.5s ease-out; +} + +.fade-in { + animation: fadeIn 0.3s ease-in; +} + +/* Focus states for accessibility */ +.personality-slider:focus-within, +.personality-dropdown:focus-within, +.dropdown-container:focus-within { + outline: 2px solid var(--primary); + outline-offset: 2px; +} + +/* High contrast mode support */ +@media (prefers-contrast: high) { + :root { + --intro-color: #0066cc; + --extro-color: #cc0000; + --primary: #000000; + --light: #ffffff; + } + + .input-panel, + .feedback-panel, + .results-panel { + border: 2px solid var(--primary); + } +} + +/* Reduced motion support */ +@media (prefers-reduced-motion: reduce) { + * { + animation-duration: 0.01ms !important; + animation-iteration-count: 1 !important; + transition-duration: 0.01ms !important; + } +} + +/* Dark mode support */ +@media (prefers-color-scheme: dark) { + :root { + --primary: #ecf0f1; + --secondary: #bdc3c7; + --light: #34495e; + --dark: #ecf0f1; + } + + .personality-dashboard { + background: linear-gradient(135deg, #2c3e50 0%, #34495e 100%); + } + + .input-panel, + .feedback-panel, + .results-panel { + background: rgba(44, 62, 80, 0.95); + color: var(--primary); + } +} diff --git a/dash_app/src/callbacks.py 
b/dash_app/dashboard/callbacks.py similarity index 61% rename from dash_app/src/callbacks.py rename to dash_app/dashboard/callbacks.py index 3b16076..717d6da 100644 --- a/dash_app/src/callbacks.py +++ b/dash_app/dashboard/callbacks.py @@ -5,7 +5,7 @@ import logging from datetime import datetime -from dash import dash_table, html +from dash import html from dash.dependencies import Input, Output, State from .layout import ( @@ -82,6 +82,7 @@ def make_prediction( # Make prediction result = model_loader.predict(data) result["timestamp"] = datetime.now().isoformat() + result["input_data"] = data # Add input data for radar chart # Add to history prediction_history.append( @@ -98,64 +99,57 @@ def make_prediction( logger.error(f"Prediction error: {e}") return html.Div(f"Error: {e!s}", style={"color": "red"}) + # Enhanced predict button with loading states @app.callback( - Output("prediction-history", "children"), - Input("interval-component", "n_intervals"), - Input("predict-button", "n_clicks"), - ) - def update_prediction_history(n_intervals, n_clicks): - """Update prediction history display.""" - if not prediction_history: - return html.Div("No predictions yet", style={"color": "#7f8c8d"}) - - # Create table data - table_data = [] - for i, pred in enumerate(reversed(prediction_history[-10:])): # Show last 10 - table_data.append( - { - "ID": f"#{len(prediction_history) - i}", - "Timestamp": pred["timestamp"][:19], # Remove microseconds - "Prediction": pred["result"].get("prediction", "N/A"), - "Confidence": f"{pred['result'].get('confidence', 0):.3f}" - if pred["result"].get("confidence") - else "N/A", - } - ) - - return dash_table.DataTable( - data=table_data, - columns=[ - {"name": "ID", "id": "ID"}, - {"name": "Timestamp", "id": "Timestamp"}, - {"name": "Prediction", "id": "Prediction"}, - {"name": "Confidence", "id": "Confidence"}, - ], - style_cell={"textAlign": "left", "padding": "10px"}, - style_header={ - "backgroundColor": "#3498db", - "color": "white", - 
"fontWeight": "bold", - }, - style_data_conditional=[ - { - "if": {"row_index": 0}, - "backgroundColor": "#ecf0f1", - } - ], - ) - - @app.callback( - Output("interval-component", "disabled"), Input("auto-refresh-toggle", "value") + [ + Output("predict-button", "children"), + Output("predict-button", "disabled"), + Output("predict-button", "color"), + ], + [Input("predict-button", "n_clicks")], + prevent_initial_call=True, ) - def toggle_auto_refresh(value): - """Toggle auto-refresh based on checkbox.""" - return "auto" not in value - + def update_predict_button(n_clicks): + """Update predict button state with loading animation.""" + if n_clicks: + # Show loading state briefly (will be overridden by prediction callback) + return [ + [ + html.I(className="fas fa-spinner fa-spin me-2"), + "Analyzing Your Personality...", + ], + True, + "warning", + ] + + # Default state + return [ + [html.I(className="fas fa-magic me-2"), "Analyze My Personality"], + False, + "primary", + ] + + # Reset button state after prediction @app.callback( - Output("json-input", "value"), - Input("json-input-display", "value"), + [ + Output("predict-button", "children", allow_duplicate=True), + Output("predict-button", "disabled", allow_duplicate=True), + Output("predict-button", "color", allow_duplicate=True), + ], + [Input("prediction-results", "children")], prevent_initial_call=True, ) - def sync_json_input(value): - """Sync the display JSON input with the hidden one.""" - return value + def reset_predict_button(results): + """Reset predict button after prediction is complete.""" + if results: + return [ + [html.I(className="fas fa-magic me-2"), "Analyze Again"], + False, + "success", + ] + + return [ + [html.I(className="fas fa-magic me-2"), "Analyze My Personality"], + False, + "primary", + ] diff --git a/dash_app/dashboard/layout.py b/dash_app/dashboard/layout.py new file mode 100644 index 0000000..df5644c --- /dev/null +++ b/dash_app/dashboard/layout.py @@ -0,0 +1,663 @@ +"""Layout 
components for the Dash application.""" + +from __future__ import annotations + +from typing import Any + +import dash_bootstrap_components as dbc +import plotly.graph_objects as go +from dash import dcc, html + + +def create_layout(model_name: str, model_metadata: dict[str, Any]) -> html.Div: + """Create the main layout for the Dash application. + + Args: + model_name: Name of the model + model_metadata: Model metadata dictionary + + Returns: + Dash HTML layout + """ + return html.Div( + [ + # Professional Header + create_professional_header(), + # Main Content + dbc.Container( + [ + dbc.Row( + [ + # Input Panel - Original size + dbc.Col( + [create_input_panel()], + md=5, + className="d-flex align-self-stretch", + ), + # Results Panel - Original size + dbc.Col( + [ + html.Div( + id="prediction-results", + children=[ + dbc.Card( + [ + dbc.CardHeader( + [ + html.H4( + "Analysis Results", + className="mb-0 text-center", + style={ + "color": "#2c3e50", + "fontWeight": "400", + }, + ) + ], + style={ + "backgroundColor": "#ffffff", + "border": "none", + }, + ), + dbc.CardBody( + [ + html.Div( + [ + html.I( + className="fas fa-chart-radar fa-3x mb-3", + style={ + "color": "#bdc3c7" + }, + ), + html.H5( + "Ready for Analysis", + style={ + "color": "#7f8c8d" + }, + ), + html.P( + "Adjust the parameters and click 'Analyze Personality' to see your results.", + style={ + "color": "#95a5a6" + }, + ), + ], + className="text-center py-5", + ) + ], + style={"padding": "2rem"}, + ), + ], + className="shadow-sm h-100", + style={ + "border": "none", + "borderRadius": "15px", + }, + ) + ], + className="h-100 d-flex flex-column", + ) + ], + md=7, + className="d-flex align-self-stretch", + ), + ], + justify="center", + className="g-4", + style={"minHeight": "80vh"}, + ) + ], + fluid=True, + className="py-4", + style={ + "backgroundColor": "#ffffff", + "maxWidth": "1400px", + "margin": "0 auto", + }, + ), + ], + className="personality-dashboard", + style={"backgroundColor": "#ffffff 
!important", "minHeight": "100vh"}, + ) + + +def create_professional_header() -> dbc.Row: + """Create a professional header.""" + return dbc.Container( + [ + dbc.Row( + [ + dbc.Col( + [ + dbc.Card( + [ + dbc.CardBody( + [ + html.Div( + [ + html.I( + className="fas fa-brain me-3", + style={ + "fontSize": "2.5rem", + "color": "#2c3e50", + }, + ), + html.H1( + "Personality Classification", + className="d-inline-block mb-0", + style={ + "color": "#2c3e50", + "fontWeight": "300", + }, + ), + ], + className="d-flex align-items-center justify-content-center", + ), + html.P( + "Advanced AI-powered personality assessment platform using ensemble machine learning to analyze behavioral patterns and predict introversion-extraversion tendencies based on social, lifestyle, and digital behavior indicators.", + className="text-center text-muted mt-2 mb-0", + style={ + "fontSize": "1.0rem", + "maxWidth": "800px", + "margin": "0 auto", + }, + ), + ], + className="py-3", + ) + ], + className="shadow-sm border-0", + style={"backgroundColor": "#ffffff"}, + ) + ] + ) + ], + className="mb-4", + ) + ], + fluid=True, + style={"maxWidth": "1400px", "margin": "0 auto"}, + ) + + +def create_input_panel() -> dbc.Card: + """Create a clean, professional input panel.""" + return dbc.Card( + [ + dbc.CardHeader( + [ + html.H4( + "Assessment Parameters", + className="mb-0 text-center", + style={"color": "#2c3e50", "fontWeight": "400"}, + ) + ], + style={"backgroundColor": "#ffffff", "border": "none"}, + ), + dbc.CardBody( + [ + # Social Behavior Section + html.H5( + [ + html.I( + className="fas fa-users me-2", + style={"color": "#3498db"}, + ), + "Social Behavior", + ], + className="section-title mb-4", + ), + create_enhanced_slider( + "time-spent-alone", + "Time Spent Alone (hours/day)", + 0, + 24, + 8, + "Less alone time", + "More alone time", + "slider-social", + ), + create_enhanced_slider( + "social-event-attendance", + "Social Event Attendance (events/month)", + 0, + 20, + 4, + "Fewer events", 
+ "More events", + "slider-social", + ), + # Lifestyle Section + html.H5( + [ + html.I( + className="fas fa-compass me-2", + style={"color": "#27ae60"}, + ), + "Lifestyle", + ], + className="section-title mt-5 mb-4", + ), + create_enhanced_slider( + "going-outside", + "Going Outside Frequency (times/week)", + 0, + 15, + 5, + "Stay indoors", + "Go out frequently", + "slider-lifestyle", + ), + create_enhanced_slider( + "friends-circle-size", + "Friends Circle Size", + 0, + 50, + 12, + "Small circle", + "Large network", + "slider-lifestyle", + ), + # Digital Behavior Section + html.H5( + [ + html.I( + className="fas fa-share-alt me-2", + style={"color": "#9b59b6"}, + ), + "Digital Behavior", + ], + className="section-title mt-5 mb-4", + ), + create_enhanced_slider( + "post-frequency", + "Social Media Posts (per week)", + 0, + 20, + 3, + "Rarely post", + "Frequently post", + "slider-digital", + ), + # Psychological Assessment Section + html.H5( + [ + html.I( + className="fas fa-mind-share me-2", + style={"color": "#e67e22"}, + ), + "Psychological Assessment", + ], + className="section-title mt-5 mb-4", + ), + create_enhanced_dropdown( + "stage-fear", + "Do you have stage fear?", + [ + { + "label": "No - I'm comfortable with public speaking", + "value": "No", + }, + { + "label": "Yes - I avoid speaking in public", + "value": "Yes", + }, + { + "label": "Sometimes - It depends on the situation", + "value": "Unknown", + }, + ], + "No", + ), + create_enhanced_dropdown( + "drained-after-socializing", + "Do you feel drained after socializing?", + [ + { + "label": "No - I feel energized by social interaction", + "value": "No", + }, + { + "label": "Yes - I need time alone to recharge", + "value": "Yes", + }, + { + "label": "It varies - Depends on the context", + "value": "Unknown", + }, + ], + "No", + ), + # Analysis Button + html.Div( + [ + dbc.Button( + [ + html.I(className="fas fa-brain me-2"), + "Analyze Personality", + ], + id="predict-button", + color="primary", + 
size="lg", + className="predict-button px-5 py-3", + style={"fontSize": "1.1rem", "fontWeight": "500"}, + ) + ], + className="text-center mt-5", + ), + ], + style={"padding": "2rem"}, + ), + ], + className="shadow-sm h-100", + style={"border": "none", "borderRadius": "15px"}, + ) + + +def create_enhanced_slider( + slider_id: str, + label: str, + min_val: int, + max_val: int, + default: int, + intro_text: str, + extro_text: str, + css_class: str, +) -> html.Div: + """Create an enhanced slider with personality hints.""" + return html.Div( + [ + html.Label(label, className="slider-label fw-bold"), + dcc.Slider( + id=slider_id, + min=min_val, + max=max_val, + step=1, + value=default, + marks={ + min_val: { + "label": intro_text, + "style": {"color": "#3498db", "fontSize": "0.8rem"}, + }, + max_val: { + "label": extro_text, + "style": {"color": "#e74c3c", "fontSize": "0.8rem"}, + }, + }, + tooltip={"placement": "bottom", "always_visible": True}, + className=f"personality-slider {css_class}", + ), + ], + className="slider-container mb-3", + ) + + +def create_enhanced_dropdown( + dropdown_id: str, label: str, options: list, default: str +) -> html.Div: + """Create an enhanced dropdown with better styling.""" + return html.Div( + [ + html.Label(label, className="dropdown-label fw-bold"), + dcc.Dropdown( + id=dropdown_id, + options=options, + value=default, + className="personality-dropdown", + ), + ], + className="dropdown-container mb-3", + ) + + +def format_prediction_result(result: dict[str, Any]) -> html.Div: + """Format prediction result for display. 
+ + Args: + result: Prediction result dictionary + + Returns: + Formatted result component + """ + prediction = result.get("prediction", "Unknown") + confidence = result.get("confidence", 0) + prob_extrovert = result.get("probability_extrovert", 0) + prob_introvert = result.get("probability_introvert", 0) + input_data = result.get("input_data", {}) + + # Determine confidence level + if confidence > 0.7: + confidence_color = "success" + confidence_badge = "High Confidence" + elif confidence > 0.5: + confidence_color = "warning" + confidence_badge = "Medium Confidence" + else: + confidence_color = "danger" + confidence_badge = "Low Confidence" + + # Create enhanced results with Bootstrap components + return dbc.Card( + [ + dbc.CardHeader( + [ + html.H4( + "Analysis Results", + className="mb-0 text-center", + style={"color": "#2c3e50", "fontWeight": "400"}, + ) + ], + style={"backgroundColor": "#ffffff", "border": "none"}, + ), + dbc.CardBody( + [ + dbc.Row( + [ + # Main Result + dbc.Col( + [ + html.Div( + [ + html.H2( + f"๐Ÿง  {prediction}", + className="personality-result text-center", + ), + html.P( + f"Confidence: {confidence:.1%}", + className="confidence-score text-center", + ), + dbc.Badge( + confidence_badge, + color=confidence_color, + className="mb-3", + ), + ], + className="text-center", + ) + ], + md=6, + ), + # Confidence Bars + dbc.Col( + [ + html.H5("Probability Breakdown"), + create_confidence_bars( + { + "Extrovert": prob_extrovert, + "Introvert": prob_introvert, + } + ), + ], + md=6, + ), + ] + ), + # Larger Radar Chart - Full Width + dbc.Row( + [ + dbc.Col( + [ + html.H5( + "Personality Dimensions", + className="text-center mb-3", + ), + html.Div( + [ + dcc.Graph( + figure=create_personality_radar( + { + "Introvert": prob_introvert, + "Extrovert": prob_extrovert, + }, + input_data, + ), + config={"displayModeBar": False}, + className="personality-radar", + style={ + "height": "450px", + "width": "100%", + }, + ) + ], + style={"padding": "0 20px"}, + 
), + ], + md=12, + className="text-center", + ) + ], + className="mt-4", + ), + # Personality Insights + html.Hr(), + html.Div( + [ + html.H5("Personality Insights"), + create_personality_insights(prediction, confidence), + ] + ), + # Metadata + html.Hr(), + html.Small( + [ + f"Model: {result.get('model_name', 'Unknown')} | ", + f"Version: {result.get('model_version', 'Unknown')} | ", + f"Timestamp: {result.get('timestamp', 'Unknown')}", + ], + className="text-muted", + ), + ], + style={"padding": "2rem"}, + ), + ], + className="shadow-sm h-100", + style={"border": "none", "borderRadius": "15px"}, + ) + + +def create_confidence_bars(probabilities: dict) -> html.Div: + """Create animated confidence bars.""" + bars = [] + for personality, prob in probabilities.items(): + color = "primary" if personality == "Introvert" else "danger" + bars.append( + html.Div( + [ + html.Span(personality, className="personality-label"), + dbc.Progress( + value=prob * 100, + color=color, + className="confidence-bar mb-2", + animated=True, + striped=True, + ), + html.Span(f"{prob:.1%}", className="confidence-text"), + ], + className="confidence-row mb-2", + ) + ) + return html.Div(bars) + + +def create_personality_insights(prediction: str, confidence: float) -> html.Div: + """Create personality insights based on prediction.""" + insights = { + "Introvert": [ + "๐Ÿ’ญ You likely process information internally before sharing", + "โšก You recharge through quiet, solitary activities", + "๐Ÿ‘ฅ You prefer deep, meaningful conversations over small talk", + "๐ŸŽฏ You tend to think before speaking", + ], + "Extrovert": [ + "๐Ÿ—ฃ๏ธ You likely think out loud and enjoy verbal processing", + "โšก You gain energy from social interactions", + "๐Ÿ‘ฅ You enjoy meeting new people and large gatherings", + "๐ŸŽฏ You tend to speak spontaneously", + ], + } + + prediction_insights = insights.get(prediction, ["Analysis in progress..."]) + + return html.Ul( + [html.Li(insight, className="insight-item") for 
insight in prediction_insights], + className="insights-list", + ) + + +def create_personality_radar( + probabilities: dict, input_data: dict[str, Any] | None = None +) -> go.Figure: + """Create radar chart for personality visualization.""" + categories = [ + "Social Energy", + "Processing Style", + "Decision Making", + "Lifestyle", + "Communication", + ] + + # Calculate values based on probabilities and input data + intro_tendency = probabilities.get("Introvert", 0.5) + + # Map input data to personality dimensions (simplified) + if input_data: + social_energy = 1 - (input_data.get("Time_spent_Alone", 12) / 24) + processing_style = 1 - (input_data.get("Post_frequency", 10) / 20) + decision_making = 0.8 if input_data.get("Stage_fear_Yes", 0) else 0.3 + lifestyle = 1 - (input_data.get("Going_outside", 7) / 15) + communication = 1 - (input_data.get("Friends_circle_size", 25) / 50) + + values = [ + social_energy, + processing_style, + decision_making, + lifestyle, + communication, + ] + else: + # Default values based on prediction + values = [intro_tendency] * len(categories) + + fig = go.Figure() + fig.add_trace( + go.Scatterpolar( + r=values, + theta=categories, + fill="toself", + name="Your Profile", + line_color="#3498db" if intro_tendency > 0.5 else "#e74c3c", + ) + ) + + fig.update_layout( + polar={ + "radialaxis": {"visible": True, "range": [0, 1]}, + "angularaxis": {"tickfont": {"size": 12}}, + }, + showlegend=False, + height=450, + font={"size": 12}, + title="Personality Dimensions", + margin={"l": 80, "r": 80, "t": 60, "b": 80}, + ) + + return fig diff --git a/dash_app/src/model_loader.py b/dash_app/dashboard/model_loader.py similarity index 99% rename from dash_app/src/model_loader.py rename to dash_app/dashboard/model_loader.py index 82ebb90..008493e 100644 --- a/dash_app/src/model_loader.py +++ b/dash_app/dashboard/model_loader.py @@ -49,7 +49,7 @@ def _load_model(self) -> None: for models_dir in models_paths: if models_dir.exists(): # Look for saved models 
based on model name - if self.model_name == "ensemble": + if self.model_name in ["ensemble", "ensemble_model"]: model_file = models_dir / "ensemble_model.pkl" metadata_file = models_dir / "ensemble_metadata.json" else: diff --git a/dash_app/main.py b/dash_app/main.py index 84f73bb..81ce7d8 100644 --- a/dash_app/main.py +++ b/dash_app/main.py @@ -2,8 +2,15 @@ import argparse import logging +import sys +from pathlib import Path -from src import PersonalityClassifierApp +# Add the project root to Python path +project_root = Path(__file__).parent.parent +sys.path.insert(0, str(project_root)) + +# Import after path modification +from dash_app.dashboard.app import PersonalityClassifierApp # noqa: E402 def main(): diff --git a/dash_app/src/__init__.py b/dash_app/src/__init__.py deleted file mode 100644 index 9e00603..0000000 --- a/dash_app/src/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Package initialization for the Dash application.""" - -from .app import PersonalityClassifierApp, create_app - -__all__ = ["PersonalityClassifierApp", "create_app"] diff --git a/dash_app/src/layout.py b/dash_app/src/layout.py deleted file mode 100644 index b42a0de..0000000 --- a/dash_app/src/layout.py +++ /dev/null @@ -1,604 +0,0 @@ -"""Layout components for the Dash application.""" - -from __future__ import annotations - -from typing import Any - -from dash import dcc, html - - -def create_layout(model_name: str, model_metadata: dict[str, Any]) -> html.Div: - """Create the main layout for the Dash application. 
- - Args: - model_name: Name of the model - model_metadata: Model metadata dictionary - - Returns: - Dash HTML layout - """ - return html.Div( - [ - # Header - html.Div( - [ - html.H1( - "Personality Classification Dashboard", - style={ - "textAlign": "center", - "color": "#2c3e50", - "marginBottom": "10px", - }, - ), - html.H3( - f"Model: {model_name}", - style={ - "textAlign": "center", - "color": "#7f8c8d", - "marginBottom": "30px", - }, - ), - ] - ), - # Model Status Section - html.Div( - [ - html.H3("Model Status", style={"color": "#34495e"}), - html.Div( - id="model-status", - children=[create_status_cards(model_metadata)], - style={"marginBottom": "30px"}, - ), - ] - ), - # Prediction Section - html.Div( - [ - html.H3("Make Predictions", style={"color": "#34495e"}), - # Input methods tabs (simplified to manual only) - html.Div( - style={ - "display": "none" - }, # Hide tabs since we only have manual input - children=[ - dcc.Tabs( - id="input-tabs", - value="manual", - children=[ - dcc.Tab(label="Manual Input", value="manual"), - ], - ) - ], - ), - # Input content (always manual input) - html.Div( - id="input-content", - style={"marginTop": "20px"}, - children=[create_manual_input()], - ), - # Predict button - html.Div( - [ - html.Button( - "Predict", - id="predict-button", - style={ - "backgroundColor": "#3498db", - "color": "white", - "border": "none", - "padding": "10px 20px", - "fontSize": "16px", - "borderRadius": "5px", - "cursor": "pointer", - "marginTop": "20px", - }, - ) - ], - style={"textAlign": "center"}, - ), - # Results - html.Div(id="prediction-results", style={"marginTop": "30px"}), - ], - style={"marginBottom": "30px"}, - ), - # Prediction History Section - html.Div( - [ - html.H3("Prediction History", style={"color": "#34495e"}), - html.Div(id="prediction-history"), - # Auto-refresh toggle - html.Div( - [ - dcc.Checklist( - id="auto-refresh-toggle", - options=[ - {"label": "Auto-refresh (5s)", "value": "auto"} - ], - value=[], - 
style={"marginTop": "10px"}, - ), - dcc.Interval( - id="interval-component", - interval=5 * 1000, # in milliseconds - n_intervals=0, - disabled=True, - ), - ] - ), - ] - ), - ], - style={"margin": "20px", "fontFamily": "Arial, sans-serif"}, - ) - - -def create_status_cards(model_metadata: dict[str, Any]) -> html.Div: - """Create status cards showing model information. - - Args: - model_metadata: Model metadata dictionary - - Returns: - Div containing status cards - """ - model_loaded = bool(model_metadata) - status_color = "#27ae60" if model_loaded else "#e74c3c" - status_text = "Loaded" if model_loaded else "Not Loaded" - - return html.Div( - [ - # Model Status Card - html.Div( - [ - html.H4("Model Status", style={"margin": "0", "color": "#2c3e50"}), - html.P( - status_text, - style={ - "margin": "5px 0", - "color": status_color, - "fontWeight": "bold", - }, - ), - html.P( - f"Version: {model_metadata.get('version', 'Unknown')}", - style={"margin": "5px 0", "color": "#7f8c8d"}, - ), - html.P( - f"Stage: {model_metadata.get('stage', 'Unknown')}", - style={"margin": "5px 0", "color": "#7f8c8d"}, - ), - ], - style={ - "border": "1px solid #bdc3c7", - "padding": "15px", - "borderRadius": "5px", - "width": "300px", - "display": "inline-block", - "margin": "10px", - }, - ), - # Prediction Stats Card (placeholder) - html.Div( - [ - html.H4( - "Prediction Stats", style={"margin": "0", "color": "#2c3e50"} - ), - html.P( - "Total Predictions: 0", - style={"margin": "5px 0", "color": "#7f8c8d"}, - ), - html.P( - "Last Prediction: None", - style={"margin": "5px 0", "color": "#7f8c8d"}, - ), - ], - style={ - "border": "1px solid #bdc3c7", - "padding": "15px", - "borderRadius": "5px", - "width": "300px", - "display": "inline-block", - "margin": "10px", - }, - ), - ] - ) - - -def create_manual_input() -> html.Div: - """Create manual input form with actual personality features. 
- - Returns: - Div containing manual input components - """ - return html.Div( - [ - html.P( - "Enter your personality traits below:", - style={"fontSize": "16px", "marginBottom": "20px", "color": "#2c3e50"}, - ), - # Time spent alone - html.Div( - [ - html.Label( - "Time Spent Alone (hours per day):", - style={ - "display": "block", - "fontWeight": "bold", - "marginBottom": "5px", - }, - ), - dcc.Input( - id="time-spent-alone", - type="number", - value=2.0, - min=0, - max=24, - step=0.5, - style={"margin": "5px", "width": "200px", "padding": "5px"}, - ), - ], - style={"marginBottom": "15px"}, - ), - # Social event attendance - html.Div( - [ - html.Label( - "Social Event Attendance (events per month):", - style={ - "display": "block", - "fontWeight": "bold", - "marginBottom": "5px", - }, - ), - dcc.Input( - id="social-event-attendance", - type="number", - value=4.0, - min=0, - max=30, - step=1, - style={"margin": "5px", "width": "200px", "padding": "5px"}, - ), - ], - style={"marginBottom": "15px"}, - ), - # Going outside - html.Div( - [ - html.Label( - "Going Outside (frequency per week):", - style={ - "display": "block", - "fontWeight": "bold", - "marginBottom": "5px", - }, - ), - dcc.Input( - id="going-outside", - type="number", - value=3.0, - min=0, - max=7, - step=1, - style={"margin": "5px", "width": "200px", "padding": "5px"}, - ), - ], - style={"marginBottom": "15px"}, - ), - # Friends circle size - html.Div( - [ - html.Label( - "Friends Circle Size:", - style={ - "display": "block", - "fontWeight": "bold", - "marginBottom": "5px", - }, - ), - dcc.Input( - id="friends-circle-size", - type="number", - value=8.0, - min=0, - max=50, - step=1, - style={"margin": "5px", "width": "200px", "padding": "5px"}, - ), - ], - style={"marginBottom": "15px"}, - ), - # Post frequency - html.Div( - [ - html.Label( - "Social Media Post Frequency (posts per week):", - style={ - "display": "block", - "fontWeight": "bold", - "marginBottom": "5px", - }, - ), - dcc.Input( - 
id="post-frequency", - type="number", - value=3.0, - min=0, - max=20, - step=1, - style={"margin": "5px", "width": "200px", "padding": "5px"}, - ), - ], - style={"marginBottom": "15px"}, - ), - # Stage fear - html.Div( - [ - html.Label( - "Do you have stage fear?", - style={ - "display": "block", - "fontWeight": "bold", - "marginBottom": "5px", - }, - ), - dcc.Dropdown( - id="stage-fear", - options=[ - {"label": "No", "value": "No"}, - {"label": "Yes", "value": "Yes"}, - {"label": "Unknown", "value": "Unknown"}, - ], - value="No", - style={"width": "200px"}, - ), - ], - style={"marginBottom": "15px"}, - ), - # Drained after socializing - html.Div( - [ - html.Label( - "Do you feel drained after socializing?", - style={ - "display": "block", - "fontWeight": "bold", - "marginBottom": "5px", - }, - ), - dcc.Dropdown( - id="drained-after-socializing", - options=[ - {"label": "No", "value": "No"}, - {"label": "Yes", "value": "Yes"}, - {"label": "Unknown", "value": "Unknown"}, - ], - value="No", - style={"width": "200px"}, - ), - ], - style={"marginBottom": "15px"}, - ), - ], - id="manual-inputs", - style={ - "padding": "20px", - "backgroundColor": "#f8f9fa", - "borderRadius": "10px", - "border": "1px solid #dee2e6", - }, - ) - - -def format_prediction_result(result: dict[str, Any]) -> html.Div: - """Format prediction result for display. 
- - Args: - result: Prediction result dictionary - - Returns: - Formatted result component - """ - prediction = result.get("prediction", "Unknown") - confidence = result.get("confidence", 0) - prob_extrovert = result.get("probability_extrovert", 0) - prob_introvert = result.get("probability_introvert", 0) - - # Create visual elements - confidence_color = ( - "#27ae60" if confidence > 0.7 else "#f39c12" if confidence > 0.5 else "#e74c3c" - ) - - # Choose personality color - personality_color = "#e74c3c" if prediction == "Extrovert" else "#3498db" - - elements = [ - html.H4( - "Personality Classification Result", - style={"color": "#2c3e50", "marginBottom": "15px"}, - ), - # Main prediction with personality-specific styling - html.Div( - [ - html.H2( - f"๐Ÿง  You are classified as: {prediction}", - style={ - "color": personality_color, - "margin": "10px 0", - "textAlign": "center", - "backgroundColor": "#ecf0f1", - "padding": "15px", - "borderRadius": "10px", - "border": f"2px solid {personality_color}", - }, - ) - ] - ), - # Confidence score - html.Div( - [ - html.P( - f"Confidence Score: {confidence:.1%}", - style={ - "fontSize": "18px", - "color": confidence_color, - "margin": "15px 0", - "textAlign": "center", - "fontWeight": "bold", - }, - ) - ] - ), - ] - - # Add detailed probability breakdown - if prob_extrovert is not None and prob_introvert is not None: - elements.append( - html.Div( - [ - html.H5( - "Detailed Probabilities:", - style={"margin": "20px 0 10px 0", "color": "#2c3e50"}, - ), - html.Div( - [ - # Extrovert bar - html.Div( - [ - html.Span( - "Extrovert: ", - style={ - "fontWeight": "bold", - "width": "100px", - "display": "inline-block", - }, - ), - html.Div( - style={ - "backgroundColor": "#e74c3c", - "width": f"{prob_extrovert * 100}%", - "height": "20px", - "borderRadius": "10px", - "display": "inline-block", - "marginRight": "10px", - "minWidth": "2px", - } - ), - html.Span( - f"{prob_extrovert:.1%}", - style={"fontWeight": "bold"}, - ), - ], - 
style={ - "margin": "10px 0", - "display": "flex", - "alignItems": "center", - }, - ), - # Introvert bar - html.Div( - [ - html.Span( - "Introvert: ", - style={ - "fontWeight": "bold", - "width": "100px", - "display": "inline-block", - }, - ), - html.Div( - style={ - "backgroundColor": "#3498db", - "width": f"{prob_introvert * 100}%", - "height": "20px", - "borderRadius": "10px", - "display": "inline-block", - "marginRight": "10px", - "minWidth": "2px", - } - ), - html.Span( - f"{prob_introvert:.1%}", - style={"fontWeight": "bold"}, - ), - ], - style={ - "margin": "10px 0", - "display": "flex", - "alignItems": "center", - }, - ), - ], - style={ - "backgroundColor": "#f8f9fa", - "padding": "15px", - "borderRadius": "8px", - "border": "1px solid #dee2e6", - }, - ), - ] - ) - ) - - # Add personality description - if prediction == "Extrovert": - description = "๐ŸŽ‰ Extroverts typically enjoy social situations, feel energized by being around people, and tend to be outgoing and expressive." - description_color = "#e74c3c" - elif prediction == "Introvert": - description = "๐Ÿค” Introverts typically prefer quieter environments, feel energized by alone time, and tend to be more reflective and reserved." - description_color = "#3498db" - else: - description = "The model could not clearly determine your personality type." 
- description_color = "#7f8c8d" - - elements.append( - html.Div( - [ - html.P( - description, - style={ - "fontSize": "14px", - "color": description_color, - "margin": "15px 0", - "padding": "10px", - "backgroundColor": "#ecf0f1", - "borderRadius": "5px", - "fontStyle": "italic", - }, - ) - ] - ) - ) - - # Add metadata - elements.append( - html.Div( - [ - html.Hr(style={"margin": "20px 0"}), - html.P( - f"Model: {result.get('model_name', 'Unknown')}", - style={"color": "#7f8c8d", "margin": "5px 0", "fontSize": "12px"}, - ), - html.P( - f"Version: {result.get('model_version', 'Unknown')}", - style={"color": "#7f8c8d", "margin": "5px 0", "fontSize": "12px"}, - ), - html.P( - f"Timestamp: {result.get('timestamp', 'Unknown')}", - style={"color": "#7f8c8d", "margin": "5px 0", "fontSize": "12px"}, - ), - ] - ) - ) - - return html.Div( - elements, - style={ - "border": "2px solid " + confidence_color, - "padding": "20px", - "borderRadius": "10px", - "backgroundColor": "#ffffff", - "boxShadow": "0 2px 4px rgba(0,0,0,0.1)", - }, - ) diff --git a/docs/README.md b/docs/README.md index e610682..ff5884e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,281 +1,64 @@ # Documentation Index -Welcome to the comprehensive documentation for the Six-Stack Personality Classification Pipeline. This documentation covers everything from basic usage to advanced deployment strategies. 
- -## ๐Ÿ“š Documentation Structure - -### ๐ŸŽฏ Core Guides - -#### [Technical Guide](technical-guide.md) - -**Deep technical dive into the architecture and algorithms** - -- Modular design principles and SOLID architecture -- Algorithm implementation details for all 6 stacks -- Ensemble strategy and out-of-fold prediction methodology -- Data processing pipeline with external integration -- Advanced preprocessing and feature engineering -- Error handling, robustness, and reproducibility -- Extension points for customization - -#### [API Reference](api-reference.md) - -**Complete module and function documentation** - -- All 8 core modules with detailed interfaces -- Function signatures, parameters, and return types -- Type hints and validation patterns -- Error handling and exception classes -- Usage examples for each component -- Configuration options and enums - -### ๐Ÿ”ง Configuration and Tuning - -#### [Configuration Guide](configuration.md) - -**Comprehensive configuration reference** - -- Core parameters and reproducibility settings -- Threading and parallelization configuration -- Data augmentation method selection and tuning -- Environment-specific configuration profiles -- Validation and debugging strategies -- Best practices for different scenarios - -#### [Performance Tuning Guide](performance-tuning.md) - -**Optimization strategies for speed, memory, and accuracy** - -- Speed optimization for development and production -- Memory management for constrained environments -- Accuracy optimization through advanced ensemble strategies -- Threading and parallelization best practices -- I/O optimization and caching strategies -- Model-specific performance tuning -- Monitoring and profiling techniques - -### ๐Ÿค– Advanced Features - -#### [Data Augmentation Guide](data-augmentation.md) - -**Advanced synthetic data generation strategies** - -- Adaptive augmentation method selection -- SDV Copula, SMOTE, ADASYN, and basic methods -- Quality control framework with 
multi-dimensional assessment -- Diversity control and filtering pipeline -- Configuration options and parameter tuning -- Performance optimization and best practices -- Troubleshooting and diagnostic tools - -### ๐Ÿš€ Deployment - -#### [Deployment Guide](deployment.md) - -**Production deployment instructions** - -- Local server deployment with systemd services -- Docker containerization and Docker Compose -- Kubernetes deployment with scaling and monitoring -- Cloud platform deployment (AWS, GCP, Azure) -- REST API service with FastAPI -- Monitoring, logging, and security best practices -- Backup, recovery, and troubleshooting - -## ๐ŸŽ“ Getting Started Path - -### For New Users - -1. **Start with the main [README](../README.md)** for quick setup -2. **Try the examples** in `examples/` directory -3. **Read the [Configuration Guide](configuration.md)** for basic customization -4. **Explore the [Technical Guide](technical-guide.md)** for deeper understanding +# Documentation Index -### For Developers +Welcome! This documentation covers all aspects of the Six-Stack Personality Classification Pipeline. -1. **Review the [API Reference](api-reference.md)** for module interfaces -2. **Study the [Technical Guide](technical-guide.md)** for architecture details -3. **Follow the [Performance Tuning Guide](performance-tuning.md)** for optimization -4. **Check the [Data Augmentation Guide](data-augmentation.md)** for advanced features +## Main Guides -### For DevOps/Deployment +- [Technical Guide](technical-guide.md): Architecture, algorithms, and stacks +- [API Reference](api-reference.md): Modules, functions, and usage +- [Configuration Guide](configuration.md): All config options +- [Performance Tuning](performance-tuning.md): Speed, memory, accuracy +- [Data Augmentation](data-augmentation.md): Synthetic data strategies +- [Deployment Guide](deployment.md): Docker, Compose, production -1. **Read the [Deployment Guide](deployment.md)** for production setup -2. 
**Configure monitoring** using the deployment examples -3. **Set up CI/CD** following the containerization examples -4. **Implement backup strategies** from the deployment guide +## Quick Start -## ๐Ÿ“Š Quick Reference +1. See [README](../README.md) for setup +2. Try examples in `examples/` +3. Read [Configuration Guide](configuration.md) for customization +4. Explore [Technical Guide](technical-guide.md) for details -### Configuration Quick Start +## Quick Reference +**Config:** ```python -# Development (fast iteration) -TESTING_MODE = True +TESTING_MODE = True # Fast dev N_TRIALS_STACK = 5 ENABLE_DATA_AUGMENTATION = False - -# Production (high accuracy) +``` +**Production:** +```python TESTING_MODE = False N_TRIALS_STACK = 100 AUGMENTATION_METHOD = "sdv_copula" ``` - -### Performance Quick Wins - -```python -# Speed optimization -ThreadConfig.N_JOBS = 4 -N_TRIALS_STACK = 50 -AUGMENTATION_METHOD = "smote" - -# Memory optimization -TESTING_SAMPLE_SIZE = 1000 -ThreadConfig.N_JOBS = 2 -ENABLE_DATA_AUGMENTATION = False -``` - -### Docker Quick Deploy - +**Docker:** ```bash -# Build and run +docker-compose up --build +# or docker build -t personality-classifier . 
-docker run -d --name pc -p 8080:8080 personality-classifier - -# With Docker Compose -docker-compose up -d +docker run -p 8080:8080 personality-classifier ``` -## ๐Ÿ” Finding What You Need - -### By Use Case - -| Use Case | Primary Guide | Supporting Docs | -| ------------------------------ | ------------------------------------------- | ------------------------------------------- | -| **Quick prototyping** | [README](../README.md) | [Configuration](configuration.md) | -| **Understanding architecture** | [Technical Guide](technical-guide.md) | [API Reference](api-reference.md) | -| **Optimizing performance** | [Performance Tuning](performance-tuning.md) | [Configuration](configuration.md) | -| **Improving accuracy** | [Data Augmentation](data-augmentation.md) | [Technical Guide](technical-guide.md) | -| **Production deployment** | [Deployment Guide](deployment.md) | [Performance Tuning](performance-tuning.md) | -| **Custom development** | [API Reference](api-reference.md) | [Technical Guide](technical-guide.md) | - -### By Component - -| Component | Documentation | -| ------------------ | ------------------------------------------------------------- | -| **Config system** | [Configuration Guide](configuration.md) | -| **Data loading** | [API Reference](api-reference.md#data_loaderpy) | -| **Preprocessing** | [API Reference](api-reference.md#preprocessingpy) | -| **Augmentation** | [Data Augmentation Guide](data-augmentation.md) | -| **Model builders** | [API Reference](api-reference.md#model_builderspy) | -| **Ensemble** | [Technical Guide](technical-guide.md#ensemble-strategy) | -| **Optimization** | [API Reference](api-reference.md#optimizationpy) | -| **Main pipeline** | [Technical Guide](technical-guide.md#architecture-philosophy) | - -### By Problem - -| Problem | Solution Location | -| ------------------------ | ------------------------------------------------------------------------------------------------------------ | -| **Slow training** | [Performance 
Tuning](performance-tuning.md#speed-optimization) | -| **Memory issues** | [Performance Tuning](performance-tuning.md#memory-optimization) | -| **Poor accuracy** | [Data Augmentation](data-augmentation.md), [Performance Tuning](performance-tuning.md#accuracy-optimization) | -| **Configuration errors** | [Configuration Guide](configuration.md#validation-and-error-handling) | -| **Deployment issues** | [Deployment Guide](deployment.md#troubleshooting) | -| **Understanding code** | [API Reference](api-reference.md), [Technical Guide](technical-guide.md) | - -## ๐Ÿ› ๏ธ Development Resources - -### Code Examples - -- **Basic usage**: `examples/minimal_test.py` -- **Development workflow**: `examples/main_demo.py` -- **Production pipeline**: `src/main_modular.py` -- **Module testing**: `examples/test_modules.py` - -### Configuration Templates - -- **Development**: [Configuration Guide](configuration.md#development-presets) -- **Production**: [Configuration Guide](configuration.md#production-server) -- **Docker**: [Deployment Guide](deployment.md#docker-deployment) -- **Kubernetes**: [Deployment Guide](deployment.md#kubernetes-deployment) - -### Monitoring and Debugging - -- **Performance monitoring**: [Performance Tuning](performance-tuning.md#monitoring-and-profiling) -- **Structured logging**: [Deployment Guide](deployment.md#structured-logging) -- **Quality diagnostics**: [Data Augmentation](data-augmentation.md#debugging-augmentation) - -## ๐Ÿ“ˆ Advanced Topics - -### Research and Experimentation - -- **Adding new model stacks**: [Technical Guide](technical-guide.md#adding-new-model-stacks) -- **Custom augmentation methods**: [Data Augmentation](data-augmentation.md#future-enhancements) -- **Meta-learning approaches**: [Technical Guide](technical-guide.md#future-enhancements) - -### Production Optimization - -- **Auto-scaling strategies**: [Deployment Guide](deployment.md#kubernetes-deployment) -- **A/B testing framework**: [Technical 
Guide](technical-guide.md#future-enhancements) -- **Model versioning**: [Deployment Guide](deployment.md#api-service-deployment) - -### Integration Patterns - -- **REST API development**: [Deployment Guide](deployment.md#fastapi-rest-api) -- **Batch processing**: [Deployment Guide](deployment.md#scheduled-training-with-cron) -- **Real-time inference**: [Deployment Guide](deployment.md#api-service-deployment) - -## ๐Ÿ†• What's New - -### Latest Features (v2.0) - -- โœ… **Advanced data augmentation** with SDV Copula and quality control -- โœ… **Centralized configuration** system with threading management -- โœ… **Modular architecture** with 8 specialized modules -- โœ… **Production-ready deployment** with Docker and Kubernetes support -- โœ… **Comprehensive documentation** with guides for all use cases - -### Upcoming Features - -- ๐Ÿ”„ **GPU acceleration** for neural network stacks -- ๐Ÿ”„ **AutoML integration** for automatic hyperparameter tuning -- ๐Ÿ”„ **Distributed training** support for large datasets -- ๐Ÿ”„ **Model interpretability** tools and dashboards - -## ๐Ÿ’ฌ Support and Contributing - -### Getting Help - -1. **Check this documentation** for comprehensive guides -2. **Review examples** in the `examples/` directory -3. **Search issues** in the repository -4. **Create new issue** with detailed problem description - -### Contributing - -1. **Read the [README](../README.md#contributing)** for contribution guidelines -2. **Focus on modular development** using the established architecture -3. **Add tests** for new features in the `examples/` directory -4. 
**Update documentation** for significant changes
-
-### Community
-
-- **Repository**: [GitHub Repository Link]
-- **Issues**: For bug reports and feature requests
-- **Discussions**: For questions and community support
+## Resources

---

+- Code: `src/main_modular.py`, `examples/`
+- Config templates: [Configuration Guide](configuration.md)
+- Monitoring: [Performance Tuning](performance-tuning.md)
+- Deployment: [Deployment Guide](deployment.md)

-_This documentation is continuously updated. For the latest information, check the repository and individual guide timestamps._
+## Latest Features

-## ๐Ÿ“‹ Documentation Checklist
+- Advanced SDV Copula augmentation
+- Centralized config system
+- Modular architecture
+- Dockerized deployment
+- Comprehensive documentation

-When working with the pipeline, use this checklist to find the right documentation:
+## Help & Contributing

-- [ ] **New to the project?** โ†’ Start with [README](../README.md)
-- [ ] **Need to configure settings?** โ†’ [Configuration Guide](configuration.md)
-- [ ] **Want to understand the code?** โ†’ [API Reference](api-reference.md)
-- [ ] **Looking to optimize performance?** โ†’ [Performance Tuning](performance-tuning.md)
-- [ ] **Need better accuracy?** โ†’ [Data Augmentation](data-augmentation.md)
-- [ ] **Ready for production?** โ†’ [Deployment Guide](deployment.md)
-- [ ] **Want deep technical details?** โ†’ [Technical Guide](technical-guide.md)
+- Review guides and examples
+- Search or create issues in the repo
+- See [README](../README.md#contributing) for contribution steps
diff --git a/docs/api-reference.md b/docs/api-reference.md
index bb04d89..ee82db2 100644
--- a/docs/api-reference.md
+++ b/docs/api-reference.md
@@ -1,63 +1,40 @@
 # API Reference - Six-Stack Personality Classification Pipeline
-
-## Module Overview
-
-The pipeline consists of 8 core modules, each with well-defined interfaces and responsibilities. 
- -## config.py - -### Configuration Management - -#### Global Constants - -```python -RND: int = 42 # Global random seed -N_SPLITS: int = 5 # Cross-validation folds -N_TRIALS_STACK: int = 15 # Optuna trials per stack -N_TRIALS_BLEND: int = 200 # Ensemble blending trials -LOG_LEVEL: str = "INFO" # Logging level -``` - -#### Threading Configuration - -```python -class ThreadConfig(Enum): - """Centralized threading configuration.""" - N_JOBS: int = 4 # Parallel jobs for sklearn - THREAD_COUNT: int = 4 # Thread count for XGB/LGB -``` - -#### Data Augmentation Configuration - -```python -ENABLE_DATA_AUGMENTATION: bool = True -AUGMENTATION_METHOD: str = "sdv_copula" -AUGMENTATION_RATIO: float = 0.05 -DIVERSITY_THRESHOLD: float = 0.95 -QUALITY_THRESHOLD: float = 0.7 -``` - -#### Functions - -```python -def setup_logging() -> None: - """Initialize structured logging configuration.""" - -def get_logger(name: str) -> logging.Logger: - """Get configured logger instance.""" -``` - -## data_loader.py - -### Data Loading and External Integration - -#### Primary Functions - -```python -def load_data_with_external_merge() -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Load training/test data with external dataset merge using TOP-4 strategy. 
- +## Modules & Functions + +**config.py** +- RND: int = 42 +- N_SPLITS: int = 5 +- N_TRIALS_STACK: int = 15 +- N_TRIALS_BLEND: int = 200 +- LOG_LEVEL: str = "INFO" +- ENABLE_DATA_AUGMENTATION: bool = True +- AUGMENTATION_METHOD: str = "sdv_copula" +- AUGMENTATION_RATIO: float = 0.05 +- DIVERSITY_THRESHOLD: float = 0.95 +- QUALITY_THRESHOLD: float = 0.7 +- class ThreadConfig(Enum): N_JOBS, THREAD_COUNT +- setup_logging(), get_logger(name) + +**data_loader.py** +- load_data_with_external_merge() -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame] + +**preprocessing.py** +- preprocess_data(df) -> pd.DataFrame + +**data_augmentation.py** +- augment_data(X, y, method, ratio) -> pd.DataFrame + +**model_builders.py** +- build_stack(stack_id, X, y) -> model + +**ensemble.py** +- blend_predictions(preds_list) -> np.ndarray + +**optimization.py** +- optimize_hyperparameters(model, X, y) -> dict + +**utils.py** +- Utility functions for metrics, logging, etc. Returns: tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: (train_df, test_df, submission_template) diff --git a/docs/architecture.md b/docs/architecture.md index f3e1d0d..a530c85 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -1,52 +1,21 @@ # Architecture Documentation -## Overview - -This project implements a modular personality classification pipeline with ensemble learning and hyperparameter optimization. - -## Component Architecture - -### Core Modules (`src/modules/`) - -1. **config.py** - Configuration management and logging setup -2. **data_loader.py** - Data loading with external dataset merging -3. **preprocessing.py** - Feature engineering and data preprocessing -4. **data_augmentation.py** - Data augmentation strategies -5. **model_builders.py** - Model construction for different stacks -6. **ensemble.py** - Out-of-fold predictions and ensemble methods -7. **optimization.py** - Optuna hyperparameter optimization utilities -8. 
**utils.py** - General utility functions - -### Execution Scripts - -- **src/main_modular.py** - Main production pipeline -- **examples/main_final.py** - Lightweight working example -- **examples/main_demo.py** - Demo with simplified models -- **examples/test_modules.py** - Module testing script -- **examples/minimal_test.py** - Import verification - -## Data Flow - -1. **Data Loading** โ†’ External dataset merge โ†’ Feature extraction -2. **Preprocessing** โ†’ Feature engineering โ†’ Data augmentation -3. **Model Training** โ†’ 6 specialized stacks with Optuna optimization -4. **Ensemble** โ†’ Out-of-fold predictions โ†’ Blend optimization -5. **Pseudo-labeling** โ†’ Conservative high-confidence labeling -6. **Final Prediction** โ†’ Weighted ensemble โ†’ Submission generation - -## Stack Configurations - -- **Stack A**: Traditional ML (narrow hyperparameters) -- **Stack B**: Traditional ML (wide hyperparameters) -- **Stack C**: XGBoost + CatBoost specialized -- **Stack D**: Sklearn ensemble models -- **Stack E**: Neural network models -- **Stack F**: Noisy label training - -## Performance Features - -- Memory-efficient processing -- CPU-optimized configurations -- Robust error handling with timeouts -- Modular testing capabilities -- Comprehensive logging +## Architecture Overview + +- Modular pipeline: 8 core modules in `src/modules/` +- Main pipeline: `src/main_modular.py` +- Dashboard: `dash_app/` (Dash, Docker) +- Model stacks: 6 specialized ensembles (A-F) +- Data flow: Load โ†’ Preprocess โ†’ Augment โ†’ Train โ†’ Ensemble โ†’ Predict + +## Stacks +- A: Traditional ML (narrow) +- B: Traditional ML (wide) +- C: XGBoost/CatBoost +- D: Sklearn ensemble +- E: Neural networks +- F: Noise-robust + +## Key Features +- Efficient, reproducible, and testable +- Full logging and error handling diff --git a/docs/configuration.md b/docs/configuration.md deleted file mode 100644 index 423ebee..0000000 --- a/docs/configuration.md +++ /dev/null @@ -1,544 +0,0 @@ -# 
Configuration Guide - -## Overview - -The Six-Stack Personality Classification Pipeline provides extensive configuration options through the centralized `src/modules/config.py` file. This guide covers all configuration parameters, their purposes, and best practices for tuning. - -## Configuration Architecture - -### Centralized Configuration - -All configuration is managed through a single module to ensure: - -- **Consistency** across all components -- **Easy maintenance** and updates -- **Environment-specific** settings -- **Type safety** with enums and validation - -### Configuration Categories - -1. **Core Parameters** - Basic pipeline settings -2. **Threading Configuration** - Parallel processing control -3. **Data Augmentation** - Synthetic data generation -4. **Model Training** - Algorithm-specific settings -5. **Development** - Testing and debugging options -6. **Logging** - Output and monitoring control - -## Core Parameters - -### Reproducibility Settings - -```python -# Global random seed for reproducibility -RND: int = 42 - -# Description: Controls all random number generation across the pipeline -# Impact: Ensures reproducible results across runs -# Tuning: Change only when you need different random behavior -# Valid Range: Any integer (0-2^31) -``` - -### Cross-Validation Configuration - -```python -# Number of stratified folds for cross-validation -N_SPLITS: int = 5 - -# Description: Controls k-fold cross-validation splitting -# Impact: More folds = more reliable estimates but longer training -# Tuning: 3-10 folds typically, 5 is standard -# Memory Impact: Linear increase with more folds -``` - -### Hyperparameter Optimization - -```python -# Optuna trials per individual stack -N_TRIALS_STACK: int = 15 - -# Description: Number of hyperparameter combinations to try per stack -# Impact: More trials = better optimization but longer training -# Tuning Guidelines: -# - Development: 5-15 trials -# - Production: 50-200 trials -# - Competition: 500+ trials -# 
Time Impact: Linear increase with trial count - -# Ensemble blending optimization trials -N_TRIALS_BLEND: int = 200 - -# Description: Trials for optimizing ensemble weights -# Impact: Critical for final performance, usually converges quickly -# Tuning: 100-500 trials, diminishing returns after 200 -``` - -## Threading Configuration - -### Thread Management Enum - -```python -class ThreadConfig(Enum): - """Centralized threading configuration for all models.""" - - N_JOBS: int = 4 # sklearn parallel jobs - THREAD_COUNT: int = 4 # XGBoost/LightGBM threads -``` - -### Optimization Guidelines - -#### System-Specific Tuning - -```python -# For development machines (4-8 cores) -N_JOBS = 2 -THREAD_COUNT = 2 - -# For production servers (16+ cores) -N_JOBS = 8 -THREAD_COUNT = 8 - -# For memory-constrained environments -N_JOBS = 1 -THREAD_COUNT = 1 - -# Auto-detection approach -import multiprocessing -optimal_threads = min(multiprocessing.cpu_count(), 8) -``` - -#### Performance vs Resource Trade-offs - -| Setting | Training Speed | Memory Usage | CPU Usage | -| ----------- | -------------- | ------------ | --------- | -| 1 thread | Slowest | Lowest | Low | -| 2-4 threads | Moderate | Moderate | Medium | -| 8+ threads | Fastest | Highest | High | - -## Data Augmentation Configuration - -### Main Augmentation Settings - -```python -# Enable/disable data augmentation globally -ENABLE_DATA_AUGMENTATION: bool = True - -# Augmentation method selection -AUGMENTATION_METHOD: str = "sdv_copula" -# Options: "auto", "sdv_copula", "smote", "adasyn", "basic" - -# Augmentation ratio (fraction of original dataset) -AUGMENTATION_RATIO: float = 0.05 # 5% additional synthetic data -``` - -### Method Selection Guide - -#### "auto" (Recommended) - -- **Best for**: Most use cases -- **Behavior**: Automatically selects optimal method based on data characteristics -- **Fallback**: Always provides a working solution - -#### "sdv_copula" - -- **Best for**: Large datasets with complex distributions 
-- **Pros**: High-quality synthetic data, preserves correlations -- **Cons**: Computationally intensive, requires more memory -- **Use when**: Dataset >5K samples, complex feature interactions - -#### "smote" - -- **Best for**: Small to medium datasets with class imbalance -- **Pros**: Fast, well-tested, handles imbalance well -- **Cons**: May create unrealistic edge cases -- **Use when**: Dataset <5K samples, clear class imbalance - -#### "adasyn" - -- **Best for**: Severely imbalanced datasets -- **Pros**: Adaptive to difficult examples, improved boundary learning -- **Cons**: Sensitive to noise, may overfit to outliers -- **Use when**: Extreme imbalance (>90% majority class) - -#### "basic" - -- **Best for**: High-categorical datasets or fallback -- **Pros**: Fast, simple, always works -- **Cons**: Lower quality, limited sophistication -- **Use when**: Many categorical features, quick prototyping - -### Quality Control Parameters - -```python -# Quality filtering threshold (0-1, higher = stricter) -QUALITY_THRESHOLD: float = 0.7 - -# Diversity requirement (0-1, higher = more diverse) -DIVERSITY_THRESHOLD: float = 0.95 - -# Method-specific parameters -SDV_EPOCHS: int = 100 # SDV training epochs (5 in testing) -SMOTE_K_NEIGHBORS: int = 5 # k for SMOTE (auto-adjusted) -BASIC_NOISE_FACTOR: float = 0.1 # Noise factor for basic method -``` - -### Advanced Augmentation Tuning - -#### Quality Threshold Tuning - -```python -# Conservative (high quality, fewer samples) -QUALITY_THRESHOLD = 0.8 - -# Balanced (moderate quality, moderate samples) -QUALITY_THRESHOLD = 0.7 - -# Aggressive (lower quality, more samples) -QUALITY_THRESHOLD = 0.6 - -# Development/testing (relaxed quality) -QUALITY_THRESHOLD = 0.5 -``` - -#### Ratio Optimization Strategy - -```python -# Start conservative and increase -AUGMENTATION_RATIOS = [0.02, 0.05, 0.10, 0.15, 0.20] - -# Monitor cross-validation scores -for ratio in AUGMENTATION_RATIOS: - AUGMENTATION_RATIO = ratio - cv_score = 
evaluate_pipeline() - if cv_score < previous_best: - break # Diminishing returns detected -``` - -## Model Training Configuration - -### Label Noise for Robustness - -```python -# Label noise rate for Stack F (noise-robust training) -LABEL_NOISE_RATE: float = 0.02 # 2% of labels randomly flipped - -# Description: Improves generalization by training on noisy labels -# Impact: Better robustness to annotation errors -# Tuning Range: 0.01-0.05 (1-5%) -# Warning: Too much noise degrades performance -``` - -### Timeout and Resource Limits - -```python -# Training timeout per stack (seconds) -STACK_TIMEOUT: int = 1800 # 30 minutes - -# Memory limit warning threshold (GB) -MEMORY_WARNING_THRESHOLD: float = 8.0 - -# Early stopping patience for neural networks -EARLY_STOPPING_PATIENCE: int = 10 -``` - -## Development and Testing - -### Testing Mode Configuration - -```python -# Enable reduced dataset for faster development -TESTING_MODE: bool = True - -# Sample size in testing mode -TESTING_SAMPLE_SIZE: int = 1000 - -# Reduced trials in testing mode -TESTING_N_TRIALS_STACK: int = 5 -TESTING_N_TRIALS_BLEND: int = 50 - -# Fast augmentation in testing -TESTING_SDV_EPOCHS: int = 5 -``` - -### Development Presets - -```python -# Quick development preset -def configure_for_development(): - global TESTING_MODE, N_TRIALS_STACK, ENABLE_DATA_AUGMENTATION - TESTING_MODE = True - N_TRIALS_STACK = 5 - ENABLE_DATA_AUGMENTATION = False - logger.info("Configured for rapid development") - -# Full production preset -def configure_for_production(): - global TESTING_MODE, N_TRIALS_STACK, N_TRIALS_BLEND - TESTING_MODE = False - N_TRIALS_STACK = 100 - N_TRIALS_BLEND = 300 - logger.info("Configured for production run") -``` - -## Logging Configuration - -### Log Level Settings - -```python -# Logging level -LOG_LEVEL: str = "INFO" - -# Options: "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL" -# DEBUG: Very detailed information for debugging -# INFO: General information about progress (recommended) 
-# WARNING: Important warnings and issues -# ERROR: Only error messages -``` - -### Advanced Logging Configuration - -```python -# Log file configuration -LOG_FILE: str = "personality_classifier.log" -LOG_MAX_SIZE: int = 10 * 1024 * 1024 # 10MB -LOG_BACKUP_COUNT: int = 5 - -# Performance logging -ENABLE_PERFORMANCE_LOGGING: bool = True -LOG_MEMORY_USAGE: bool = True -LOG_TIMING_INFO: bool = True - -# Progress bar configuration -SHOW_PROGRESS_BARS: bool = True -PROGRESS_BAR_STYLE: str = "tqdm" # "tqdm" or "simple" -``` - -## Environment-Specific Configuration - -### Configuration Profiles - -#### Local Development - -```python -# config_development.py -TESTING_MODE = True -N_TRIALS_STACK = 5 -N_TRIALS_BLEND = 50 -ENABLE_DATA_AUGMENTATION = False -LOG_LEVEL = "DEBUG" -ThreadConfig.N_JOBS = 2 -ThreadConfig.THREAD_COUNT = 2 -``` - -#### CI/CD Pipeline - -```python -# config_ci.py -TESTING_MODE = True -N_TRIALS_STACK = 3 -N_TRIALS_BLEND = 20 -ENABLE_DATA_AUGMENTATION = False -LOG_LEVEL = "WARNING" -ThreadConfig.N_JOBS = 1 -ThreadConfig.THREAD_COUNT = 1 -``` - -#### Production Server - -```python -# config_production.py -TESTING_MODE = False -N_TRIALS_STACK = 100 -N_TRIALS_BLEND = 300 -ENABLE_DATA_AUGMENTATION = True -AUGMENTATION_METHOD = "sdv_copula" -LOG_LEVEL = "INFO" -ThreadConfig.N_JOBS = 8 -ThreadConfig.THREAD_COUNT = 8 -``` - -### Environment Variable Integration - -```python -import os - -# Override with environment variables -N_TRIALS_STACK = int(os.getenv('PERSONALITY_TRIALS_STACK', N_TRIALS_STACK)) -TESTING_MODE = os.getenv('PERSONALITY_TESTING_MODE', 'false').lower() == 'true' -LOG_LEVEL = os.getenv('PERSONALITY_LOG_LEVEL', LOG_LEVEL) - -# Docker environment detection -if os.getenv('RUNNING_IN_DOCKER'): - ThreadConfig.N_JOBS = min(ThreadConfig.N_JOBS, 4) - MEMORY_WARNING_THRESHOLD = 2.0 # Lower threshold in containers -``` - -## Performance Tuning Guidelines - -### Memory Optimization - -```python -# For systems with <8GB RAM -TESTING_MODE = True 
-TESTING_SAMPLE_SIZE = 500 -ThreadConfig.N_JOBS = 1 -ENABLE_DATA_AUGMENTATION = False - -# For systems with 8-16GB RAM (recommended) -TESTING_SAMPLE_SIZE = 1000 -ThreadConfig.N_JOBS = 2 -AUGMENTATION_RATIO = 0.03 - -# For systems with >16GB RAM -ThreadConfig.N_JOBS = 4 -AUGMENTATION_RATIO = 0.05 -N_TRIALS_STACK = 50 -``` - -### Speed Optimization - -```python -# Fastest configuration (for quick iteration) -TESTING_MODE = True -N_TRIALS_STACK = 3 -N_TRIALS_BLEND = 20 -ENABLE_DATA_AUGMENTATION = False -SHOW_PROGRESS_BARS = False - -# Balanced configuration (development) -N_TRIALS_STACK = 15 -N_TRIALS_BLEND = 100 -AUGMENTATION_METHOD = "smote" # Faster than SDV - -# Quality-focused configuration (production) -N_TRIALS_STACK = 100 -N_TRIALS_BLEND = 300 -AUGMENTATION_METHOD = "sdv_copula" -``` - -### GPU Configuration (Future) - -```python -# GPU settings (when available) -USE_GPU: bool = False -GPU_MEMORY_FRACTION: float = 0.8 -ENABLE_MIXED_PRECISION: bool = False - -# GPU-specific model settings -GPU_BATCH_SIZE: int = 64 -GPU_N_ESTIMATORS_FACTOR: float = 2.0 # Increase for GPU -``` - -## Validation and Error Handling - -### Configuration Validation - -```python -def validate_configuration(): - """Validate configuration parameters.""" - assert 0 < AUGMENTATION_RATIO <= 1.0, "Invalid augmentation ratio" - assert N_SPLITS >= 2, "Need at least 2 CV folds" - assert 0 <= LABEL_NOISE_RATE <= 0.2, "Label noise rate too high" - assert ThreadConfig.N_JOBS >= 1, "Need at least 1 job" - - if TESTING_MODE and N_TRIALS_STACK > 20: - logger.warning("High trial count in testing mode may be slow") - - if not ENABLE_DATA_AUGMENTATION and AUGMENTATION_RATIO > 0: - logger.warning("Augmentation ratio set but augmentation disabled") -``` - -### Configuration Debugging - -```python -def log_configuration(): - """Log current configuration for debugging.""" - logger.info("Configuration Summary:") - logger.info(f" Mode: {'Testing' if TESTING_MODE else 'Production'}") - logger.info(f" Trials 
per stack: {N_TRIALS_STACK}") - logger.info(f" Augmentation: {AUGMENTATION_METHOD if ENABLE_DATA_AUGMENTATION else 'Disabled'}") - logger.info(f" Threading: {ThreadConfig.N_JOBS} jobs, {ThreadConfig.THREAD_COUNT} threads") - logger.info(f" Random seed: {RND}") -``` - -## Configuration Best Practices - -### 1. Start Conservative - -- Begin with default settings -- Use testing mode for development -- Gradually increase complexity - -### 2. Monitor Resources - -- Watch memory usage during training -- Monitor CPU utilization -- Adjust threading based on available resources - -### 3. Validate Changes - -- Test configuration changes on small datasets first -- Compare cross-validation scores -- Ensure reproducibility with fixed seeds - -### 4. Document Customizations - -- Comment configuration changes -- Track performance impacts -- Maintain environment-specific configs - -### 5. Use Version Control - -- Track configuration changes -- Tag configurations with results -- Maintain separate configs for different environments - -## Troubleshooting Common Issues - -### Memory Issues - -```python -# Reduce memory usage -TESTING_MODE = True -ThreadConfig.N_JOBS = 1 -ENABLE_DATA_AUGMENTATION = False -TESTING_SAMPLE_SIZE = 500 -``` - -### Slow Training - -```python -# Speed up training -N_TRIALS_STACK = 5 -N_TRIALS_BLEND = 50 -AUGMENTATION_METHOD = "basic" -SHOW_PROGRESS_BARS = False -``` - -### Poor Performance - -```python -# Increase optimization -N_TRIALS_STACK = 100 -N_TRIALS_BLEND = 300 -AUGMENTATION_METHOD = "sdv_copula" -AUGMENTATION_RATIO = 0.08 -``` - -### Reproducibility Issues - -```python -# Ensure reproducibility -# Set fixed seed -RND = 42 - -# Single-threaded for determinism -ThreadConfig.N_JOBS = 1 -ThreadConfig.THREAD_COUNT = 1 - -# Disable random augmentation -ENABLE_DATA_AUGMENTATION = False -``` - ---- - -_This configuration guide covers all current options. 
For the latest parameters and features, check the source code in `src/modules/config.py`._ diff --git a/docs/data-augmentation.md b/docs/data-augmentation.md index 9ad61c8..df8b39f 100644 --- a/docs/data-augmentation.md +++ b/docs/data-augmentation.md @@ -1,63 +1,21 @@ # Data Augmentation Guide -## Overview +## Data Augmentation Guide -The Six-Stack Personality Classification Pipeline features an advanced, adaptive data augmentation system designed to improve model generalization and performance through high-quality synthetic data generation. - -## Architecture - -### Adaptive Strategy Selection - -The pipeline automatically selects the optimal augmentation method based on dataset characteristics: - -```python -def analyze_data_characteristics(X, y): - """Analyze dataset to determine optimal augmentation strategy.""" - return { - 'n_samples': len(X), - 'n_features': X.shape[1], - 'class_balance_ratio': min(y.value_counts()) / max(y.value_counts()), - 'categorical_ratio': (X.dtypes == 'object').sum() / len(X.columns), - 'feature_complexity': calculate_feature_complexity(X), - 'is_small_dataset': len(X) < 1000, - 'is_imbalanced': min(y.value_counts()) / max(y.value_counts()) < 0.3, - 'is_highly_categorical': (X.dtypes == 'object').sum() / len(X.columns) > 0.5 - } -``` +### Strategy +- Adaptive selection based on dataset size, balance, and feature types ### Decision Matrix - -| Dataset Characteristics | Recommended Method | Rationale | -| -------------------------------- | ------------------ | --------------------------------- | -| Small datasets (<1K samples) | SMOTE | Fast, proven for small data | -| Severe imbalance (<30% minority) | ADASYN | Adaptive sampling for minorities | -| High categorical (>50%) | Basic | Simple methods for categorical | -| Complex numerical data | SDV Copula | Preserves complex distributions | -| Large balanced datasets | SDV Copula | Best quality for complex patterns | - -## Augmentation Methods - -### 1. 
SDV Copula (Recommended) - -**Best for**: Large datasets with complex feature distributions - -#### Features - -- **Gaussian Copula modeling** for complex dependency structures -- **Marginal distribution preservation** for each feature -- **Correlation structure maintenance** across features -- **Fast training mode** for development/testing - -#### Implementation - -```python -def sdv_copula_augmentation(X, y, n_samples): - """Generate synthetic data using SDV Gaussian Copula.""" - # Combine features and target - data = X.copy() - data['target'] = y - - # Configure copula synthesizer +| Data Type | Method | +|-------------------|---------------| +| Small/Imbalanced | SMOTE/ADASYN | +| High Categorical | Basic | +| Complex Numeric | SDV Copula | + +### Main Method +**SDV Copula** (recommended): +- Preserves feature distributions and correlations +- Fast mode for development synthesizer = GaussianCopula( enforce_rounding=True, enforce_min_max_values=True diff --git a/docs/deployment.md b/docs/deployment.md deleted file mode 100644 index 431f90d..0000000 --- a/docs/deployment.md +++ /dev/null @@ -1,846 +0,0 @@ -# Deployment Guide - -## Overview - -This guide covers deploying the Six-Stack Personality Classification Pipeline using modern containerization and orchestration technologies. The focus is on **Docker containerization** and **Kubernetes orchestration** with a **Dash web application** for interactive model serving. 
- -## Deployment Strategy - -### Core Technologies - -- **๐Ÿณ Docker**: Containerization for consistent environments -- **โ˜ธ๏ธ Kubernetes**: Container orchestration and scaling -- **๐Ÿ“Š Dash**: Interactive web application for model inference -- **๐Ÿ“ˆ Monitoring**: Prometheus and Grafana integration - -### Architecture Overview - -``` -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” -โ”‚ Dash Web App โ”‚โ”€โ”€โ”€โ–ถโ”‚ ML Pipeline โ”‚โ”€โ”€โ”€โ–ถโ”‚ Data Store โ”‚ -โ”‚ (Port 8050) โ”‚ โ”‚ (Containers) โ”‚ โ”‚ (Volumes) โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ - โ”‚ โ”‚ โ”‚ - โ–ผ โ–ผ โ–ผ -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” -โ”‚ Load Balancer โ”‚ โ”‚ Kubernetes โ”‚ โ”‚ Monitoring โ”‚ -โ”‚ (Ingress) โ”‚ โ”‚ Cluster โ”‚ โ”‚ (Prometheus) โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ -``` - -## ๐Ÿณ Docker Deployment - -### Prerequisites - -```bash -# System requirements -- Docker 20.10+ -- Docker Compose 2.0+ -- 8GB+ RAM available for containers -- 4+ CPU cores -- 20GB+ disk space - -# Install Docker (Ubuntu/Debian) -curl -fsSL https://get.docker.com -o get-docker.sh -sudo sh get-docker.sh -sudo usermod -aG docker $USER - -# Install Docker Compose -sudo apt install docker-compose-plugin -``` - -### Dockerfile - -```dockerfile -# Multi-stage build for optimal image size -FROM python:3.11-slim as builder - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - gcc \ - g++ \ - && rm -rf /var/lib/apt/lists/* - -# Install 
uv -COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv - -# Set working directory -WORKDIR /app - -# Copy dependency files -COPY pyproject.toml uv.lock ./ - -# Install dependencies -RUN uv sync --no-dev --frozen - -# Production stage -FROM python:3.11-slim - -# Install runtime dependencies -RUN apt-get update && apt-get install -y \ - && rm -rf /var/lib/apt/lists/* - -# Create non-root user -RUN useradd --create-home --shell /bin/bash personality - -# Set working directory -WORKDIR /app - -# Copy virtual environment from builder -COPY --from=builder /app/.venv /app/.venv - -# Copy source code -COPY src/ src/ -COPY data/ data/ -COPY examples/ examples/ - -# Set ownership -RUN chown -R personality:personality /app - -# Switch to non-root user -USER personality - -# Set environment variables -ENV PATH="/app/.venv/bin:$PATH" -ENV PYTHONPATH="/app/src" - -# Health check -HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ - CMD python -c "import src.modules.config; print('OK')" || exit 1 - -# Default command -CMD ["python", "src/main_modular.py"] -``` - -### Pipeline Dockerfile - -```dockerfile -# Dockerfile.pipeline - ML Training Pipeline -FROM python:3.11-slim as builder - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - gcc \ - g++ \ - && rm -rf /var/lib/apt/lists/* - -# Install uv -COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv - -# Set working directory -WORKDIR /app - -# Copy dependency files -COPY pyproject.toml uv.lock ./ - -# Install dependencies -RUN uv sync --no-dev --frozen - -# Production stage -FROM python:3.11-slim - -# Install runtime dependencies -RUN apt-get update && apt-get install -y \ - && rm -rf /var/lib/apt/lists/* - -# Create non-root user -RUN useradd --create-home --shell /bin/bash personality - -# Set working directory -WORKDIR /app - -# Copy virtual environment from builder -COPY --from=builder /app/.venv /app/.venv - -# Copy source code -COPY src/ src/ -COPY data/ data/ - -# Create 
model artifacts directory -RUN mkdir -p models best_params submissions logs - -# Set ownership -RUN chown -R personality:personality /app - -# Switch to non-root user -USER personality - -# Set environment variables -ENV PATH="/app/.venv/bin:$PATH" -ENV PYTHONPATH="/app/src" - -# Health check -HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ - CMD python -c "import src.modules.config; print('OK')" || exit 1 - -# Default command -CMD ["python", "src/main_modular.py"] -``` - -### Dash Application Dockerfile - -```dockerfile -# Dockerfile.dash - Interactive Dash Application -FROM python:3.11-slim - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - gcc \ - g++ \ - && rm -rf /var/lib/apt/lists/* - -# Install uv -COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv - -# Set working directory -WORKDIR /app - -# Copy dependency files (include dash dependencies) -COPY pyproject.toml uv.lock ./ -COPY requirements-dash.txt ./ - -# Install dependencies -RUN uv sync --no-dev --frozen -RUN uv pip install -r requirements-dash.txt - -# Copy application code -COPY src/ src/ -COPY dash_app/ dash_app/ - -# Create non-root user -RUN useradd --create-home --shell /bin/bash dashuser -RUN chown -R dashuser:dashuser /app - -# Switch to non-root user -USER dashuser - -# Set environment variables -ENV PATH="/app/.venv/bin:$PATH" -ENV PYTHONPATH="/app/src" - -# Expose Dash port -EXPOSE 8050 - -# Health check -HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \ - CMD curl -f http://localhost:8050/ || exit 1 - -# Run Dash application -CMD ["python", "dash_app/app.py"] -``` - -### Dash Requirements - -```txt -# requirements-dash.txt -dash>=2.14.0 -dash-bootstrap-components>=1.5.0 -plotly>=5.17.0 -pandas>=2.1.0 -numpy>=1.24.0 -gunicorn>=21.2.0 -``` - -### Multi-Service Docker Compose - -```yaml -# docker-compose.yml -version: "3.8" - -services: - # ML Pipeline Service - ml-pipeline: - build: - context: . 
- dockerfile: Dockerfile.pipeline - container_name: personality-ml-pipeline - restart: unless-stopped - - deploy: - resources: - limits: - memory: 8G - cpus: "4" - reservations: - memory: 2G - cpus: "1" - - environment: - - PERSONALITY_LOG_LEVEL=INFO - - PERSONALITY_TESTING_MODE=false - - RUNNING_IN_DOCKER=true - - volumes: - - ./data:/app/data:ro - - ./best_params:/app/best_params - - ./submissions:/app/submissions - - ./logs:/app/logs - - model-artifacts:/app/models - - networks: - - personality-net - - # Dash Web Application - dash-app: - build: - context: . - dockerfile: Dockerfile.dash - container_name: personality-dash-app - restart: unless-stopped - ports: - - "8050:8050" - - depends_on: - - ml-pipeline - - environment: - - DASH_HOST=0.0.0.0 - - DASH_PORT=8050 - - MODEL_PATH=/app/models - - volumes: - - model-artifacts:/app/models:ro - - ./data:/app/data:ro - - networks: - - personality-net - - # Monitoring with Prometheus - prometheus: - image: prom/prometheus:latest - container_name: prometheus - ports: - - "9090:9090" - volumes: - - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml - - prometheus-data:/prometheus - networks: - - personality-net - - # Visualization with Grafana - grafana: - image: grafana/grafana:latest - container_name: grafana - ports: - - "3000:3000" - environment: - - GF_SECURITY_ADMIN_PASSWORD=admin - volumes: - - grafana-storage:/var/lib/grafana - - ./monitoring/grafana/dashboards:/etc/grafana/provisioning/dashboards - networks: - - personality-net - -volumes: - model-artifacts: - prometheus-data: - grafana-storage: - -networks: - personality-net: - driver: bridge -``` - -### Build and Deploy with Docker Compose - -```bash -# Build all images -docker-compose build - -# Start all services -docker-compose up -d - -# View logs -docker-compose logs -f dash-app -docker-compose logs -f ml-pipeline - -# Scale pipeline instances -docker-compose up --scale ml-pipeline=3 -d - -# Stop all services -docker-compose down - -# Clean up 
(removes containers, networks, and volumes) -docker-compose down -v -``` - -## โ˜ธ๏ธ Kubernetes Deployment - -### ML Pipeline Deployment - -````yaml -# k8s/ml-pipeline-deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: ml-pipeline - labels: - app: ml-pipeline -spec: - replicas: 2 - selector: - matchLabels: - app: ml-pipeline - template: - metadata: - labels: - app: ml-pipeline - spec: - containers: - - name: ml-pipeline - image: personality-ml-pipeline:latest - - resources: - requests: - memory: "2Gi" - cpu: "500m" - limits: - memory: "8Gi" - cpu: "2000m" - - env: - - name: PERSONALITY_LOG_LEVEL - value: "INFO" - - name: RUNNING_IN_KUBERNETES - value: "true" - - volumeMounts: - - name: data-volume - mountPath: /app/data - readOnly: true - - name: model-artifacts - mountPath: /app/models - - name: logs-volume - mountPath: /app/logs - - livenessProbe: - exec: - command: - - python - - -c - - "import src.modules.config; print('OK')" - initialDelaySeconds: 60 - periodSeconds: 30 - - readinessProbe: - exec: - command: - - python - - -c - - "import src.modules.config; print('OK')" - initialDelaySeconds: 30 - periodSeconds: 10 - - volumes: - - name: data-volume - configMap: - name: training-data - - name: model-artifacts - persistentVolumeClaim: - claimName: model-artifacts-pvc - - name: logs-volume - persistentVolumeClaim: - claimName: logs-pvc - ---- -### Dash Application Deployment -```yaml -# k8s/dash-app-deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: dash-app - labels: - app: dash-app -spec: - replicas: 3 - selector: - matchLabels: - app: dash-app - template: - metadata: - labels: - app: dash-app - spec: - containers: - - name: dash-app - image: personality-dash-app:latest - ports: - - containerPort: 8050 - - resources: - requests: - memory: "1Gi" - cpu: "200m" - limits: - memory: "4Gi" - cpu: "1000m" - - env: - - name: DASH_HOST - value: "0.0.0.0" - - name: DASH_PORT - value: "8050" - - name: MODEL_PATH - value: 
"/app/models" - - volumeMounts: - - name: model-artifacts - mountPath: /app/models - readOnly: true - - name: data-volume - mountPath: /app/data - readOnly: true - - livenessProbe: - httpGet: - path: / - port: 8050 - initialDelaySeconds: 30 - periodSeconds: 10 - - readinessProbe: - httpGet: - path: / - port: 8050 - initialDelaySeconds: 15 - periodSeconds: 5 - - volumes: - - name: model-artifacts - persistentVolumeClaim: - claimName: model-artifacts-pvc - - name: data-volume - configMap: - name: training-data -```` - -### Services and Ingress - -```yaml -# k8s/services.yaml -apiVersion: v1 -kind: Service -metadata: - name: dash-app-service -spec: - selector: - app: dash-app - ports: - - protocol: TCP - port: 80 - targetPort: 8050 - type: ClusterIP - ---- -apiVersion: v1 -kind: Service -metadata: - name: ml-pipeline-service -spec: - selector: - app: ml-pipeline - ports: - - protocol: TCP - port: 80 - targetPort: 8080 - type: ClusterIP - ---- -# k8s/ingress.yaml -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: personality-classifier-ingress - annotations: - nginx.ingress.kubernetes.io/rewrite-target: / - nginx.ingress.kubernetes.io/ssl-redirect: "true" - cert-manager.io/cluster-issuer: "letsencrypt-prod" -spec: - tls: - - hosts: - - personality.yourdomain.com - secretName: personality-tls - rules: - - host: personality.yourdomain.com - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: dash-app-service - port: - number: 80 - - path: /api - pathType: Prefix - backend: - service: - name: ml-pipeline-service - port: - number: 80 -``` - -### Persistent Storage - -```yaml -# k8s/storage.yaml -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: model-artifacts-pvc -spec: - accessModes: - - ReadWriteMany - resources: - requests: - storage: 10Gi - storageClassName: fast-ssd - ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: logs-pvc -spec: - accessModes: - - ReadWriteMany - resources: - requests: - 
storage: 5Gi - storageClassName: standard - ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: training-data -data: - # Add your training data files here - # or mount from external storage -``` - -### Deploy to Kubernetes - -```bash -# Build and push images to registry -docker build -f Dockerfile.pipeline -t your-registry/personality-ml-pipeline:latest . -docker build -f Dockerfile.dash -t your-registry/personality-dash-app:latest . - -docker push your-registry/personality-ml-pipeline:latest -docker push your-registry/personality-dash-app:latest - -# Create namespace -kubectl create namespace personality-classifier - -# Apply storage resources -kubectl apply -f k8s/storage.yaml -n personality-classifier - -# Apply deployments -kubectl apply -f k8s/ml-pipeline-deployment.yaml -n personality-classifier -kubectl apply -f k8s/dash-app-deployment.yaml -n personality-classifier - -# Apply services and ingress -kubectl apply -f k8s/services.yaml -n personality-classifier -kubectl apply -f k8s/ingress.yaml -n personality-classifier - -# Check deployment status -kubectl get all -n personality-classifier -kubectl get pvc -n personality-classifier - -# View logs -kubectl logs -f deployment/ml-pipeline -n personality-classifier -kubectl logs -f deployment/dash-app -n personality-classifier - -# Scale deployments -kubectl scale deployment dash-app --replicas=5 -n personality-classifier -kubectl scale deployment ml-pipeline --replicas=3 -n personality-classifier - -# Port forward for local access (development) -kubectl port-forward service/dash-app-service 8050:80 -n personality-classifier -``` - -## ๐Ÿ”ง Production Best Practices - -### Security Considerations - -```bash -# Use secrets for sensitive configuration -kubectl create secret generic model-secrets \ - --from-literal=api-key=your-api-key \ - --from-literal=db-password=your-password \ - -n personality-classifier - -# Apply security contexts -securityContext: - runAsNonRoot: true - runAsUser: 1000 - fsGroup: 2000 - 
capabilities: - drop: - - ALL -``` - -### Backup Strategy - -```bash -#!/bin/bash -# backup.sh - Automated backup script - -# Backup model artifacts -kubectl exec deployment/ml-pipeline -n personality-classifier -- \ - tar -czf /tmp/models-backup-$(date +%Y%m%d).tar.gz /app/models - -# Copy to persistent storage -kubectl cp personality-classifier/ml-pipeline-pod:/tmp/models-backup-$(date +%Y%m%d).tar.gz \ - ./backups/models-backup-$(date +%Y%m%d).tar.gz - -# Upload to cloud storage (optional) -aws s3 cp ./backups/models-backup-$(date +%Y%m%d).tar.gz \ - s3://your-backup-bucket/models/ - -# Rotate old backups (keep last 30 days) -find ./backups -name "models-backup-*.tar.gz" -mtime +30 -delete -``` - -### Health Checks and Monitoring - -```yaml -# k8s/monitoring.yaml -apiVersion: v1 -kind: Service -metadata: - name: prometheus-service -spec: - selector: - app: prometheus - ports: - - port: 9090 - targetPort: 9090 - ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app: prometheus - template: - metadata: - labels: - app: prometheus - spec: - containers: - - name: prometheus - image: prom/prometheus:latest - ports: - - containerPort: 9090 - volumeMounts: - - name: prometheus-config - mountPath: /etc/prometheus - volumes: - - name: prometheus-config - configMap: - name: prometheus-config -``` - -## ๐Ÿš€ Quick Deployment Commands - -### Local Development - -```bash -# Quick start with Docker Compose -git clone -cd Personality-classification - -# Build and start all services -docker-compose up --build -d - -# Access Dash application -open http://localhost:8050 - -# View logs -docker-compose logs -f dash-app -``` - -### Production Deployment - -```bash -# Build and push images -docker build -f Dockerfile.pipeline -t your-registry/ml-pipeline:v1.0 . -docker build -f Dockerfile.dash -t your-registry/dash-app:v1.0 . 
-docker push your-registry/ml-pipeline:v1.0 -docker push your-registry/dash-app:v1.0 - -# Deploy to Kubernetes -kubectl create namespace personality-classifier -kubectl apply -f k8s/ -n personality-classifier - -# Verify deployment -kubectl get all -n personality-classifier -kubectl logs -f deployment/dash-app -n personality-classifier -``` - -## ๐Ÿ“‹ Troubleshooting - -### Common Issues - -#### Container Memory Issues - -```bash -# Check memory usage -kubectl top pods -n personality-classifier - -# Increase memory limits in deployment -resources: - limits: - memory: "16Gi" # Increase from 8Gi -``` - -#### Model Loading Problems - -```bash -# Check persistent volumes -kubectl get pvc -n personality-classifier - -# Verify model artifacts -kubectl exec -it deployment/ml-pipeline -n personality-classifier -- ls -la /app/models -``` - -#### Dash Application Not Starting - -```bash -# Check logs -kubectl logs deployment/dash-app -n personality-classifier - -# Test local connectivity -kubectl port-forward service/dash-app-service 8050:80 -n personality-classifier -``` - -#### Network Connectivity Issues - -```bash -# Test service connectivity -kubectl exec -it deployment/dash-app -n personality-classifier -- \ - curl http://ml-pipeline-service - -# Check ingress status -kubectl get ingress -n personality-classifier -kubectl describe ingress personality-classifier-ingress -n personality-classifier -``` - ---- - -## ๐Ÿ“š Additional Resources - -- **Docker Documentation**: [docs.docker.com](https://docs.docker.com) -- **Kubernetes Documentation**: [kubernetes.io/docs](https://kubernetes.io/docs) -- **Dash Documentation**: [dash.plotly.com](https://dash.plotly.com) -- **Prometheus Monitoring**: [prometheus.io/docs](https://prometheus.io/docs) - ---- - -_This deployment guide focuses on containerized deployment with Docker and Kubernetes orchestration. 
For specific platform requirements or custom deployments, consult the platform documentation or create an issue in the repository._ diff --git a/docs/images/Dash_example1.png b/docs/images/Dash_example1.png deleted file mode 100644 index c721f52..0000000 Binary files a/docs/images/Dash_example1.png and /dev/null differ diff --git a/docs/images/Dash_example2.png b/docs/images/Dash_example2.png deleted file mode 100644 index 1d3db17..0000000 Binary files a/docs/images/Dash_example2.png and /dev/null differ diff --git a/docs/images/personality_classification_app.mp4 b/docs/images/personality_classification_app.mp4 new file mode 100644 index 0000000..af5c2ed Binary files /dev/null and b/docs/images/personality_classification_app.mp4 differ diff --git a/docs/mlops-infrastructure.md b/docs/mlops-infrastructure.md deleted file mode 100644 index 9425192..0000000 --- a/docs/mlops-infrastructure.md +++ /dev/null @@ -1,504 +0,0 @@ -# MLOps Infrastructure Documentation - -## Overview - -This document describes the comprehensive MLOps (Machine Learning Operations) infrastructure implemented for the personality classification project. The MLOps system provides end-to-end lifecycle management for machine learning models, from development to production deployment and monitoring. - -## Architecture - -### Components - -1. **Experiment Tracking** (`ExperimentTracker`) - - MLflow-based experiment tracking - - Parameter and metric logging - - Model artifacts management - - Experiment comparison and analysis - -2. **Model Registry** (`ModelRegistry`) - - Centralized model versioning - - Model stage management (Development, Staging, Production) - - Model lineage tracking - - Automated model promotion workflows - -3. **Data Validation** (`DataValidator`) - - Comprehensive data quality checks - - Data drift detection - - Schema validation - - Statistical profiling - -4. 
**Model Monitoring** (`ModelMonitor`) - - Real-time performance tracking - - Data drift detection - - Performance degradation alerts - - Prediction logging and analysis - -5. **Model Serving** (`ModelServer`) - - HTTP API for model inference - - Batch prediction support - - Model versioning in production - - Health checks and monitoring - -6. **MLOps Pipeline** (`MLOpsPipeline`) - - Integrated workflow orchestration - - End-to-end pipeline automation - - Cross-component coordination - -## Getting Started - -### Prerequisites - -```bash -# Install MLOps dependencies -pip install mlflow flask joblib - -# Or install with all dependencies -pip install -e ".[dev]" -``` - -### Basic Usage - -```python -from src.mlops import MLOpsPipeline - -# Initialize MLOps pipeline -mlops = MLOpsPipeline( - experiment_name="personality_classification", - model_name="personality_model" -) - -# Validate data -validation_results = mlops.validate_and_track_data(train_data, test_data) - -# Train and track model -training_results = mlops.train_and_track_model( - model=your_model, - X_train=X_train, - y_train=y_train, - X_test=X_test, - y_test=y_test, - model_params={"param1": "value1"}, - register_model=True -) - -# Promote model to production -mlops.promote_model(model_version="1", stage="Production") - -# Monitor production model -monitoring_results = mlops.monitor_production_model( - prediction_data=recent_predictions, - reference_data=reference_dataset -) -``` - -## Detailed Component Guide - -### 1. Experiment Tracking - -The `ExperimentTracker` provides comprehensive experiment management using MLflow. 
- -#### Key Features: -- **Parameter Logging**: Hyperparameters, model configurations -- **Metric Tracking**: Performance metrics, custom metrics -- **Artifact Storage**: Models, plots, datasets -- **Run Comparison**: Side-by-side experiment comparison - -#### Example Usage: -```python -tracker = ExperimentTracker("my_experiment") - -with tracker.start_run("model_training"): - # Log parameters - tracker.log_params({"learning_rate": 0.01, "batch_size": 32}) - - # Train model - model.fit(X_train, y_train) - - # Log metrics - tracker.log_metrics({"accuracy": 0.95, "f1_score": 0.93}) - - # Log model - tracker.log_model(model, "model") - - # Log confusion matrix - tracker.log_confusion_matrix(y_true, y_pred) -``` - -### 2. Model Registry - -The `ModelRegistry` manages model versions and deployment stages. - -#### Model Stages: -- **None**: Initial registration -- **Staging**: Testing and validation -- **Production**: Live deployment -- **Archived**: Deprecated models - -#### Example Usage: -```python -registry = ModelRegistry() - -# Register model -model_version = registry.register_model( - model_uri="runs:/run_id/model", - name="personality_model", - description="Random Forest classifier" -) - -# Promote to production -registry.promote_model("personality_model", "1", "Production") - -# Load production model -model = registry.load_model("personality_model", stage="Production") -``` - -### 3. Data Validation - -The `DataValidator` ensures data quality and consistency. 
- -#### Validation Checks: -- **Missing Data**: Null values, completeness -- **Data Types**: Schema consistency -- **Duplicates**: Row-level duplicates -- **Outliers**: Statistical outlier detection -- **Distributions**: Class balance, feature distributions -- **Data Drift**: Distribution changes over time - -#### Example Usage: -```python -validator = DataValidator() - -# Validate dataset -results = validator.validate_dataset(df, "train_data") - -# Check data quality score -score = validator.get_data_quality_score("train_data") - -# Validate train/test split -split_results = validator.validate_train_test_split( - X_train, X_test, y_train, y_test -) -``` - -### 4. Model Monitoring - -The `ModelMonitor` tracks model performance in production. - -#### Monitoring Capabilities: -- **Performance Metrics**: Accuracy, F1-score, precision, recall -- **Data Drift Detection**: Feature distribution changes -- **Prediction Logging**: Request/response tracking -- **Alerting**: Automatic issue detection -- **Dashboard Data**: Real-time monitoring metrics - -#### Example Usage: -```python -monitor = ModelMonitor("personality_model") - -# Log predictions -monitor.log_prediction( - prediction=pred, - features=input_features, - confidence=confidence_score, - actual=actual_value -) - -# Calculate performance metrics -metrics = monitor.calculate_performance_metrics(window_hours=24) - -# Detect data drift -drift_results = monitor.detect_data_drift(reference_data) -``` - -### 5. Model Serving - -The `ModelServer` provides an interactive Dash-based dashboard for model inference and monitoring. 
- -#### Dashboard Features: -- **๐Ÿ“Š Interactive Dashboard**: Modern web-based interface -- **๐Ÿ”ฎ Multiple Input Methods**: Manual forms, JSON input, file upload -- **๐Ÿ“ˆ Real-time Monitoring**: Live prediction history and statistics -- **๐ŸŽจ Beautiful UI**: Professional styling with confidence visualization -- **๐Ÿ”„ Auto-refresh**: Live updates of prediction history - -#### Example Usage: -```python -# Create interactive dashboard server -server = ModelServer( - model_name="personality_model", - model_stage="Production", - port=8050 -) - -# Run dashboard server -server.run() -# Access at http://localhost:8050 -``` - -#### Dashboard Components: -- **Model Status Cards**: Real-time model health and statistics -- **Prediction Interface**: Multiple input methods with validation -- **Results Visualization**: Confidence scores and probability distributions -- **History Table**: Searchable prediction history with timestamps - -#### API Examples: -```bash -# Health check -curl http://localhost:5000/health - -# Single prediction -curl -X POST http://localhost:5000/predict \ - -H "Content-Type: application/json" \ - -d '{"features": {"feature1": 1.0, "feature2": 2.0}}' - -# Batch prediction -curl -X POST http://localhost:5000/predict/batch \ - -H "Content-Type: application/json" \ - -d '{"instances": [{"feature1": 1.0}, {"feature1": 2.0}]}' -``` - -## Deployment Patterns - -### 1. Local Development -```python -# Run MLOps demo -python examples/mlops_demo.py - -# Start MLflow UI -mlflow ui - -# Start model server -python -m src.mlops.serving --model-name personality_model -``` - -### 2. Docker Deployment -```dockerfile -FROM python:3.11-slim - -COPY requirements.txt . -RUN pip install -r requirements.txt - -COPY src/ /app/src/ -WORKDIR /app - -EXPOSE 5000 -CMD ["python", "-m", "src.mlops.serving", "--model-name", "personality_model"] -``` - -### 3. 
Kubernetes Deployment -```yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: personality-model-server -spec: - replicas: 3 - selector: - matchLabels: - app: personality-model-server - template: - metadata: - labels: - app: personality-model-server - spec: - containers: - - name: model-server - image: personality-model:latest - ports: - - containerPort: 5000 - env: - - name: MLFLOW_TRACKING_URI - value: "http://mlflow-server:5000" -``` - -## CI/CD Integration - -### GitHub Actions Workflow -```yaml -name: MLOps Pipeline - -on: - push: - branches: [main] - pull_request: - branches: [main] - -jobs: - data-validation: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - name: Install dependencies - run: pip install -e ".[dev]" - - name: Validate data - run: python scripts/validate_data.py - - model-training: - needs: data-validation - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Train model - run: python scripts/train_model.py - - name: Register model - run: python scripts/register_model.py - - model-deployment: - needs: model-training - runs-on: ubuntu-latest - if: github.ref == 'refs/heads/main' - steps: - - name: Deploy to staging - run: python scripts/deploy_model.py --stage staging - - name: Run integration tests - run: python scripts/test_model_api.py - - name: Promote to production - run: python scripts/promote_model.py --stage production -``` - -## Monitoring and Alerting - -### Setting Up Alerts -```python -# Configure monitoring thresholds -monitor = ModelMonitor("personality_model") - -# Set up performance degradation alerts -baseline_metrics = {"accuracy": 0.85, "f1_score": 0.83} -degradation_results = monitor.detect_performance_degradation( - baseline_metrics, - degradation_threshold=0.05 # 5% degradation threshold -) - -# Set up data drift alerts -drift_results = monitor.detect_data_drift( - reference_data, - 
drift_threshold=0.1 # 10% drift threshold -) -``` - -### Dashboard Integration -```python -# Get dashboard data -dashboard_data = monitor.get_monitoring_dashboard_data(hours=24) - -# Generate monitoring report -report = monitor.generate_monitoring_report() -``` - -## Best Practices - -### 1. Experiment Organization -- Use descriptive experiment names -- Tag experiments with metadata -- Document parameter choices -- Compare similar experiments - -### 2. Model Versioning -- Semantic versioning for models -- Clear version descriptions -- Tag models with deployment info -- Maintain model lineage - -### 3. Data Quality -- Validate all data inputs -- Monitor for drift continuously -- Set quality thresholds -- Automate data checks - -### 4. Monitoring -- Log all predictions -- Track performance metrics -- Set up alerting thresholds -- Regular monitoring reviews - -### 5. Security -- Secure MLflow tracking server -- API authentication/authorization -- Data privacy compliance -- Audit trail maintenance - -## Troubleshooting - -### Common Issues - -1. **MLflow Connection Errors** - ```python - # Check MLflow server status - import mlflow - print(mlflow.get_tracking_uri()) - ``` - -2. **Model Loading Issues** - ```python - # Verify model exists - registry = ModelRegistry() - models = registry.list_models() - print([m.name for m in models]) - ``` - -3. **Data Validation Failures** - ```python - # Check validation details - validator = DataValidator() - results = validator.validate_dataset(df) - print(results['missing_data']) - ``` - -4. **Monitoring Data Issues** - ```python - # Check monitoring logs - monitor = ModelMonitor("model_name") - dashboard = monitor.get_monitoring_dashboard_data() - print(f"Total predictions: {dashboard['total_predictions']}") - ``` - -## Performance Optimization - -### 1. MLflow Optimization -- Use artifact stores (S3, Azure Blob) -- Configure database backend -- Enable model caching - -### 2. 
Serving Optimization -- Use model serialization (joblib, pickle) -- Implement request batching -- Add response caching - -### 3. Monitoring Optimization -- Aggregate metrics efficiently -- Use sampling for large volumes -- Implement data retention policies - -## Future Enhancements - -1. **Advanced Monitoring** - - A/B testing framework - - Feature importance tracking - - Bias detection and mitigation - -2. **Automated Workflows** - - Auto-retaining on drift - - Automated model selection - - Self-healing deployments - -3. **Integration Enhancements** - - Kubernetes operators - - Stream processing integration - - Multi-cloud deployment - -4. **Observability** - - Distributed tracing - - Custom metrics collection - - Performance profiling - -## Support and Resources - -- **Documentation**: See `/docs` directory -- **Examples**: See `/examples` directory -- **Issues**: GitHub Issues -- **MLflow Docs**: https://mlflow.org/docs/latest/ -- **Flask Docs**: https://flask.palletsprojects.com/ diff --git a/docs/mlops-integration-summary.md b/docs/mlops-integration-summary.md deleted file mode 100644 index 5fc9145..0000000 --- a/docs/mlops-integration-summary.md +++ /dev/null @@ -1,225 +0,0 @@ -# MLOps Integration for Six-Stack Personality Classification Pipeline - -## Overview - -The Six-Stack Personality Classification Pipeline has been enhanced with comprehensive MLOps infrastructure that seamlessly integrates with the existing modular architecture. This integration provides production-ready capabilities while maintaining backward compatibility. - -## Integration Features - -### ๐Ÿ”„ Backward Compatibility -- The pipeline works exactly as before when MLOps components are not available -- Graceful degradation: MLOps failures don't break the core pipeline -- Optional enable/disable flag for MLOps functionality - -### ๐Ÿ—๏ธ MLOps Components Integrated - -#### 1. 
**Experiment Tracking** (MLflow) -- Automatic experiment creation and run tracking -- Parameter logging (hyperparameters, configuration) -- Metrics logging (CV scores, ensemble weights, performance metrics) -- Artifact logging (models, predictions, metadata) - -#### 2. **Data Validation** -- Training and test data quality checks -- Schema validation and data drift detection -- Automated data profiling and anomaly detection -- Statistical validation of feature distributions - -#### 3. **Model Registry** -- Automatic model registration with versioning -- Model staging (Staging โ†’ Production) -- Model lineage tracking -- Easy model loading and deployment - -#### 4. **Model Monitoring** -- Prediction monitoring and drift detection -- Performance tracking over time -- Alert generation for model degradation -- Dashboard-ready metrics collection - -#### 5. **Serving Infrastructure** -- REST API for model inference -- Batch prediction capabilities -- Health checks and model reloading -- Scalable deployment ready - -## Usage - -### Basic Usage (No Changes Required) -```python -# Existing code works exactly the same -from src.main_modular import main - -if __name__ == "__main__": - main() -``` - -### With MLOps Enabled -```python -# MLOps is automatically enabled if components are available -# No code changes needed - everything is handled internally -from src.main_modular import main - -if __name__ == "__main__": - main() # Now includes MLOps tracking, validation, monitoring -``` - -### Customizing MLOps Behavior -```python -from src.main_modular import MLOpsIntegration - -# Create custom MLOps configuration -mlops = MLOpsIntegration(enable_mlops=True) - -# Use in your own workflows -mlops.start_experiment("custom_experiment") -mlops.log_parameters({"custom_param": "value"}) -mlops.log_metrics({"custom_metric": 0.95}) -mlops.end_experiment() -``` - -## Key Benefits - -### ๐Ÿš€ **Production Ready** -- **Experiment Tracking**: Full visibility into model training and 
performance -- **Reproducibility**: All parameters, metrics, and artifacts are tracked -- **Model Versioning**: Automatic versioning with promotion workflows -- **Monitoring**: Real-time performance and drift monitoring - -### ๐Ÿ”ง **Developer Friendly** -- **Zero Breaking Changes**: Existing code continues to work -- **Gradual Adoption**: Enable MLOps features incrementally -- **Error Handling**: Robust error handling prevents MLOps issues from breaking training -- **Logging**: Comprehensive logging for debugging and monitoring - -### ๐Ÿ“Š **Data Science Workflow** -- **Experiment Comparison**: Compare different runs and configurations -- **Model Selection**: Track which models perform best -- **Performance Tracking**: Monitor model performance over time -- **Data Quality**: Automated data validation and drift detection - -## Technical Implementation - -### Code Structure -``` -src/ -โ”œโ”€โ”€ main_modular.py # Enhanced with MLOpsIntegration class -โ”œโ”€โ”€ mlops/ # MLOps infrastructure -โ”‚ โ”œโ”€โ”€ __init__.py -โ”‚ โ”œโ”€โ”€ experiment_tracking.py -โ”‚ โ”œโ”€โ”€ data_validation.py -โ”‚ โ”œโ”€โ”€ model_registry.py -โ”‚ โ”œโ”€โ”€ monitoring.py -โ”‚ โ”œโ”€โ”€ serving.py -โ”‚ โ””โ”€โ”€ pipeline.py -โ””โ”€โ”€ modules/ - โ”œโ”€โ”€ config.py # Enhanced with MLOps config - โ””โ”€โ”€ ... # Existing modules unchanged -``` - -### Integration Points - -1. **Data Loading**: Automatic data validation after loading -2. **Training**: Experiment tracking throughout the training process -3. **Model Building**: Parameter and metric logging for each stack -4. **Ensemble**: Ensemble weights and performance tracking -5. 
**Prediction**: Model registration and monitoring setup - -### Error Handling Strategy -- **Graceful Degradation**: MLOps failures log warnings but don't stop training -- **Optional Dependencies**: Pipeline works without MLOps dependencies -- **Comprehensive Logging**: All MLOps operations are logged for debugging - -## Configuration - -### Environment Variables -```bash -# MLflow Configuration -export MLFLOW_TRACKING_URI="sqlite:///mlflow.db" -export MLFLOW_EXPERIMENT_NAME="six_stack_personality" - -# Model Registry -export MODEL_REGISTRY_NAME="six_stack_ensemble" -``` - -### Config Options -```python -# In modules/config.py -ENABLE_MLOPS = True -MLFLOW_TRACKING_URI = "sqlite:///mlflow.db" -MLFLOW_EXPERIMENT_NAME = "six_stack_personality" -MODEL_REGISTRY_NAME = "six_stack_ensemble" -``` - -## Monitoring and Observability - -### Metrics Tracked -- **Training Metrics**: CV scores for each stack, ensemble performance -- **Data Metrics**: Data quality scores, drift detection results -- **Model Metrics**: Registration success, version numbers -- **Pipeline Metrics**: Execution time, success/failure rates - -### Dashboards Available -- **Experiment Tracking**: MLflow UI for experiment comparison -- **Model Performance**: Real-time performance monitoring -- **Data Quality**: Data drift and quality dashboards -- **System Health**: Pipeline execution and error monitoring - -## Deployment - -### Local Development -```bash -# Start MLflow UI -mlflow ui --backend-store-uri sqlite:///mlflow.db - -# Run pipeline with MLOps -python src/main_modular.py -``` - -### Production Deployment -```bash -# Set up MLflow tracking server -mlflow server --backend-store-uri postgresql://user:pass@host/db \ - --default-artifact-root s3://mlflow-artifacts/ - -# Deploy model serving API -python -m mlops.serving --model-name six_stack_ensemble --port 8080 -``` - -## Testing - -```bash -# Test MLOps integration -python test_mlops_integration.py - -# Test individual components -python -m pytest 
src/mlops/tests/ -``` - -## Future Enhancements - -### Planned Features -- **A/B Testing**: Framework for model A/B testing -- **Auto-retraining**: Triggered retraining based on drift detection -- **Multi-environment**: Support for dev/staging/prod environments -- **Advanced Monitoring**: More sophisticated performance metrics -- **CI/CD Integration**: Automated model validation and deployment - -### Extension Points -- **Custom Validators**: Easy to add domain-specific data validators -- **Custom Metrics**: Framework for custom monitoring metrics -- **Plugin Architecture**: Support for different MLOps backends -- **Integration APIs**: Easy integration with other ML platforms - -## Summary - -The MLOps integration transforms the Six-Stack Personality Classification Pipeline into a production-ready, enterprise-grade machine learning system while maintaining the simplicity and modularity of the original design. The integration provides: - -โœ… **Complete MLOps Infrastructure** -โœ… **Zero Breaking Changes** -โœ… **Production Ready** -โœ… **Comprehensive Monitoring** -โœ… **Easy Deployment** -โœ… **Excellent Documentation** - -This implementation demonstrates advanced MLOps skills and provides a solid foundation for scaling machine learning operations in production environments. diff --git a/docs/performance-tuning.md b/docs/performance-tuning.md index aa3ce58..2558a98 100644 --- a/docs/performance-tuning.md +++ b/docs/performance-tuning.md @@ -1,60 +1,24 @@ # Performance Tuning Guide -## Overview +## Performance Tuning Guide -This guide provides comprehensive strategies for optimizing the Six-Stack Personality Classification Pipeline performance across different dimensions: speed, memory usage, accuracy, and resource utilization. - -## Performance Dimensions - -### 1. Training Speed - -- Hyperparameter optimization trials -- Data augmentation complexity -- Threading configuration -- Model complexity - -### 2. 
Memory Efficiency - -- Dataset size management -- Model memory footprint -- Parallel processing overhead -- Synthetic data generation - -### 3. Prediction Accuracy - -- Ensemble optimization -- Cross-validation strategy -- Feature engineering -- Model diversity - -### 4. Resource Utilization - -- CPU core usage -- Memory allocation -- I/O optimization -- Caching strategies - -## Speed Optimization - -### Quick Development Setup +### Key Levers +- Training speed: TESTING_MODE, N_TRIALS_STACK, N_TRIALS_BLEND +- Memory: TESTING_SAMPLE_SIZE, ENABLE_DATA_AUGMENTATION +- Accuracy: Ensemble optimization, feature engineering +- Resource: N_JOBS, THREAD_COUNT +### Recommended Settings +**Fast Dev:** ```python -# Ultra-fast configuration for development iteration TESTING_MODE = True TESTING_SAMPLE_SIZE = 500 N_TRIALS_STACK = 3 N_TRIALS_BLEND = 20 ENABLE_DATA_AUGMENTATION = False -SHOW_PROGRESS_BARS = False - -# Expected time: 2-3 minutes -# Accuracy trade-off: 2-3% lower than production ``` - -### Balanced Development Setup - +**Balanced:** ```python -# Moderate speed with reasonable accuracy TESTING_MODE = True TESTING_SAMPLE_SIZE = 1000 N_TRIALS_STACK = 10 diff --git a/docs/pre-commit-guide.md b/docs/pre-commit-guide.md deleted file mode 100644 index 7a89f09..0000000 --- a/docs/pre-commit-guide.md +++ /dev/null @@ -1,162 +0,0 @@ -# Pre-commit Setup Guide - -This project uses [pre-commit](https://pre-commit.com/) to ensure code quality and consistency. - -## Installation - -Pre-commit is automatically installed when you run: - -```bash -make setup-env -# or -make install-dev -``` - -To manually install pre-commit hooks: - -```bash -make pre-commit-install -# or -uv run pre-commit install -``` - -## Usage - -### Automatic (Recommended) - -Pre-commit will automatically run on every `git commit`. If any checks fail, the commit will be blocked until issues are fixed. 
- -### Manual Execution - -Run on staged files only: - -```bash -make pre-commit-run -# or -uv run pre-commit run -``` - -Run on all files: - -```bash -make pre-commit-all -# or -uv run pre-commit run --all-files -``` - -## Configured Hooks - -### Code Formatting - -- **Black**: Python code formatter -- **isort**: Import sorting -- **Ruff**: Fast Python linter and formatter -- **Prettier**: Markdown, YAML, JSON formatting - -### Code Quality - -- **Ruff**: Comprehensive Python linting -- **Bandit**: Security vulnerability scanner -- **MyPy**: Static type checking (optional) - -### Documentation - -- **Pydocstyle**: Docstring style checking (Google convention) - -### General - -- **Trailing whitespace removal** -- **End-of-file fixing** -- **Large file detection** -- **Merge conflict detection** -- **YAML/TOML/JSON validation** - -### Jupyter Notebooks - -- **nbstripout**: Remove notebook outputs -- **nbqa**: Apply formatters to notebooks - -## Configuration - -Pre-commit configuration is in `.pre-commit-config.yaml`. - -Tool-specific configurations are in `pyproject.toml`: - -- `[tool.black]` -- `[tool.isort]` -- `[tool.ruff]` -- `[tool.bandit]` -- `[tool.pydocstyle]` -- `[tool.mypy]` - -## Bypassing Hooks - -In emergency situations, you can bypass pre-commit: - -```bash -git commit --no-verify -m "Emergency fix" -``` - -**Note**: This should be used sparingly and issues should be fixed in follow-up commits. 
- -## Troubleshooting - -### Hook Installation Issues - -```bash -# Reinstall hooks -uv run pre-commit clean -uv run pre-commit install -``` - -### Update Hook Versions - -```bash -uv run pre-commit autoupdate -``` - -### Skip Specific Hooks - -```bash -SKIP=mypy git commit -m "Skip MyPy for this commit" -``` - -## IDE Integration - -Most IDEs can be configured to run these tools automatically: - -### VS Code - -Install extensions: - -- Python -- Black Formatter -- isort -- Ruff -- Prettier - -### PyCharm - -Enable: - -- Black integration -- isort integration -- Pre-commit integration plugin - -## Makefile Targets - -The following Makefile targets are available for code quality: - -```bash -make format # Format code with ruff -make lint # Lint code with ruff -make check # Run linting and formatting checks -make fix # Auto-fix issues -make pre-commit-install # Install pre-commit hooks -make pre-commit-run # Run on staged files -make pre-commit-all # Run on all files -``` - -## CI/CD Integration - -Pre-commit runs automatically on GitHub Actions and other CI platforms. Some intensive hooks (like MyPy) are skipped on CI for performance. 
diff --git a/docs/technical-guide.md b/docs/technical-guide.md index bd59c4c..ee00f70 100644 --- a/docs/technical-guide.md +++ b/docs/technical-guide.md @@ -10,10 +10,6 @@ This document provides a deep technical dive into the Six-Stack Personality Clas The pipeline follows **SOLID principles** and **separation of concerns**: -- **Single Responsibility**: Each module has one clear purpose -- **Open/Closed**: Easy to extend without modifying existing code -- **Dependency Inversion**: High-level modules don't depend on low-level details -- **Interface Segregation**: Clean, focused interfaces between modules ### Core Architecture Pattern @@ -42,10 +38,6 @@ Each stack is designed to capture different aspects of the data: #### Stack A: Gradient Boosting Core (Narrow) -- **Purpose**: Stable baseline with conservative hyperparameters -- **Models**: XGBoost, LightGBM, CatBoost -- **Meta-learner**: Adaptive (Logistic Regression, Ridge, or XGBoost) -- **Search Space**: Conservative ranges (500-1000 estimators) ```python # Example hyperparameter ranges for Stack A @@ -58,6 +50,25 @@ xgb_params = { } ``` +# Technical Guide + +## Philosophy +- Modular, SOLID design +- Separation of concerns: data, processing, models, config, ensemble, utils + +## Stacks +- A: Stable baseline (XGBoost, LightGBM, CatBoost) +- B: Wide search +- C: XGBoost/CatBoost specialists +- D: Sklearn ensemble +- E: Neural networks +- F: Noise-robust + +## Advanced Features +- Optuna hyperparameter optimization +- SDV Copula data augmentation +- Out-of-fold ensemble blending + #### Stack B: Gradient Boosting Core (Wide) - **Purpose**: Broader exploration of hyperparameter space diff --git a/pre-commit-output.txt b/pre-commit-output.txt deleted file mode 100644 index 67cd458..0000000 --- a/pre-commit-output.txt +++ /dev/null @@ -1,242 +0,0 @@ - Building personality-classification @ file:///Users/jv/Documents/GitHub/Personality-classification - Built personality-classification @ 
file:///Users/jv/Documents/GitHub/Personality-classification -Uninstalled 1 package in 0.99ms -Installed 1 package in 2ms -trim trailing whitespace.................................................Passed -fix end of files.........................................................Passed -check yaml...........................................(no files to check)Skipped -check toml...............................................................Passed -check json...........................................(no files to check)Skipped -check for added large files..............................................Passed -check for case conflicts.................................................Passed -check for merge conflicts................................................Passed -debug statements (python)................................................Passed -check docstring is first.................................................Passed -check that executables have shebangs.................(no files to check)Skipped -check that scripts with shebangs are executable..........................Failed -- hook id: check-shebang-scripts-are-executable -- exit code: 1 - -src/six_stack_personality_classifier.py: has a shebang but is not marked executable! - If it is supposed to be executable, try: `chmod +x src/six_stack_personality_classifier.py` - If on Windows, you may also need to: `git add --chmod=+x src/six_stack_personality_classifier.py` - If it not supposed to be executable, double-check its shebang is wanted. - -src/main_modular.py: has a shebang but is not marked executable! - If it is supposed to be executable, try: `chmod +x src/main_modular.py` - If on Windows, you may also need to: `git add --chmod=+x src/main_modular.py` - If it not supposed to be executable, double-check its shebang is wanted. 
- -mixed line ending........................................................Passed -fix utf-8 byte order marker..............................................Passed -black....................................................................Passed -isort....................................................................Failed -- hook id: isort -- files were modified by this hook - -Fixing /Users/jv/Documents/GitHub/Personality-classification/src/modules/config.py -Fixing /Users/jv/Documents/GitHub/Personality-classification/src/modules/data_augmentation.py -Fixing /Users/jv/Documents/GitHub/Personality-classification/src/modules/data_loader.py -Fixing /Users/jv/Documents/GitHub/Personality-classification/src/modules/ensemble.py -Fixing /Users/jv/Documents/GitHub/Personality-classification/src/modules/model_builders.py -Fixing /Users/jv/Documents/GitHub/Personality-classification/src/modules/optimization.py -Fixing /Users/jv/Documents/GitHub/Personality-classification/src/modules/preprocessing.py -Fixing /Users/jv/Documents/GitHub/Personality-classification/src/six_stack_personality_classifier.py - -ruff.....................................................................Failed -- hook id: ruff -- exit code: 1 - -src/modules/data_augmentation.py:431:5: PLR0912 Too many branches (18 > 12) -src/modules/data_augmentation.py:609:5: PLR0912 Too many branches (16 > 12) -src/modules/optimization.py:109:13: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling -src/modules/optimization.py:151:13: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling -src/modules/optimization.py:193:13: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... 
from None` to distinguish them from errors in exception handling -src/modules/optimization.py:235:13: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling -src/modules/optimization.py:282:13: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling -src/modules/preprocessing.py:16:5: PLR0912 Too many branches (20 > 12) -src/modules/preprocessing.py:320:5: PLR0912 Too many branches (19 > 12) -src/six_stack_personality_classifier.py:231:5: PLR0912 Too many branches (18 > 12) -src/six_stack_personality_classifier.py:530:5: PLR0912 Too many branches (20 > 12) -src/six_stack_personality_classifier.py:840:5: PLR0912 Too many branches (19 > 12) -src/six_stack_personality_classifier.py:1668:13: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling -src/six_stack_personality_classifier.py:1710:13: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling -src/six_stack_personality_classifier.py:1752:13: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling -src/six_stack_personality_classifier.py:1794:13: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling -src/six_stack_personality_classifier.py:1841:13: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling -Found 17 errors. 
- -ruff-format..............................................................Passed -bandit...................................................................Passed -pydocstyle...............................................................Failed -- hook id: pydocstyle -- exit code: 1 - -src/modules/data_loader.py:1 at module level: - D200: One-line docstring should fit on one line with quotes (found 3) -src/modules/data_loader.py:1 at module level: - D212: Multi-line docstring summary should start at the first line -src/modules/data_loader.py:13 in public function `load_data_with_external_merge`: - D205: 1 blank line required between summary line and description (found 0) -src/modules/data_loader.py:13 in public function `load_data_with_external_merge`: - D212: Multi-line docstring summary should start at the first line -src/main_modular.py:2 at module level: - D205: 1 blank line required between summary line and description (found 0) -src/main_modular.py:2 at module level: - D212: Multi-line docstring summary should start at the first line -src/main_modular.py:2 at module level: - D415: First line should end with a period, question mark, or exclamation point (not ')') -src/modules/model_builders.py:1 at module level: - D205: 1 blank line required between summary line and description (found 0) -src/modules/model_builders.py:1 at module level: - D212: Multi-line docstring summary should start at the first line -src/modules/model_builders.py:32 in public function `build_stack`: - D202: No blank lines allowed after function docstring (found 1) -src/modules/model_builders.py:32 in public function `build_stack`: - D415: First line should end with a period, question mark, or exclamation point (not 'y') -src/modules/model_builders.py:177 in public function `build_stack_c`: - D202: No blank lines allowed after function docstring (found 1) -src/modules/__init__.py:1 at module level: - D212: Multi-line docstring summary should start at the first line 
-src/modules/__init__.py:1 at module level: - D415: First line should end with a period, question mark, or exclamation point (not 's') -src/modules/preprocessing.py:1 at module level: - D200: One-line docstring should fit on one line with quotes (found 3) -src/modules/preprocessing.py:1 at module level: - D212: Multi-line docstring summary should start at the first line -src/modules/preprocessing.py:19 in public function `prep`: - D212: Multi-line docstring summary should start at the first line -src/modules/preprocessing.py:71 in private nested function `fill_missing_by_quantile_group`: - D415: First line should end with a period, question mark, or exclamation point (not ')') -src/modules/preprocessing.py:226 in public function `add_pseudo_labeling_conservative`: - D212: Multi-line docstring summary should start at the first line -src/modules/preprocessing.py:328 in public function `create_domain_balanced_dataset`: - D205: 1 blank line required between summary line and description (found 0) -src/modules/preprocessing.py:328 in public function `create_domain_balanced_dataset`: - D212: Multi-line docstring summary should start at the first line -src/six_stack_personality_classifier.py:2 at module level: - D205: 1 blank line required between summary line and description (found 0) -src/six_stack_personality_classifier.py:2 at module level: - D212: Multi-line docstring summary should start at the first line -src/six_stack_personality_classifier.py:2 at module level: - D415: First line should end with a period, question mark, or exclamation point (not 't') -src/six_stack_personality_classifier.py:93 in public function `load_data_with_external_merge`: - D205: 1 blank line required between summary line and description (found 0) -src/six_stack_personality_classifier.py:93 in public function `load_data_with_external_merge`: - D212: Multi-line docstring summary should start at the first line -src/six_stack_personality_classifier.py:168 in public function 
`simple_mixed_augmentation`: - D202: No blank lines allowed after function docstring (found 1) -src/six_stack_personality_classifier.py:168 in public function `simple_mixed_augmentation`: - D415: First line should end with a period, question mark, or exclamation point (not 's') -src/six_stack_personality_classifier.py:232 in public function `sdv_augmentation`: - D200: One-line docstring should fit on one line with quotes (found 3) -src/six_stack_personality_classifier.py:232 in public function `sdv_augmentation`: - D212: Multi-line docstring summary should start at the first line -src/six_stack_personality_classifier.py:232 in public function `sdv_augmentation`: - D415: First line should end with a period, question mark, or exclamation point (not 'g') -src/six_stack_personality_classifier.py:368 in public function `smotenc_augmentation`: - D415: First line should end with a period, question mark, or exclamation point (not 'a') -src/six_stack_personality_classifier.py:420 in public function `apply_data_augmentation`: - D415: First line should end with a period, question mark, or exclamation point (not 'd') -src/six_stack_personality_classifier.py:465 in public function `augment_data_conservative`: - D205: 1 blank line required between summary line and description (found 0) -src/six_stack_personality_classifier.py:465 in public function `augment_data_conservative`: - D212: Multi-line docstring summary should start at the first line -src/six_stack_personality_classifier.py:533 in public function `prep`: - D200: One-line docstring should fit on one line with quotes (found 3) -src/six_stack_personality_classifier.py:533 in public function `prep`: - D212: Multi-line docstring summary should start at the first line -src/six_stack_personality_classifier.py:576 in private nested function `fill_missing_by_quantile_group`: - D415: First line should end with a period, question mark, or exclamation point (not ')') -src/six_stack_personality_classifier.py:713 in public function 
`add_label_noise`: - D200: One-line docstring should fit on one line with quotes (found 3) -src/six_stack_personality_classifier.py:713 in public function `add_label_noise`: - D212: Multi-line docstring summary should start at the first line -src/six_stack_personality_classifier.py:746 in public function `add_pseudo_labeling_conservative`: - D212: Multi-line docstring summary should start at the first line -src/six_stack_personality_classifier.py:848 in public function `create_domain_balanced_dataset`: - D205: 1 blank line required between summary line and description (found 0) -src/six_stack_personality_classifier.py:848 in public function `create_domain_balanced_dataset`: - D212: Multi-line docstring summary should start at the first line -src/six_stack_personality_classifier.py:1049 in public function `build_stack`: - D202: No blank lines allowed after function docstring (found 1) -src/six_stack_personality_classifier.py:1049 in public function `build_stack`: - D415: First line should end with a period, question mark, or exclamation point (not 'n') -src/six_stack_personality_classifier.py:1953 in public function `main`: - D415: First line should end with a period, question mark, or exclamation point (not 'n') -src/modules/config.py:1 at module level: - D200: One-line docstring should fit on one line with quotes (found 3) -src/modules/config.py:1 at module level: - D212: Multi-line docstring summary should start at the first line -src/modules/ensemble.py:1 at module level: - D200: One-line docstring should fit on one line with quotes (found 3) -src/modules/ensemble.py:1 at module level: - D212: Multi-line docstring summary should start at the first line -src/modules/data_augmentation.py:1 at module level: - D205: 1 blank line required between summary line and description (found 0) -src/modules/data_augmentation.py:1 at module level: - D212: Multi-line docstring summary should start at the first line -src/modules/data_augmentation.py:368 in public function 
`simple_mixed_augmentation`: - D202: No blank lines allowed after function docstring (found 1) -src/modules/data_augmentation.py:612 in public function `apply_data_augmentation`: - D202: No blank lines allowed after function docstring (found 1) -src/modules/optimization.py:1 at module level: - D200: One-line docstring should fit on one line with quotes (found 3) -src/modules/optimization.py:1 at module level: - D212: Multi-line docstring summary should start at the first line -src/modules/optimization.py:58 in public function `add_label_noise`: - D200: One-line docstring should fit on one line with quotes (found 3) -src/modules/optimization.py:58 in public function `add_label_noise`: - D212: Multi-line docstring summary should start at the first line -src/modules/utils.py:1 at module level: - D200: One-line docstring should fit on one line with quotes (found 3) -src/modules/utils.py:1 at module level: - D212: Multi-line docstring summary should start at the first line -src/modules/utils.py:11 in public function `add_label_noise`: - D212: Multi-line docstring summary should start at the first line - -mypy.....................................................................Failed -- hook id: mypy -- exit code: 1 - -src/main_modular.py:77: error: Function "builtins.callable" is not valid as a type [valid-type] -src/main_modular.py:77: note: Perhaps you meant "typing.Callable" instead of "callable"? -src/main_modular.py:249: error: Function "builtins.callable" is not valid as a type [valid-type] -src/main_modular.py:249: note: Perhaps you meant "typing.Callable" instead of "callable"? -src/main_modular.py:272: error: Function "builtins.callable" is not valid as a type [valid-type] -src/main_modular.py:272: note: Perhaps you meant "typing.Callable" instead of "callable"? -src/main_modular.py:355: error: Function "builtins.callable" is not valid as a type [valid-type] -src/main_modular.py:355: note: Perhaps you meant "typing.Callable" instead of "callable"? 
-src/main_modular.py:364: error: callable? not callable [misc] -src/main_modular.py:372: error: callable? not callable [misc] -src/main_modular.py:388: error: "object" has no attribute "inverse_transform" [attr-defined] -src/main_modular.py:401: error: Function "builtins.callable" is not valid as a type [valid-type] -src/main_modular.py:401: note: Perhaps you meant "typing.Callable" instead of "callable"? -src/main_modular.py:419: error: callable? not callable [misc] -src/main_modular.py:427: error: callable? not callable [misc] -Found 10 errors in 1 file (checked 11 source files) - -nbqa-black...............................................................Passed -nbqa-isort...............................................................Passed -nbqa-ruff................................................................Passed -nbstripout...........................................(no files to check)Skipped -prettier.................................................................Failed -- hook id: prettier -- files were modified by this hook - -.github/AUTHORS.md -.github/ISSUE_TEMPLATE/issue_template.md -docs/configuration.md -docs/deployment.md -.github/CONTRIBUTORS.md -README.md -docs/api-reference.md -docs/technical-guide.md -.github/pull_request_template.md -docs/README.md -docs/performance-tuning.md -src/modules/README.md -docs/data-augmentation.md - -CRLF end-lines remover...................................................Passed -Tabs remover.............................................................Passed diff --git a/pyproject.toml b/pyproject.toml index 936f90f..acbfb3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,27 +24,23 @@ dependencies = [ "numpy>=1.24.0,<2.0.0", "pandas>=2.0.0,<3.0.0", "scikit-learn>=1.3.0,<1.6.0", - # Advanced ML models (gradient boosting) "catboost>=1.2.0,<2.0.0", "lightgbm>=4.0.0,<5.0.0", "xgboost>=2.0.0,<3.0.0", - # Statistical computing and preprocessing "scipy>=1.11.0,<2.0.0", - "imbalanced-learn>=0.11.0,<1.0.0", # For 
SMOTE data augmentation - + "imbalanced-learn>=0.11.0,<1.0.0", # For SMOTE data augmentation # Hyperparameter optimization "optuna>=3.4.0,<4.0.0", - # Data augmentation and synthetic data generation - "sdv>=1.24.0,<2.0.0", # For advanced synthetic data - + "sdv>=1.24.0,<2.0.0", # For advanced synthetic data # Model serialization and utilities "joblib>=1.3.0,<2.0.0", - # Web application framework "dash>=2.14.0,<3.0.0", + "dash-bootstrap-components>=1.7.1", + "plotly>=5.24.1", ] [project.optional-dependencies] @@ -172,3 +168,20 @@ skips = ["B101", "B601"] # Skip assert_used and shell_injection in paramiko [tool.pydocstyle] convention = "google" add-ignore = ["D100", "D104", "D105"] # Allow missing docstrings for modules, packages, magic methods + +# MyPy configuration (type checking) +[tool.mypy] +python_version = "3.11" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = false +ignore_missing_imports = true +no_strict_optional = true +explicit_package_bases = true +namespace_packages = true +exclude = [ + "tests/", + "scripts/", + "examples/", + "catboost_info/", +] diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..c84f2e4 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1 @@ +"""Main source package for personality classification pipeline.""" diff --git a/src/modules/config.py b/src/modules/config.py index 1406b76..b023117 100644 --- a/src/modules/config.py +++ b/src/modules/config.py @@ -23,7 +23,7 @@ class Paths(Enum): TRAIN_CSV = DATA_DIR / "train.csv" TEST_CSV = DATA_DIR / "test.csv" SAMPLE_SUBMISSION_CSV = DATA_DIR / "sample_submission.csv" - PERSONALITY_DATASET_CSV = DATA_DIR / "personality_dataset.csv" + PERSONALITY_DATASET_CSV = DATA_DIR / "personality_datasert.csv" # Log files PERSONALITY_CLASSIFIER_LOG = BASE_DIR / "personality_classifier.log" diff --git a/src/modules/data_loader.py b/src/modules/data_loader.py index f2e2a74..07aba89 100644 --- a/src/modules/data_loader.py +++ 
b/src/modules/data_loader.py @@ -2,6 +2,7 @@ import pandas as pd +from .config import Paths from .utils import get_logger logger = get_logger(__name__) @@ -17,17 +18,18 @@ def load_data_with_external_merge(): """ logger.info("๐Ÿ“Š Loading data with advanced merge strategy...") - # Load original datasets - df_tr = pd.read_csv("./data/train.csv") - df_te = pd.read_csv("./data/test.csv") - submission = pd.read_csv("./data/sample_submission.csv") + # Use Paths enum from config.py for all file paths + df_tr = pd.read_csv(Paths.TRAIN_CSV.value) + df_te = pd.read_csv(Paths.TEST_CSV.value) + submission = pd.read_csv(Paths.SAMPLE_SUBMISSION_CSV.value) logger.info(f"Original train shape: {df_tr.shape}") logger.info(f"Original test shape: {df_te.shape}") # Load external dataset using advanced merge strategy + try: - df_external = pd.read_csv("./data/personality_datasert.csv") + df_external = pd.read_csv(Paths.PERSONALITY_DATASET_CSV.value) logger.info(f"External dataset shape: {df_external.shape}") # Rename Personality column to match_p for clarity @@ -52,7 +54,6 @@ def load_data_with_external_merge(): logger.info(f"External dataset shape after deduplication: {df_external.shape}") # Merge with training and test data to create match_p feature - # This adds the match_p column as a new feature df_tr = df_tr.merge(df_external, how="left", on=merge_cols) df_te = df_te.merge(df_external, how="left", on=merge_cols) diff --git a/tests/conftest.py b/tests/conftest.py index d73635b..a00fd36 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,11 +22,13 @@ # Import Dash app components for testing try: - from dash_app.src import PersonalityClassifierApp + from dash_app.dashboard.app import PersonalityClassifierApp + from dash_app.dashboard.model_loader import ModelLoader DASH_AVAILABLE = True except ImportError: PersonalityClassifierApp = None + ModelLoader = None DASH_AVAILABLE = False @@ -187,6 +189,40 @@ def mock_environment_variables(): os.environ.update(original_env) 
+@pytest.fixture +def mock_model_file(temp_dir): + """Create a mock model file for testing.""" + model_file = temp_dir / "test_model.pkl" + model_file.write_text("mock_model_data") + return str(model_file) + + +@pytest.fixture +def sample_prediction_data(): + """Sample input data for dashboard predictions.""" + return { + "time_alone": 3.0, + "social_events": 2.0, + "going_outside": 4.0, + "friends_size": 3.0, + "post_freq": 2.0, + "stage_fear": 1.0, + "drained_social": 2.0, + } + + +@pytest.fixture +def sample_prediction_probabilities(): + """Sample prediction probabilities for testing.""" + return { + "Extroversion": 0.8, + "Agreeableness": 0.6, + "Conscientiousness": 0.7, + "Neuroticism": 0.4, + "Openness": 0.9, + } + + # Custom assertions for ML testing def assert_model_performance(y_true, y_pred, min_accuracy: float = 0.5): """Assert that model performance meets minimum requirements.""" diff --git a/tests/dash_app/test_callbacks.py b/tests/dash_app/test_callbacks.py new file mode 100644 index 0000000..eaf2974 --- /dev/null +++ b/tests/dash_app/test_callbacks.py @@ -0,0 +1,237 @@ +"""Tests for dashboard callback functions.""" + +from unittest.mock import MagicMock, patch + +import pytest +from dash import Dash + +from dash_app.dashboard.callbacks import register_callbacks + + +class TestCallbackRegistration: + """Test suite for callback registration.""" + + def test_register_callbacks_success(self): + """Test successful callback registration.""" + # Create mock objects + mock_app = MagicMock(spec=Dash) + mock_model_loader = MagicMock() + prediction_history = [] + + # Should not raise any exceptions + register_callbacks(mock_app, mock_model_loader, prediction_history) + + # Verify that callbacks were registered (app.callback should be called) + assert mock_app.callback.called + + def test_register_callbacks_with_history(self): + """Test callback registration with existing prediction history.""" + mock_app = MagicMock(spec=Dash) + mock_model_loader = MagicMock() 
+ prediction_history = [ + {"timestamp": "2025-01-15", "prediction": {"Extroversion": 0.8}} + ] + + register_callbacks(mock_app, mock_model_loader, prediction_history) + assert mock_app.callback.called + + +class TestPredictionCallback: + """Test suite for prediction callback functionality.""" + + @pytest.fixture + def mock_setup(self): + """Set up mocks for testing prediction callback.""" + mock_app = MagicMock(spec=Dash) + mock_model_loader = MagicMock() + prediction_history = [] + + # Configure mock model loader + mock_model_loader.predict.return_value = { + "Extroversion": 0.8, + "Agreeableness": 0.6, + "Conscientiousness": 0.7, + "Neuroticism": 0.4, + "Openness": 0.9, + } + + return mock_app, mock_model_loader, prediction_history + + def test_prediction_callback_registration(self, mock_setup): + """Test that prediction callback is properly registered.""" + mock_app, mock_model_loader, prediction_history = mock_setup + + register_callbacks(mock_app, mock_model_loader, prediction_history) + + # Verify callback was registered + assert mock_app.callback.called + # Should have at least one callback call for the prediction + assert mock_app.callback.call_count >= 1 + + def test_prediction_with_valid_inputs(self, mock_setup): + """Test prediction callback with valid input values.""" + mock_app, mock_model_loader, prediction_history = mock_setup + + # Register callbacks + register_callbacks(mock_app, mock_model_loader, prediction_history) + + # Get the registered callback function + callback_calls = mock_app.callback.call_args_list + assert len(callback_calls) > 0 + + # Find the prediction callback (it should be the one with most State parameters) + prediction_callback = None + for call in callback_calls: + args, kwargs = call + if len(args) >= 2: # Output, Input, State... 
+ prediction_callback = args + break + + assert prediction_callback is not None + + def test_model_loader_integration(self, mock_setup): + """Test integration with model loader.""" + mock_app, mock_model_loader, prediction_history = mock_setup + + register_callbacks(mock_app, mock_model_loader, prediction_history) + + # Verify model_loader is passed to the callback registration + assert mock_app.callback.called + + +class TestCallbackErrorHandling: + """Test error handling in callbacks.""" + + def test_callback_with_none_model_loader(self): + """Test callback registration with None model loader.""" + mock_app = MagicMock(spec=Dash) + prediction_history = [] + + # Should handle None model_loader gracefully + register_callbacks(mock_app, None, prediction_history) + assert mock_app.callback.called + + def test_callback_with_none_history(self): + """Test callback registration with None prediction history.""" + mock_app = MagicMock(spec=Dash) + mock_model_loader = MagicMock() + + # Should handle None prediction_history gracefully + register_callbacks(mock_app, mock_model_loader, None) + assert mock_app.callback.called + + def test_callback_with_invalid_app(self): + """Test callback registration with invalid app object.""" + mock_model_loader = MagicMock() + prediction_history = [] + + # Should handle invalid app object + with pytest.raises(AttributeError): + register_callbacks("invalid_app", mock_model_loader, prediction_history) + + +class TestCallbackInputValidation: + """Test input validation in callbacks.""" + + @pytest.fixture + def callback_function_mock(self): + """Mock the actual callback function for testing.""" + with patch("dash_app.dashboard.callbacks.register_callbacks") as mock_register: + # Create a mock prediction function + def mock_prediction_callback( + n_clicks, + time_alone, + social_events, + going_outside, + friends_size, + post_freq, + stage_fear, + drained_social, + ): + # Simulate input validation + if n_clicks is None or n_clicks == 0: + 
return "No prediction made" + + # Validate input ranges + inputs = [ + time_alone, + social_events, + going_outside, + friends_size, + post_freq, + stage_fear, + drained_social, + ] + + if any(x is None for x in inputs): + return "Invalid input: None values" + + if any(not isinstance(x, int | float) for x in inputs): + return "Invalid input: Non-numeric values" + + return "Valid prediction" + + mock_register.return_value = mock_prediction_callback + yield mock_prediction_callback + + def test_callback_with_none_clicks(self, callback_function_mock): + """Test callback behavior with no button clicks.""" + result = callback_function_mock(None, 3.0, 2.0, 4.0, 3.0, 2.0, 1.0, 2.0) + assert result == "No prediction made" + + def test_callback_with_zero_clicks(self, callback_function_mock): + """Test callback behavior with zero button clicks.""" + result = callback_function_mock(0, 3.0, 2.0, 4.0, 3.0, 2.0, 1.0, 2.0) + assert result == "No prediction made" + + def test_callback_with_none_inputs(self, callback_function_mock): + """Test callback behavior with None input values.""" + result = callback_function_mock(1, None, 2.0, 4.0, 3.0, 2.0, 1.0, 2.0) + assert result == "Invalid input: None values" + + def test_callback_with_invalid_inputs(self, callback_function_mock): + """Test callback behavior with invalid input types.""" + result = callback_function_mock(1, "invalid", 2.0, 4.0, 3.0, 2.0, 1.0, 2.0) + assert result == "Invalid input: Non-numeric values" + + def test_callback_with_valid_inputs(self, callback_function_mock): + """Test callback behavior with valid inputs.""" + result = callback_function_mock(1, 3.0, 2.0, 4.0, 3.0, 2.0, 1.0, 2.0) + assert result == "Valid prediction" + + +class TestCallbackHistoryManagement: + """Test prediction history management in callbacks.""" + + def test_history_updates_after_prediction(self): + """Test that prediction history is updated after successful prediction.""" + mock_app = MagicMock(spec=Dash) + mock_model_loader = MagicMock() 
+ prediction_history = [] + + # Configure mock to return a prediction + mock_model_loader.predict.return_value = { + "Extroversion": 0.8, + "Agreeableness": 0.6, + } + + register_callbacks(mock_app, mock_model_loader, prediction_history) + + # Verify that the history list reference is maintained + assert isinstance(prediction_history, list) + + def test_history_size_limit(self): + """Test that prediction history respects size limits.""" + # This would test if there's a maximum history size implementation + prediction_history = [{"test": f"prediction_{i}"} for i in range(1000)] + mock_app = MagicMock(spec=Dash) + mock_model_loader = MagicMock() + + register_callbacks(mock_app, mock_model_loader, prediction_history) + + # The function should handle large histories gracefully + assert isinstance(prediction_history, list) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/dash_app/test_dash_application.py b/tests/dash_app/test_dash_application.py new file mode 100644 index 0000000..4fdab43 --- /dev/null +++ b/tests/dash_app/test_dash_application.py @@ -0,0 +1,318 @@ +"""Tests for the main Dash application class.""" + +from unittest.mock import MagicMock, patch + +import dash +import pytest + +from dash_app.dashboard.app import PersonalityClassifierApp, create_app + + +class TestPersonalityClassifierApp: + """Test suite for PersonalityClassifierApp class.""" + + def test_app_initialization_default_params(self): + """Test app initialization with default parameters.""" + with patch("dash_app.dashboard.app.ModelLoader") as mock_loader: + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp(model_name="test_model") + + assert app.model_name == "test_model" + assert app.model_version is None + assert app.model_stage == "Production" + assert app.host == "127.0.0.1" + assert app.port == 8050 + + def test_app_initialization_custom_params(self): + """Test app initialization with custom parameters.""" + with 
patch("dash_app.dashboard.app.ModelLoader") as mock_loader: + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp( + model_name="custom_model", + model_version="v1.0", + model_stage="Staging", + host="0.0.0.0", + port=9000, + ) + + assert app.model_name == "custom_model" + assert app.model_version == "v1.0" + assert app.model_stage == "Staging" + assert app.host == "0.0.0.0" + assert app.port == 9000 + + def test_app_has_dash_instance(self): + """Test that app creates a Dash instance.""" + with patch("dash_app.dashboard.app.ModelLoader") as mock_loader: + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp(model_name="test_model") + + assert hasattr(app, "app") + assert isinstance(app.app, dash.Dash) + + def test_app_title_configuration(self): + """Test that app title is configured correctly.""" + with patch("dash_app.dashboard.app.ModelLoader") as mock_loader: + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp(model_name="test_model") + + assert "test_model" in app.app.title + + def test_app_layout_is_set(self): + """Test that app layout is properly set.""" + with patch("dash_app.dashboard.app.ModelLoader") as mock_loader: + mock_loader.return_value = MagicMock() + + with patch("dash_app.dashboard.app.create_layout") as mock_layout: + mock_layout.return_value = MagicMock() + + app = PersonalityClassifierApp(model_name="test_model") + + assert app.app.layout is not None + + def test_app_callbacks_registration(self): + """Test that callbacks are registered.""" + with patch("dash_app.dashboard.app.ModelLoader") as mock_loader: + mock_loader.return_value = MagicMock() + + with patch("dash_app.dashboard.app.register_callbacks") as mock_callbacks: + PersonalityClassifierApp(model_name="test_model") + + # Verify register_callbacks was called + mock_callbacks.assert_called_once() + + def test_app_prediction_history_initialization(self): + """Test that prediction history is initialized.""" + with 
patch("dash_app.dashboard.app.ModelLoader") as mock_loader: + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp(model_name="test_model") + + assert hasattr(app, "prediction_history") + assert isinstance(app.prediction_history, list) + assert len(app.prediction_history) == 0 + + def test_get_app_method(self): + """Test the get_app method.""" + with patch("dash_app.dashboard.app.ModelLoader") as mock_loader: + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp(model_name="test_model") + dash_app = app.get_app() + + assert isinstance(dash_app, dash.Dash) + assert dash_app is app.app + + +class TestAppRunning: + """Test suite for app running functionality.""" + + def test_app_run_method_exists(self): + """Test that run method exists.""" + with patch("dash_app.dashboard.app.ModelLoader") as mock_loader: + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp(model_name="test_model") + + assert hasattr(app, "run") + assert callable(app.run) + + @patch("dash_app.dashboard.app.ModelLoader") + def test_app_run_with_debug_false(self, mock_loader): + """Test app running with debug=False.""" + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp(model_name="test_model") + + # Mock the Dash app's run_server method + app.app.run_server = MagicMock() + + app.run(debug=False) + + # Verify run_server was called with correct parameters + app.app.run_server.assert_called_once_with( + host="127.0.0.1", port=8050, debug=False + ) + + @patch("dash_app.dashboard.app.ModelLoader") + def test_app_run_with_debug_true(self, mock_loader): + """Test app running with debug=True.""" + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp(model_name="test_model") + app.app.run_server = MagicMock() + + app.run(debug=True) + + app.app.run_server.assert_called_once_with( + host="127.0.0.1", port=8050, debug=True + ) + + @patch("dash_app.dashboard.app.ModelLoader") + def 
test_app_run_with_custom_host_port(self, mock_loader): + """Test app running with custom host and port.""" + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp( + model_name="test_model", host="0.0.0.0", port=9000 + ) + app.app.run_server = MagicMock() + + app.run() + + app.app.run_server.assert_called_once_with( + host="0.0.0.0", port=9000, debug=False + ) + + +class TestCreateAppFunction: + """Test suite for the create_app function.""" + + def test_create_app_function_exists(self): + """Test that create_app function exists.""" + assert callable(create_app) + + @patch("dash_app.dashboard.app.PersonalityClassifierApp") + def test_create_app_with_default_params(self, mock_app_class): + """Test create_app function with default parameters.""" + mock_instance = MagicMock() + mock_app_class.return_value = mock_instance + + result = create_app("test_model") + + mock_app_class.assert_called_once_with( + model_name="test_model", model_version=None, model_stage="Production" + ) + assert result == mock_instance.get_app.return_value + + @patch("dash_app.dashboard.app.PersonalityClassifierApp") + def test_create_app_with_custom_params(self, mock_app_class): + """Test create_app function with custom parameters.""" + mock_instance = MagicMock() + mock_app_class.return_value = mock_instance + + result = create_app( + model_name="custom_model", model_version="v2.0", model_stage="Staging" + ) + + mock_app_class.assert_called_once_with( + model_name="custom_model", model_version="v2.0", model_stage="Staging" + ) + assert result == mock_instance.get_app.return_value + + +class TestAppErrorHandling: + """Test error handling in app initialization and running.""" + + def test_app_with_invalid_model_name(self): + """Test app initialization with invalid model name.""" + with patch("dash_app.dashboard.app.ModelLoader") as mock_loader: + mock_loader.side_effect = FileNotFoundError("Model not found") + + with pytest.raises(FileNotFoundError): + 
PersonalityClassifierApp(model_name="nonexistent_model") + + @patch("dash_app.dashboard.app.ModelLoader") + def test_app_with_model_loading_error(self, mock_loader): + """Test app behavior when model loading fails.""" + mock_loader.side_effect = OSError("Model loading failed") + + with pytest.raises(OSError): # More specific exception + PersonalityClassifierApp(model_name="test_model") + + @patch("dash_app.dashboard.app.ModelLoader") + def test_app_run_server_error(self, mock_loader): + """Test app behavior when run_server fails.""" + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp(model_name="test_model") + app.app.run_server = MagicMock(side_effect=OSError("Server start failed")) + + with pytest.raises(OSError): + app.run() + + +class TestAppIntegration: + """Integration tests for the complete app.""" + + @patch("dash_app.dashboard.app.ModelLoader") + def test_full_app_initialization_workflow(self, mock_loader): + """Test complete app initialization workflow.""" + # Setup mock model loader + mock_model = MagicMock() + mock_model.predict.return_value = { + "Extroversion": 0.8, + "Agreeableness": 0.6, + "Conscientiousness": 0.7, + "Neuroticism": 0.4, + "Openness": 0.9, + } + mock_loader.return_value = mock_model + + # Initialize app + app = PersonalityClassifierApp(model_name="ensemble_model") + + # Verify all components are properly set up + assert app.model_name == "ensemble_model" + assert isinstance(app.app, dash.Dash) + assert app.app.layout is not None + assert isinstance(app.prediction_history, list) + + # Verify model loader was called + mock_loader.assert_called_once() + + @patch("dash_app.dashboard.app.ModelLoader") + def test_app_with_real_model_path(self, mock_loader): + """Test app with realistic model path.""" + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp(model_name="models/ensemble_model.pkl") + + assert app.model_name == "models/ensemble_model.pkl" + # Verify model loader was called with the 
path + mock_loader.assert_called_once() + + +class TestAppConfiguration: + """Test app configuration and settings.""" + + @patch("dash_app.dashboard.app.ModelLoader") + def test_app_external_stylesheets(self, mock_loader): + """Test that external stylesheets are properly configured.""" + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp(model_name="test_model") + + # Check that the app has external stylesheets configured + # Since Dash doesn't expose external_stylesheets directly, we check the config + assert hasattr(app.app, "config") + # Verify the app was created with stylesheets (implicit test) + assert app.app is not None + + @patch("dash_app.dashboard.app.ModelLoader") + def test_app_suppress_callback_exceptions(self, mock_loader): + """Test that callback exceptions are properly configured.""" + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp(model_name="test_model") + + # Should suppress callback exceptions for dynamic layouts + assert app.app.config.suppress_callback_exceptions is True + + @patch("dash_app.dashboard.app.ModelLoader") + def test_app_logging_configuration(self, mock_loader): + """Test that logging is properly configured.""" + mock_loader.return_value = MagicMock() + + app = PersonalityClassifierApp(model_name="test_model") + + assert hasattr(app, "logger") + assert app.logger is not None + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/dash_app/test_dashboard_functional.py b/tests/dash_app/test_dashboard_functional.py new file mode 100644 index 0000000..5c1a5dc --- /dev/null +++ b/tests/dash_app/test_dashboard_functional.py @@ -0,0 +1,193 @@ +"""Simplified functional tests for dashboard components.""" + +from unittest.mock import MagicMock, patch + +import dash_bootstrap_components as dbc +import pytest + +from dash_app.dashboard.app import PersonalityClassifierApp, create_app +from dash_app.dashboard.layout import create_layout, create_professional_header +from 
dash_app.dashboard.model_loader import ModelLoader + + +class TestDashboardFunctionality: + """Test the actual dashboard functionality.""" + + @patch("dash_app.dashboard.model_loader.ModelLoader._load_model") + def test_app_initialization(self, mock_load_model): + """Test that the app initializes correctly.""" + mock_load_model.return_value = None + + app = PersonalityClassifierApp(model_name="test_model") + + assert app.model_name == "test_model" + assert app.host == "127.0.0.1" + assert app.port == 8050 + assert app.app is not None + + @patch("dash_app.dashboard.model_loader.ModelLoader._load_model") + def test_app_with_custom_params(self, mock_load_model): + """Test app with custom parameters.""" + mock_load_model.return_value = None + + app = PersonalityClassifierApp( + model_name="custom_model", model_version="v1.0", host="0.0.0.0", port=9000 + ) + + assert app.model_name == "custom_model" + assert app.model_version == "v1.0" + assert app.host == "0.0.0.0" + assert app.port == 9000 + + def test_create_app_function(self): + """Test the create_app factory function.""" + with patch("dash_app.dashboard.app.PersonalityClassifierApp") as mock_app: + mock_instance = MagicMock() + mock_app.return_value = mock_instance + + create_app("test_model") + + mock_app.assert_called_once_with( + model_name="test_model", model_version=None, model_stage="Production" + ) + + def test_layout_creation(self): + """Test layout creation.""" + model_name = "test_model" + model_metadata = {"version": "1.0"} + + layout = create_layout(model_name, model_metadata) + + assert layout is not None + + def test_professional_header_creation(self): + """Test professional header creation.""" + header = create_professional_header() + + # The function returns a dbc.Container, not html.Div + assert isinstance(header, dbc.Container) + + @patch("dash_app.dashboard.model_loader.ModelLoader._load_model") + def test_model_loader_initialization(self, mock_load_model): + """Test model loader 
initialization.""" + mock_load_model.return_value = None + + loader = ModelLoader("test_model") + + assert loader.model_name == "test_model" + assert loader.model_stage == "Production" + + @patch("dash_app.dashboard.model_loader.ModelLoader._load_model") + def test_app_has_prediction_history(self, mock_load_model): + """Test that app has prediction history.""" + mock_load_model.return_value = None + + app = PersonalityClassifierApp(model_name="test_model") + + assert hasattr(app, "prediction_history") + assert isinstance(app.prediction_history, list) + + @patch("dash_app.dashboard.model_loader.ModelLoader._load_model") + def test_app_has_callback_registration(self, mock_load_model): + """Test that callbacks are registered.""" + mock_load_model.return_value = None + + app = PersonalityClassifierApp(model_name="test_model") + + # Check that the app has callbacks registered + assert hasattr(app.app, "callback_map") + + @patch("dash_app.dashboard.model_loader.ModelLoader._load_model") + def test_app_run_method(self, mock_load_model): + """Test app run method.""" + mock_load_model.return_value = None + + app = PersonalityClassifierApp(model_name="test_model") + app.app.run_server = MagicMock() + + app.run(debug=True) + + app.app.run_server.assert_called_once_with( + host="127.0.0.1", port=8050, debug=True + ) + + @patch("dash_app.dashboard.model_loader.ModelLoader._load_model") + def test_get_app_method(self, mock_load_model): + """Test get_app method.""" + mock_load_model.return_value = None + + app = PersonalityClassifierApp(model_name="test_model") + dash_app = app.get_app() + + assert dash_app is app.app + + +class TestModelLoaderFunctionality: + """Test model loader functionality.""" + + @patch("dash_app.dashboard.model_loader.ModelLoader._load_model") + def test_model_loader_attributes(self, mock_load_model): + """Test model loader has correct attributes.""" + mock_load_model.return_value = None + + loader = ModelLoader("test_model", "v1.0", "Staging") + + assert 
loader.model_name == "test_model" + assert loader.model_version == "v1.0" + assert loader.model_stage == "Staging" + + @patch("dash_app.dashboard.model_loader.ModelLoader._load_model") + def test_model_loader_has_model_attribute(self, mock_load_model): + """Test that model loader has model attribute.""" + mock_load_model.return_value = None + + loader = ModelLoader("test_model") + + assert hasattr(loader, "model") + + @patch("dash_app.dashboard.model_loader.ModelLoader._load_model") + def test_model_loader_has_metadata(self, mock_load_model): + """Test that model loader has metadata.""" + mock_load_model.return_value = None + + loader = ModelLoader("test_model") + + assert hasattr(loader, "model_metadata") + assert isinstance(loader.model_metadata, dict) + + +class TestIntegrationWorkflow: + """Test integration workflow.""" + + @patch("dash_app.dashboard.model_loader.ModelLoader._load_model") + def test_complete_app_creation_workflow(self, mock_load_model): + """Test complete app creation workflow.""" + mock_load_model.return_value = None + + # Create app + app = PersonalityClassifierApp(model_name="ensemble_model") + + # Verify all components are set up + assert app.model_name == "ensemble_model" + assert app.app is not None + assert app.app.layout is not None + assert app.model_loader is not None + assert isinstance(app.prediction_history, list) + + @patch("dash_app.dashboard.model_loader.ModelLoader._load_model") + def test_app_scalability(self, mock_load_model): + """Test that multiple apps can be created.""" + mock_load_model.return_value = None + + apps = [] + for i in range(3): + app = PersonalityClassifierApp(model_name=f"model_{i}") + apps.append(app) + + assert len(apps) == 3 + for app in apps: + assert app.app is not None + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/dash_app/test_integration.py b/tests/dash_app/test_integration.py new file mode 100644 index 0000000..7b5a71b --- /dev/null +++ 
b/tests/dash_app/test_integration.py @@ -0,0 +1,245 @@ +"""Integration tests for the complete dashboard pipeline.""" + +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from dash_app.dashboard.app import PersonalityClassifierApp + + +class TestDashboardIntegration: + """Integration tests for the complete dashboard workflow.""" + + @pytest.fixture + def temp_model_file(self): + """Create a temporary model file for testing.""" + with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as f: + temp_path = f.name + yield temp_path + # Cleanup + Path(temp_path).unlink(missing_ok=True) + + @patch("joblib.load") + def test_complete_dashboard_workflow(self, mock_joblib_load, temp_model_file): + """Test complete dashboard workflow from initialization to prediction.""" + # Setup mock model + mock_model = MagicMock() + mock_model.predict_proba.return_value = [ + [0.2, 0.8, 0.4, 0.6, 0.3, 0.7, 0.6, 0.4, 0.1, 0.9] + ] + mock_joblib_load.return_value = mock_model + + # Initialize dashboard with mock model + with patch("dash_app.dashboard.model_loader.Path.exists") as mock_exists: + mock_exists.return_value = True + + app = PersonalityClassifierApp( + model_name="ensemble", host="127.0.0.1", port=8050 + ) + + # Verify app is properly initialized + assert app.model_name == "ensemble" + assert app.host == "127.0.0.1" + assert app.port == 8050 + assert app.app is not None + assert app.app.layout is not None + + def test_dashboard_with_invalid_model_path(self): + """Test dashboard behavior with invalid model path.""" + # PersonalityClassifierApp doesn't raise FileNotFoundError - it creates dummy models + app = PersonalityClassifierApp(model_name="nonexistent_model") + assert app.model_name == "nonexistent_model" + assert app.app is not None + + @patch("joblib.load") + def test_dashboard_layout_rendering(self, mock_joblib_load, temp_model_file): + """Test that dashboard layout renders correctly.""" + mock_model = 
MagicMock() + mock_joblib_load.return_value = mock_model + + with patch("dash_app.dashboard.model_loader.Path.exists") as mock_exists: + mock_exists.return_value = True + + app = PersonalityClassifierApp(model_name="test_model") + + # Verify layout components exist + layout = app.app.layout + assert layout is not None + + @patch("joblib.load") + def test_dashboard_callbacks_registration(self, mock_joblib_load, temp_model_file): + """Test that dashboard callbacks are properly registered.""" + mock_model = MagicMock() + mock_joblib_load.return_value = mock_model + + with patch("dash_app.dashboard.model_loader.Path.exists") as mock_exists: + mock_exists.return_value = True + + app = PersonalityClassifierApp(model_name="test_model") + + # Verify that callbacks are registered (app should have callback registry) + assert hasattr(app.app, "callback_map") + + +class TestDashboardErrorRecovery: + """Test dashboard error recovery and graceful degradation.""" + + @patch("joblib.load") + def test_dashboard_with_corrupted_model(self, mock_joblib_load): + """Test dashboard behavior with corrupted model.""" + mock_joblib_load.side_effect = OSError("Corrupted model file") + + # PersonalityClassifierApp handles corrupted models gracefully with dummy fallback + app = PersonalityClassifierApp(model_name="corrupted_model") + assert app.model_name == "corrupted_model" + assert app.app is not None + + @patch("joblib.load") + def test_dashboard_handles_prediction_errors(self, mock_joblib_load): + """Test dashboard handles prediction errors gracefully.""" + # Setup mock model that fails during prediction + mock_model = MagicMock() + mock_model.predict_proba.side_effect = ValueError("Prediction failed") + mock_joblib_load.return_value = mock_model + + with patch("dash_app.dashboard.model_loader.Path.exists") as mock_exists: + mock_exists.return_value = True + + # Should initialize successfully even if model has issues + app = PersonalityClassifierApp(model_name="test_model") + assert app 
is not None + + +class TestDashboardPerformance: + """Test dashboard performance and resource usage.""" + + @patch("joblib.load") + def test_dashboard_memory_usage(self, mock_joblib_load): + """Test that dashboard doesn't create memory leaks.""" + mock_model = MagicMock() + mock_joblib_load.return_value = mock_model + + with patch("dash_app.dashboard.model_loader.Path.exists") as mock_exists: + mock_exists.return_value = True + + # Create multiple app instances + apps = [] + for i in range(5): + app = PersonalityClassifierApp(model_name=f"test_model_{i}") + apps.append(app) + + # Each should be independent + assert len(apps) == 5 + for app in apps: + assert app.app is not None + + @patch("joblib.load") + def test_dashboard_startup_time(self, mock_joblib_load): + """Test dashboard startup performance.""" + mock_model = MagicMock() + mock_joblib_load.return_value = mock_model + + with patch("dash_app.dashboard.model_loader.Path.exists") as mock_exists: + mock_exists.return_value = True + + app = PersonalityClassifierApp(model_name="test_model") + + # Verify that startup is reasonably fast + assert app.app is not None + + +class TestDashboardConfiguration: + """Test dashboard configuration options.""" + + @patch("joblib.load") + def test_dashboard_custom_configuration(self, mock_joblib_load): + """Test dashboard with custom configuration.""" + mock_model = MagicMock() + mock_joblib_load.return_value = mock_model + + with patch("dash_app.dashboard.model_loader.Path.exists") as mock_exists: + mock_exists.return_value = True + + app = PersonalityClassifierApp( + model_name="custom_model", + model_version="v2.0", + model_stage="Staging", + host="0.0.0.0", + port=9000, + ) + + assert app.model_name == "custom_model" + assert app.model_version == "v2.0" + assert app.model_stage == "Staging" + assert app.host == "0.0.0.0" + assert app.port == 9000 + + @patch("joblib.load") + def test_dashboard_environment_variables(self, mock_joblib_load): + """Test dashboard respects 
environment configuration.""" + mock_model = MagicMock() + mock_joblib_load.return_value = mock_model + + with patch("dash_app.dashboard.model_loader.Path.exists") as mock_exists: + mock_exists.return_value = True + # Test with environment-like configuration + with patch.dict("os.environ", {"DASH_HOST": "0.0.0.0", "DASH_PORT": "9000"}): + app = PersonalityClassifierApp(model_name="test_model") + + # App should still use provided parameters over environment + assert app.host == "127.0.0.1" # Default value + assert app.port == 8050 # Default value + + +class TestDashboardScalability: + """Test dashboard scalability and concurrent usage.""" + + @patch("joblib.load") + def test_dashboard_concurrent_initialization(self, mock_joblib_load): + """Test multiple dashboard instances can be created concurrently.""" + mock_model = MagicMock() + mock_joblib_load.return_value = mock_model + + with patch("dash_app.dashboard.model_loader.Path.exists") as mock_exists: + mock_exists.return_value = True + + # Test creating multiple app instances + apps = [] + for i in range(3): + app = PersonalityClassifierApp(model_name=f"model_{i}") + apps.append(app) + + # All should succeed + assert len(apps) == 3 + for app in apps: + assert isinstance(app, PersonalityClassifierApp) + + @patch("joblib.load") + def test_dashboard_prediction_history_management(self, mock_joblib_load): + """Test prediction history management under load.""" + mock_model = MagicMock() + mock_joblib_load.return_value = mock_model + + with patch("dash_app.dashboard.model_loader.Path.exists") as mock_exists: + mock_exists.return_value = True + + app = PersonalityClassifierApp(model_name="test_model") + + # Simulate adding many predictions to history + for i in range(100): + app.prediction_history.append( + { + "timestamp": f"2025-01-15T{i:02d}:00:00", + "prediction": {"Extroversion": 0.8}, + } + ) + + assert len(app.prediction_history) == 100 + # History should be manageable even with many entries + assert 
isinstance(app.prediction_history, list) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/dash_app/test_layout_components.py b/tests/dash_app/test_layout_components.py new file mode 100644 index 0000000..86a871b --- /dev/null +++ b/tests/dash_app/test_layout_components.py @@ -0,0 +1,205 @@ +"""Tests for dashboard layout components.""" + +import dash_bootstrap_components as dbc +import plotly.graph_objects as go +import pytest +from dash import html + +from dash_app.dashboard.layout import ( + create_input_panel, + create_layout, + create_personality_radar, + create_professional_header, + format_prediction_result, +) + + +class TestLayoutComponents: + """Test suite for layout components.""" + + def test_create_professional_header(self): + """Test professional header creation.""" + header = create_professional_header() + + # The header returns a dbc.Container, not html.Div + assert isinstance(header, dbc.Container) + # Check for required styling + assert hasattr(header, "style") + # Check for children components + assert hasattr(header, "children") + + def test_create_input_panel(self): + """Test input panel creation.""" + panel = create_input_panel() + + assert isinstance(panel, dbc.Card) + # Should have card header and body + assert hasattr(panel, "children") + + def test_create_layout_structure(self): + """Test main layout structure.""" + model_name = "test_model" + model_metadata = {"version": "1.0", "created": "2025-01-01"} + + layout = create_layout(model_name, model_metadata) + + assert isinstance(layout, html.Div) + assert hasattr(layout, "children") + assert len(layout.children) >= 2 # Header + Content + + +class TestPersonalityRadar: + """Test suite for personality radar chart.""" + + def test_create_personality_radar_with_valid_data(self): + """Test radar chart creation with valid probability data.""" + probabilities = { + "Extroversion": 0.8, + "Agreeableness": 0.6, + "Conscientiousness": 0.7, + "Neuroticism": 0.4, + 
"Openness": 0.9, + } + + fig = create_personality_radar(probabilities) + + assert isinstance(fig, go.Figure) + assert len(fig.data) > 0 + assert fig.data[0].type == "scatterpolar" + + def test_create_personality_radar_with_input_data(self): + """Test radar chart creation with input data included.""" + probabilities = { + "Extroversion": 0.8, + "Agreeableness": 0.6, + "Conscientiousness": 0.7, + "Neuroticism": 0.4, + "Openness": 0.9, + } + input_data = {"time_alone": 3.0, "social_events": 2.0} + + fig = create_personality_radar(probabilities, input_data) + + assert isinstance(fig, go.Figure) + assert len(fig.data) > 0 + + def test_create_personality_radar_empty_data(self): + """Test radar chart with empty probability data.""" + probabilities = {} + + fig = create_personality_radar(probabilities) + + assert isinstance(fig, go.Figure) + # Should handle empty data gracefully + + def test_create_personality_radar_invalid_values(self): + """Test radar chart with invalid probability values.""" + probabilities = { + "Extroversion": 1.5, # Invalid: > 1.0 + "Agreeableness": -0.1, # Invalid: < 0.0 + "Conscientiousness": 0.7, + } + + # Should not raise an exception + fig = create_personality_radar(probabilities) + assert isinstance(fig, go.Figure) + + +class TestPredictionFormatting: + """Test suite for prediction result formatting.""" + + def test_format_prediction_result_valid(self): + """Test formatting of valid prediction results.""" + result_dict = { + "probabilities": { + "Extroversion": 0.8, + "Agreeableness": 0.6, + "Conscientiousness": 0.7, + "Neuroticism": 0.4, + "Openness": 0.9, + }, + "input_data": { + "time_alone": 3.0, + "social_events": 2.0, + "going_outside": 4.0, + "friends_size": 3.0, + "post_freq": 2.0, + "stage_fear": 1.0, + "drained_social": 2.0, + }, + } + + result = format_prediction_result(result_dict) + + assert isinstance(result, dbc.Card) + # Should contain formatted components + assert hasattr(result, "children") + + def 
test_format_prediction_result_missing_data(self): + """Test formatting with missing input data.""" + result_dict = {"probabilities": {"Extroversion": 0.8, "Agreeableness": 0.6}} + + # Should handle missing input data gracefully + result = format_prediction_result(result_dict) + assert isinstance(result, dbc.Card) + + +class TestLayoutIntegration: + """Integration tests for layout components.""" + + def test_layout_with_mock_model_metadata(self): + """Test layout creation with realistic model metadata.""" + model_name = "six_stack_ensemble" + model_metadata = { + "model_type": "ensemble", + "version": "1.0.0", + "created_date": "2025-01-15", + "accuracy": 0.92, + "features": [ + "time_alone", + "social_events", + "going_outside", + "friends_size", + "post_freq", + "stage_fear", + "drained_social", + ], + } + + layout = create_layout(model_name, model_metadata) + + assert isinstance(layout, html.Div) + # Verify structure contains expected components + assert len(layout.children) >= 2 + + def test_layout_responsiveness(self): + """Test that layout components have responsive classes.""" + layout = create_layout("test", {}) + + # Check for Bootstrap responsive classes in the layout + layout_str = str(layout) + assert "dbc.Container" in layout_str or "container" in layout_str.lower() + + +class TestLayoutEdgeCases: + """Test edge cases for layout components.""" + + def test_empty_model_name(self): + """Test layout creation with empty model name.""" + layout = create_layout("", {}) + assert isinstance(layout, html.Div) + + def test_none_model_metadata(self): + """Test layout creation with None metadata.""" + layout = create_layout("test_model", {}) + assert isinstance(layout, html.Div) + + def test_large_model_metadata(self): + """Test layout with extensive metadata.""" + large_metadata = {f"param_{i}": f"value_{i}" for i in range(100)} + layout = create_layout("test_model", large_metadata) + assert isinstance(layout, html.Div) + + +if __name__ == "__main__": + 
pytest.main([__file__]) diff --git a/tests/dash_app/test_model_loader.py b/tests/dash_app/test_model_loader.py new file mode 100644 index 0000000..53dfd76 --- /dev/null +++ b/tests/dash_app/test_model_loader.py @@ -0,0 +1,138 @@ +"""Tests for dashboard model loader.""" + +import pytest + +from dash_app.dashboard.model_loader import ModelLoader + + +class TestModelLoader: + """Test suite for ModelLoader class.""" + + def test_model_loader_initialization(self): + """Test ModelLoader initialization.""" + loader = ModelLoader( + model_name="test_model", model_version="1.0", model_stage="Testing" + ) + assert loader.model_name == "test_model" + assert loader.model_version == "1.0" + assert loader.model_stage == "Testing" + # Model should be loaded (either real model or dummy) + assert loader.model is not None + + def test_model_loader_with_ensemble_name(self): + """Test ModelLoader with ensemble model name.""" + loader = ModelLoader(model_name="ensemble") + assert loader.model_name == "ensemble" + assert loader.is_loaded() is True + + def test_model_loader_get_metadata(self): + """Test model metadata retrieval.""" + loader = ModelLoader(model_name="test_model") + metadata = loader.get_metadata() + assert isinstance(metadata, dict) + assert "version" in metadata + assert "stage" in metadata + + def test_model_loader_is_loaded(self): + """Test model loading status check.""" + loader = ModelLoader(model_name="test_model") + assert loader.is_loaded() is True + + def test_model_loader_str_representation(self): + """Test string representation of ModelLoader.""" + loader = ModelLoader(model_name="test_model") + # Just check that it doesn't raise an error + str_repr = repr(loader) + assert isinstance(str_repr, str) + + +class TestModelPrediction: + """Test suite for model prediction functionality.""" + + @pytest.fixture + def model_loader(self): + """Create a ModelLoader for testing predictions.""" + return ModelLoader(model_name="test_model") + + def 
test_model_prediction_success(self, model_loader): + """Test successful model prediction.""" + input_data = { + "Time_spent_Alone": 3.0, + "Social_event_attendance": 2.0, + "Going_outside": 4.0, + "Friends_circle_size": 3.0, + "Post_frequency": 2.0, + "Stage_fear_No": 1, + "Stage_fear_Unknown": 0, + "Stage_fear_Yes": 0, + "Drained_after_socializing_No": 1, + "Drained_after_socializing_Unknown": 0, + "Drained_after_socializing_Yes": 0, + "match_p_Extrovert": 0, + "match_p_Introvert": 0, + "match_p_Unknown": 1, + } + + result = model_loader.predict(input_data) + + assert isinstance(result, dict) + assert "prediction" in result + assert "confidence" in result + assert result["model_name"] == "test_model" + + def test_model_prediction_with_missing_features(self, model_loader): + """Test prediction with missing input features.""" + input_data = { + "Time_spent_Alone": 3.0, + "Social_event_attendance": 2.0, + # Missing other features - should be handled by default values + } + + result = model_loader.predict(input_data) + assert isinstance(result, dict) + assert "prediction" in result + + def test_model_prediction_with_invalid_input(self, model_loader): + """Test prediction with invalid input data.""" + invalid_input = "invalid_input" + + with pytest.raises((ValueError, TypeError, AttributeError)): + model_loader.predict(invalid_input) + + def test_model_prediction_empty_input(self, model_loader): + """Test prediction with empty input.""" + empty_input = {} + + # Should handle empty input with default values + result = model_loader.predict(empty_input) + assert isinstance(result, dict) + assert "prediction" in result + + +class TestModelLoaderEdgeCases: + """Test edge cases for ModelLoader.""" + + def test_model_loader_with_dummy_fallback(self): + """Test ModelLoader creates dummy model when no real model found.""" + # Use a model name that won't exist + loader = ModelLoader(model_name="nonexistent_model") + + # Should still be loaded (with dummy model) + assert 
loader.is_loaded() is True + assert loader.model is not None + + # Metadata should indicate dummy model + metadata = loader.get_metadata() + assert metadata.get("version") == "dummy" + + def test_model_loader_ensemble_vs_stack(self): + """Test different model name patterns.""" + ensemble_loader = ModelLoader(model_name="ensemble") + stack_loader = ModelLoader(model_name="A") # Stack A + + assert ensemble_loader.model_name == "ensemble" + assert stack_loader.model_name == "A" + + # Both should be loaded + assert ensemble_loader.is_loaded() + assert stack_loader.is_loaded() diff --git a/uv.lock b/uv.lock index e1a8f9f..45a3513 100644 --- a/uv.lock +++ b/uv.lock @@ -834,6 +834,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/72/ef/d46131f4817f18b329e4fb7c53ba1d31774239d91266a74bccdc932708cc/dash-2.18.2-py3-none-any.whl", hash = "sha256:0ce0479d1bc958e934630e2de7023b8a4558f23ce1f9f5a4b34b65eb3903a869", size = 7792658, upload-time = "2024-11-04T21:12:56.592Z" }, ] +[[package]] +name = "dash-bootstrap-components" +version = "1.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dash" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/68/fa/f702d729a4b788293b796dc92f3d529909641de1e2e13f967211169b807a/dash_bootstrap_components-1.7.1.tar.gz", hash = "sha256:30d48340d6dc89831d6c06e400cd4236f0d5363562c05b2a922f21545695a082", size = 136579, upload-time = "2025-01-16T07:11:28.74Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c3/87/4db3b56e9a6813d413a0f20e053aa163d652babb629a8bf7b871af4a075f/dash_bootstrap_components-1.7.1-py3-none-any.whl", hash = "sha256:5e8eae7ee1d013f69e272c68c1015b53ab71802460152088f33fffa90d245199", size = 229294, upload-time = "2025-01-16T07:11:24.635Z" }, +] + [[package]] name = "dash-core-components" version = "2.0.0" @@ -2474,12 +2486,14 @@ source = { editable = "." 
} dependencies = [ { name = "catboost" }, { name = "dash" }, + { name = "dash-bootstrap-components" }, { name = "imbalanced-learn" }, { name = "joblib" }, { name = "lightgbm" }, { name = "numpy" }, { name = "optuna" }, { name = "pandas" }, + { name = "plotly" }, { name = "scikit-learn" }, { name = "scipy" }, { name = "sdv" }, @@ -2521,6 +2535,7 @@ requires-dist = [ { name = "bandit", marker = "extra == 'dev'", specifier = ">=1.7.0,<2.0.0" }, { name = "catboost", specifier = ">=1.2.0,<2.0.0" }, { name = "dash", specifier = ">=2.14.0,<3.0.0" }, + { name = "dash-bootstrap-components", specifier = ">=1.7.1" }, { name = "h2o", marker = "extra == 'automl'", specifier = ">=3.44.0,<4.0.0" }, { name = "imbalanced-learn", specifier = ">=0.11.0,<1.0.0" }, { name = "joblib", specifier = ">=1.3.0,<2.0.0" }, @@ -2529,6 +2544,7 @@ requires-dist = [ { name = "numpy", specifier = ">=1.24.0,<2.0.0" }, { name = "optuna", specifier = ">=3.4.0,<4.0.0" }, { name = "pandas", specifier = ">=2.0.0,<3.0.0" }, + { name = "plotly", specifier = ">=5.24.1" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=3.3.0,<4.0.0" }, { name = "pydocstyle", marker = "extra == 'dev'", specifier = ">=6.3.0,<7.0.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.4.0,<8.0.0" },