From 8a20f75ddeef26ad5978afbd1c9ac7815364d045 Mon Sep 17 00:00:00 2001 From: Jeremy Vachier <89128100+jvachier@users.noreply.github.com> Date: Sun, 30 Nov 2025 12:31:42 +0100 Subject: [PATCH 1/2] Cleaning the repo and removing unnecessary code. --- LICENSE | 2 +- Makefile | 22 ++++---- README.md | 42 +++++++-------- docs/README.md | 2 +- docs/data-augmentation.md | 4 +- scripts/train_and_save_models.py | 6 +-- src/main_modular.py | 46 ++++++++--------- src/modules/data_augmentation.py | 72 +++++++++++++------------- src/modules/data_loader.py | 10 ++-- src/modules/preprocessing.py | 38 +++++++------- tests/README.md | 88 ++++++++++++++++---------------- 11 files changed, 166 insertions(+), 166 deletions(-) diff --git a/LICENSE b/LICENSE index 261eeb9..41e33b1 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2025 Jeremy Vachier Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/Makefile b/Makefile index ed048aa..03eca9b 100644 --- a/Makefile +++ b/Makefile @@ -25,53 +25,53 @@ help: # Dependency management install: - @echo "๐Ÿ“ฆ Installing dependencies with uv..." + @echo "Installing dependencies with uv..." uv sync --all-extras # Code quality with Ruff format: - @echo "๐ŸŽจ Formatting code with ruff..." + @echo "Formatting code with ruff..." uv run ruff format src/ dash_app/ tests/ scripts/ lint: - @echo "๐Ÿ” Linting code with ruff..." + @echo "Linting code with ruff..." uv run ruff check . --fix uv run ruff format --check . # Type checking typecheck: - @echo "๐Ÿ”Ž Type checking with mypy..." + @echo "Type checking with mypy..." uv run mypy src/ --ignore-missing-imports # Security checking security: - @echo "๐Ÿ”’ Security checking with bandit..." + @echo "Security checking with bandit..." 
uv run bandit -r src/ -f json # Run all quality checks check-all: lint typecheck security - @echo "โœ… All code quality checks completed!" + @echo "All code quality checks completed!" # Testing test: - @echo "๐Ÿงช Running tests..." + @echo "Running tests..." uv run pytest tests/ -v # Pipeline execution run: - @echo "๐Ÿš€ Running modular pipeline..." + @echo "Running modular pipeline..." uv run python src/main_modular.py # Model training train-models: - @echo "๐Ÿค– Training and saving ML models..." + @echo "Training and saving ML models..." uv run python scripts/train_and_save_models.py # Dash application dash: - @echo "๐Ÿ“Š Starting Dash application..." + @echo "Starting Dash application..." uv run python dash_app/main.py --model-name ensemble stop-dash: - @echo "๐Ÿ›‘ Stopping Dash application..." + @echo "Stopping Dash application..." @lsof -ti:8050 | xargs kill -9 2>/dev/null || echo "No process found on port 8050" diff --git a/README.md b/README.md index 9f017f1..d3543fc 100644 --- a/README.md +++ b/README.md @@ -63,18 +63,18 @@ uv run python src/main_modular.py # Run pipeline ``` src/ -โ”œโ”€โ”€ main_modular.py # ๐ŸŽฏ Main production pipeline (MLOps-enhanced) -โ”œโ”€โ”€ modules/ # ๐Ÿงฉ Core modules -โ”‚ โ”œโ”€โ”€ config.py # โš™๏ธ Configuration & logging -โ”‚ โ”œโ”€โ”€ data_loader.py # ๐Ÿ“Š Data loading & external merge -โ”‚ โ”œโ”€โ”€ preprocessing.py # ๐Ÿ”ง Feature engineering -โ”‚ โ”œโ”€โ”€ data_augmentation.py # ๐ŸŽฒ Advanced synthetic data -โ”‚ โ”œโ”€โ”€ model_builders.py # ๐Ÿญ Model stack construction -โ”‚ โ”œโ”€โ”€ ensemble.py # ๐ŸŽฏ Ensemble & OOF predictions -โ”‚ โ”œโ”€โ”€ optimization.py # ๐Ÿ” Optuna utilities -โ”‚ โ””โ”€โ”€ utils.py # ๐Ÿ› ๏ธ Utility functions - -dash_app/ # ๐Ÿ–ฅ๏ธ Interactive Dashboard +โ”œโ”€โ”€ main_modular.py # Main production pipeline (MLOps-enhanced) +โ”œโ”€โ”€ modules/ # Core modules +โ”‚ โ”œโ”€โ”€ config.py # Configuration & logging +โ”‚ โ”œโ”€โ”€ data_loader.py # Data loading & external merge +โ”‚ โ”œโ”€โ”€ 
preprocessing.py # Feature engineering +โ”‚ โ”œโ”€โ”€ data_augmentation.py # Advanced synthetic data +โ”‚ โ”œโ”€โ”€ model_builders.py # Model stack construction +โ”‚ โ”œโ”€โ”€ ensemble.py # Ensemble & OOF predictions +โ”‚ โ”œโ”€โ”€ optimization.py # Optuna utilities +โ”‚ โ””โ”€โ”€ utils.py # Utility functions + +dash_app/ # Interactive Dashboard โ”œโ”€โ”€ dashboard/ # Application source โ”‚ โ”œโ”€โ”€ app.py # Main Dash application โ”‚ โ”œโ”€โ”€ layout.py # UI layout components @@ -84,21 +84,21 @@ dash_app/ # ๐Ÿ–ฅ๏ธ Interactive Dashboard โ”œโ”€โ”€ Dockerfile # Container configuration โ””โ”€โ”€ docker-compose.yml # Multi-service orchestration -models/ # ๐Ÿค– Trained Models +models/ # Trained Models โ”œโ”€โ”€ ensemble_model.pkl # Production ensemble model โ”œโ”€โ”€ ensemble_metadata.json # Model metadata and labels โ”œโ”€โ”€ stack_*_model.pkl # Individual stack models โ””โ”€โ”€ stack_*_metadata.json # Stack-specific metadata -scripts/ # ๐Ÿ› ๏ธ Utility Scripts +scripts/ # Utility Scripts โ””โ”€โ”€ train_and_save_models.py # Model training and persistence -data/ # ๐Ÿ“Š Datasets +data/ # Datasets -docs/ # ๐Ÿ“ Documentation +docs/ # Documentation โ””โ”€โ”€ [Generated documentation] # Technical guides -best_params/ # ๐Ÿ’พ Optimized parameters +best_params/ # Optimized parameters โ””โ”€โ”€ stack_*_best_params.json # Per-stack best parameters ``` @@ -231,7 +231,7 @@ The pipeline employs six specialized ensemble stacks, each optimized for differe The pipeline is designed to achieve high accuracy through ensemble learning and advanced optimization techniques. 
Performance will vary based on: ``` -๐Ÿ“Š Dataset Statistics +Dataset Statistics โ”œโ”€โ”€ Training Samples: ~18,000+ (with augmentation) โ”œโ”€โ”€ Test Samples: ~6,000+ โ”œโ”€โ”€ Original Features: 8 personality dimensions @@ -239,11 +239,11 @@ The pipeline is designed to achieve high accuracy through ensemble learning and โ”œโ”€โ”€ Augmented Samples: Variable (adaptive, typically 5-10%) โ””โ”€โ”€ Class Balance: Extrovert/Introvert classification -๐Ÿ”ง Technical Specifications +Technical Specifications โ”œโ”€โ”€ Memory Usage: <4GB peak (configurable) โ”œโ”€โ”€ CPU Utilization: 4 cores (configurable) -โ”œโ”€โ”€ Model Persistence: โœ… Best parameters saved -โ””โ”€โ”€ Reproducibility: โœ… Fixed random seeds +โ”œโ”€โ”€ Model Persistence: Yes - Best parameters saved +โ””โ”€โ”€ Reproducibility: Yes - Fixed random seeds ``` ## Testing & Validation diff --git a/docs/README.md b/docs/README.md index ac7fdb7..7535805 100644 --- a/docs/README.md +++ b/docs/README.md @@ -40,7 +40,7 @@ docker build -t personality-classifier . 
docker run -p 8080:8080 personality-classifier ``` -## ๐Ÿ“š Resources +## Resources - Code: `src/main_modular.py`, `examples/` - Config templates: [Configuration Guide](configuration.md) diff --git a/docs/data-augmentation.md b/docs/data-augmentation.md index b437964..582bc3f 100644 --- a/docs/data-augmentation.md +++ b/docs/data-augmentation.md @@ -397,14 +397,14 @@ def calculate_adaptive_ratio(data_characteristics): ### When to Use Augmentation -โœ… **Recommended**: +**Recommended**: - Small to medium datasets (<10K samples) - Class imbalanced problems - High-stakes applications requiring robustness - When overfitting is detected -โŒ **Not Recommended**: +**Not Recommended**: - Very large datasets (>100K samples) - When computational resources are limited diff --git a/scripts/train_and_save_models.py b/scripts/train_and_save_models.py index aa344e7..fd1d883 100755 --- a/scripts/train_and_save_models.py +++ b/scripts/train_and_save_models.py @@ -174,14 +174,14 @@ def main(): setup_logging() logger = get_logger(__name__) - logger.info("๐Ÿš€ Starting model training and saving process...") + logger.info("Starting model training and saving process...") # Create models directory models_dir = Path("models") models_dir.mkdir(exist_ok=True) # Load and prepare data - logger.info("๐Ÿ“Š Loading and preparing data...") + logger.info("Loading and preparing data...") df_tr, df_te, submission = load_data_with_external_merge() # Preprocess data (prep function expects target column in df_tr) @@ -215,7 +215,7 @@ def main(): except Exception as e: logger.error(f"Failed to train ensemble model: {e}") - logger.info("โœ… Model training and saving complete!") + logger.info("Model training and saving complete!") logger.info(f"Models saved in: {models_dir.absolute()}") diff --git a/src/main_modular.py b/src/main_modular.py index f92b43e..42ab466 100755 --- a/src/main_modular.py +++ b/src/main_modular.py @@ -88,7 +88,7 @@ def load_and_prepare_data( testing_mode: bool = True, test_size: 
int = 1000 ) -> TrainingData: """Load and prepare training data.""" - logger.info("๐ŸŽฏ Six-Stack Personality Classification Pipeline (Modular)") + logger.info("Six-Stack Personality Classification Pipeline (Modular)") logger.info("=" * 60) # Load data using advanced merge strategy @@ -97,11 +97,11 @@ def load_and_prepare_data( # FOR TESTING: Limit to specified samples for faster execution if testing_mode and len(df_tr) > test_size: logger.info( - f"๐Ÿ”ฌ TESTING MODE: Limiting dataset to {test_size} samples " + f"TESTING MODE: Limiting dataset to {test_size} samples " f"(original: {len(df_tr)})" ) df_tr = df_tr.sample(n=test_size, random_state=RND).reset_index(drop=True) - logger.info(f" ๐Ÿ“Š Using {len(df_tr)} samples for testing") + logger.info(f" Using {len(df_tr)} samples for testing") # Preprocess data with advanced competitive approach (do this first) X_full, X_test, y_full, le = prep(df_tr, df_te) @@ -235,7 +235,7 @@ def train_single_stack(config: StackConfig, data: TrainingData) -> optuna.Study: def train_all_stacks(data: TrainingData) -> dict[str, optuna.Study]: """Train all stacks in the ensemble.""" - logger.info("\n๐Ÿ” Training 6 specialized stacks...") + logger.info("\nTraining 6 specialized stacks...") stack_configs = get_stack_configurations() studies = {} @@ -250,7 +250,7 @@ def create_model_builders( studies: dict[str, optuna.Study], data: TrainingData ) -> dict[str, Callable[[], Any]]: """Create model builder functions for each stack.""" - logger.info("\n๐Ÿ“Š Creating model builders for ensemble...") + logger.info("\nCreating model builders for ensemble...") builders = { "A": lambda: build_stack(studies["A"].best_trial, seed=RND, wide_hp=False), @@ -274,7 +274,7 @@ def generate_oof_predictions( builders: dict[str, Callable[[], Any]], data: TrainingData ) -> dict[str, pd.Series]: """Generate out-of-fold predictions for all stacks.""" - logger.info("\n๐Ÿ”ฎ Generating out-of-fold predictions...") + logger.info("\nGenerating out-of-fold 
predictions...") oof_predictions = {} @@ -325,7 +325,7 @@ def optimize_ensemble_blending( oof_predictions: dict[str, pd.Series], y_full: pd.Series ) -> tuple[dict[str, float], float]: """Optimize ensemble blending weights.""" - logger.info("\nโš–๏ธ Optimizing ensemble blending...") + logger.info("\nOptimizing ensemble blending...") study_blend = optuna.create_study(direction="maximize") blend_objective = create_blend_objective(oof_predictions, y_full) @@ -345,7 +345,7 @@ def optimize_ensemble_blending( "F": best_weights_list[5], } - logger.info("\n๐Ÿ† Best ensemble weights:") + logger.info("\nBest ensemble weights:") for stack_name, weight in best_weights.items(): logger.info(f" Stack {stack_name}: {weight:.3f}") logger.info(f"Best CV score: {study_blend.best_value:.6f}") @@ -377,7 +377,7 @@ def refit_and_predict( models["F"].fit(data.X_full, y_full_noisy) # Generate final predictions - logger.info("\n๐ŸŽฏ Generating final predictions...") + logger.info("\nGenerating final predictions...") probabilities = {} for stack_name in ["A", "B", "C", "D", "E", "F"]: probabilities[stack_name] = models[stack_name].predict_proba(data.X_test)[:, 1] @@ -408,11 +408,11 @@ def apply_pseudo_labelling( ) -> TrainingData: """Apply pseudo labelling using ensemble predictions.""" if not ENABLE_PSEUDO_LABELLING: - logger.info("๐Ÿ”ฎ Pseudo labelling disabled") + logger.info("Pseudo labelling disabled") return data logger.info( - f"\n๐Ÿ”ฎ Applying pseudo labelling (threshold={PSEUDO_CONFIDENCE_THRESHOLD}, max_ratio={PSEUDO_MAX_RATIO})..." + f"\nApplying pseudo labelling (threshold={PSEUDO_CONFIDENCE_THRESHOLD}, max_ratio={PSEUDO_MAX_RATIO})..." 
) # First train models to get test predictions for pseudo labelling @@ -465,7 +465,7 @@ def apply_pseudo_labelling( # Create new TrainingData with pseudo labels added if pseudo_stats["n_pseudo_added"] > 0: logger.info( - f"โœ… Pseudo labelling added {pseudo_stats['n_pseudo_added']} samples" + f"Pseudo labelling added {pseudo_stats['n_pseudo_added']} samples" ) # Create new TrainingData object with enhanced training set @@ -478,14 +478,14 @@ def apply_pseudo_labelling( ) return enhanced_data else: - logger.info("โš ๏ธ No pseudo labels added, using original data") + logger.info("No pseudo labels added, using original data") return data def main(): """Main execution function for the Six-Stack Personality Classification Pipeline.""" - logger.info("๐Ÿš€ Starting Six-Stack Personality Classification Pipeline") + logger.info("Starting Six-Stack Personality Classification Pipeline") try: # Load and prepare data @@ -494,7 +494,7 @@ def main(): ) logger.info( - f"๐Ÿ“Š Loaded data: {len(data.X_full)} training samples, {len(data.X_test)} test samples" + f"Loaded data: {len(data.X_full)} training samples, {len(data.X_test)} test samples" ) # Train all stacks @@ -503,7 +503,7 @@ def main(): # Log stack optimization results for stack_name, study in studies.items(): logger.info( - f"๐Ÿ“ˆ Stack {stack_name}: Best score = {study.best_value:.6f} ({len(study.trials)} trials)" + f"Stack {stack_name}: Best score = {study.best_value:.6f} ({len(study.trials)} trials)" ) # Create model builders @@ -517,8 +517,8 @@ def main(): oof_predictions, data.y_full ) - logger.info(f"๐ŸŽฏ Best ensemble CV score: {best_cv_score:.6f}") - logger.info(f"โš–๏ธ Ensemble weights: {best_weights}") + logger.info(f"Best ensemble CV score: {best_cv_score:.6f}") + logger.info(f"Ensemble weights: {best_weights}") # Apply pseudo labelling using ensemble predictions enhanced_data = apply_pseudo_labelling(builders, best_weights, data) @@ -534,12 +534,12 @@ def main(): ) # Print final results - logger.info(f"\nโœ… 
Predictions saved to '{output_file}'") - logger.info(f"๐Ÿ“Š Final submission shape: {submission_df.shape}") - logger.info("๐ŸŽ‰ Six-stack ensemble pipeline completed successfully!") + logger.info(f"\nPredictions saved to '{output_file}'") + logger.info(f"Final submission shape: {submission_df.shape}") + logger.info("Six-stack ensemble pipeline completed successfully!") # Print summary - logger.info("\n๐Ÿ“‹ Summary:") + logger.info("\nSummary:") logger.info(f" - Training samples: {len(enhanced_data.X_full):,}") logger.info(f" - Test samples: {len(enhanced_data.X_test):,}") logger.info(f" - Features: {enhanced_data.X_full.shape[1]}") @@ -551,7 +551,7 @@ def main(): logger.info(" - Modular architecture") except Exception as e: - logger.error(f"โŒ Pipeline failed: {e}") + logger.error(f"Pipeline failed: {e}") raise diff --git a/src/modules/data_augmentation.py b/src/modules/data_augmentation.py index 8c504e4..d9d19be 100644 --- a/src/modules/data_augmentation.py +++ b/src/modules/data_augmentation.py @@ -65,7 +65,7 @@ def analyze_data_characteristics(X_train: pd.DataFrame, y_train: pd.Series) -> d "is_highly_categorical": categorical_ratio > 0.3, } - logger.info(f"๐Ÿ“Š Data characteristics: {characteristics}") + logger.info(f"Data characteristics: {characteristics}") return characteristics @@ -82,14 +82,14 @@ def adaptive_augmentation_selection( max_ratio = AugmentationConfig.MAX_AUGMENTATION_RATIO.value logger.info( - f" ๐Ÿ“Š Adaptive selection using config ratios: base={base_ratio}, min={min_ratio}, max={max_ratio}" + f" Adaptive selection using config ratios: base={base_ratio}, min={min_ratio}, max={max_ratio}" ) if characteristics["is_small_dataset"] and characteristics["is_imbalanced"]: # Small imbalanced dataset - use higher ratio for SMOTENC adaptive_ratio = min(base_ratio * 1.5, max_ratio) # 150% of base, capped at max logger.info( - f" ๐ŸŽฏ Small imbalanced dataset detected โ†’ SMOTENC with ratio {adaptive_ratio:.3f} (base*1.5)" + f" Small imbalanced dataset 
detected โ†’ SMOTENC with ratio {adaptive_ratio:.3f} (base*1.5)" ) return AugmentationMethod.SMOTENC, adaptive_ratio @@ -97,7 +97,7 @@ def adaptive_augmentation_selection( # High categorical ratio - use moderate ratio for SDV Copula adaptive_ratio = min(base_ratio * 0.8, max_ratio) # 80% of base logger.info( - f" ๐ŸŽฏ High categorical features detected โ†’ SDV_COPULA with ratio {adaptive_ratio:.3f} (base*0.8)" + f" High categorical features detected โ†’ SDV_COPULA with ratio {adaptive_ratio:.3f} (base*0.8)" ) return AugmentationMethod.SDV_COPULA, adaptive_ratio @@ -105,7 +105,7 @@ def adaptive_augmentation_selection( # Large balanced dataset - use conservative ratio for ensemble adaptive_ratio = max(base_ratio * 0.5, min_ratio) # 50% of base, at least min logger.info( - f" ๐ŸŽฏ Large balanced dataset detected โ†’ MIXED_ENSEMBLE with ratio {adaptive_ratio:.3f} (base*0.5)" + f" Large balanced dataset detected โ†’ MIXED_ENSEMBLE with ratio {adaptive_ratio:.3f} (base*0.5)" ) return AugmentationMethod.MIXED_ENSEMBLE, adaptive_ratio @@ -113,7 +113,7 @@ def adaptive_augmentation_selection( # Severe imbalance - use maximum ratio for class balancing adaptive_ratio = max_ratio # Use maximum configured ratio logger.info( - f" ๐ŸŽฏ Severe class imbalance detected โ†’ CLASS_BALANCED with ratio {adaptive_ratio:.3f} (max ratio)" + f" Severe class imbalance detected โ†’ CLASS_BALANCED with ratio {adaptive_ratio:.3f} (max ratio)" ) return AugmentationMethod.CLASS_BALANCED, adaptive_ratio @@ -121,7 +121,7 @@ def adaptive_augmentation_selection( # Default to SDV Copula with base ratio adaptive_ratio = base_ratio logger.info( - f" ๐ŸŽฏ Default case โ†’ SDV_COPULA with ratio {adaptive_ratio:.3f} (base ratio)" + f" Default case โ†’ SDV_COPULA with ratio {adaptive_ratio:.3f} (base ratio)" ) return AugmentationMethod.SDV_COPULA, adaptive_ratio @@ -131,7 +131,7 @@ def tvae_augmentation( ) -> tuple[pd.DataFrame, pd.Series]: """TVAE-based augmentation with better stability than CTGAN.""" if 
not SDV_AVAILABLE: - logger.warning("โš ๏ธ SDV not available, falling back to simple augmentation") + logger.warning("SDV not available, falling back to simple augmentation") return simple_mixed_augmentation(X_train, y_train, augment_ratio) try: @@ -175,7 +175,7 @@ def tvae_augmentation( return augmented_X, augmented_y except Exception as e: - logger.warning(f"โš ๏ธ TVAE failed: {e}, falling back to Copula") + logger.warning(f"TVAE failed: {e}, falling back to Copula") return sdv_augmentation(X_train, y_train, "copula", augment_ratio) @@ -196,7 +196,7 @@ def class_balanced_augmentation( samples_needed = max(0, target_minority_count - minority_count) if samples_needed == 0: - logger.info(" โœ… Classes already balanced, no augmentation needed") + logger.info(" Classes already balanced, no augmentation needed") return pd.DataFrame(), pd.Series(dtype=y_train.dtype) # Filter minority class data @@ -229,7 +229,7 @@ def class_balanced_augmentation( all_augmented_y.append(aug_y) except Exception as e: - logger.warning(f"โš ๏ธ Method {method} failed: {e}") + logger.warning(f"Method {method} failed: {e}") if all_augmented_X: combined_X = pd.concat(all_augmented_X, ignore_index=True) @@ -266,7 +266,7 @@ def mixed_ensemble_augmentation( all_augmented_y.append(aug_y) except Exception as e: - logger.warning(f"โš ๏ธ Method {method} failed: {e}") + logger.warning(f"Method {method} failed: {e}") if all_augmented_X: combined_X = pd.concat(all_augmented_X, ignore_index=True) @@ -284,7 +284,7 @@ def enhanced_quality_filtering( threshold: float = 0.75, ) -> tuple[pd.DataFrame, pd.Series]: """Multi-metric quality filtering for synthetic data.""" - logger.info(" ๐Ÿ” Enhanced quality filtering...") + logger.info(" Enhanced quality filtering...") if len(synthetic_X) == 0: return synthetic_X, synthetic_y @@ -326,7 +326,7 @@ def enhanced_quality_filtering( logger.info( f" โœจ Quality filtering: {len(filtered_X)}/{len(synthetic_X)} samples kept" ) - logger.info(f" ๐Ÿ“Š Avg quality score: 
{np.mean(quality_scores):.3f}") + logger.info(f" Avg quality score: {np.mean(quality_scores):.3f}") return filtered_X, filtered_y @@ -354,10 +354,10 @@ def diversity_check( diversity_scores.append(diversity) avg_diversity = np.mean(diversity_scores) if diversity_scores else 1.0 - is_diverse = avg_diversity >= threshold + is_diverse = bool(avg_diversity >= threshold) logger.info( - f" ๐ŸŒˆ Diversity score: {avg_diversity:.3f} ({'โœ… Pass' if is_diverse else 'โŒ Fail'})" + f" Diversity score: {avg_diversity:.3f} ({'Pass' if is_diverse else 'Fail'})" ) return is_diverse @@ -435,11 +435,11 @@ def sdv_augmentation( ) -> tuple[pd.DataFrame, pd.Series]: """High-quality synthetic data generation using SDV with improved CTGAN handling.""" if not SDV_AVAILABLE: - logger.warning("โš ๏ธ SDV not available, falling back to simple augmentation") + logger.warning("SDV not available, falling back to simple augmentation") return simple_mixed_augmentation(X_train, y_train, augment_ratio) try: - logger.info(f" ๐Ÿ”ง Using {method} synthesizer...") + logger.info(f" Using {method} synthesizer...") # Combine features and target for SDV train_data = X_train.copy() @@ -471,7 +471,7 @@ def sdv_augmentation( logger.warning( " โš ๏ธ Apple Silicon detected - CTGAN has compatibility issues" ) - logger.info(" ๐Ÿ”„ Falling back to GaussianCopula for stability...") + logger.info(" Falling back to GaussianCopula for stability...") synthesizer = GaussianCopulaSynthesizer(metadata) else: logger.info(" ๐Ÿง  Training CTGAN (slow but high quality)...") @@ -521,7 +521,7 @@ def timeout_handler(_signum, _frame): try: synthesizer.fit(train_data) training_time = time.time() - start_time - logger.info(f" โœ… Training completed in {training_time:.1f}s") + logger.info(f" Training completed in {training_time:.1f}s") finally: if method == "ctgan": signal.alarm(0) @@ -531,22 +531,22 @@ def timeout_handler(_signum, _frame): if n_synthetic == 0: return pd.DataFrame(), pd.Series(dtype=y_train.dtype) - 
logger.info(f" ๐ŸŽฒ Generating {n_synthetic} synthetic samples...") + logger.info(f" Generating {n_synthetic} synthetic samples...") synthetic_data = synthesizer.sample(num_rows=n_synthetic) # Split back to features and target synthetic_X = synthetic_data.drop("Personality", axis=1) synthetic_y = synthetic_data["Personality"] - logger.info(f" โœ… Generated {len(synthetic_X)} synthetic samples") + logger.info(f" Generated {len(synthetic_X)} synthetic samples") return synthetic_X, synthetic_y except TimeoutError: - logger.warning("โš ๏ธ Training timed out, falling back to simple augmentation") + logger.warning("Training timed out, falling back to simple augmentation") return simple_mixed_augmentation(X_train, y_train, augment_ratio) except Exception as e: logger.warning( - f"โš ๏ธ SDV augmentation failed: {e}, falling back to simple augmentation" + f"SDV augmentation failed: {e}, falling back to simple augmentation" ) return simple_mixed_augmentation(X_train, y_train, augment_ratio) @@ -557,7 +557,7 @@ def smotenc_augmentation( """SMOTE for mixed numerical/categorical data.""" if not IMBLEARN_AVAILABLE: logger.warning( - "โš ๏ธ imbalanced-learn not available, falling back to simple augmentation" + "imbalanced-learn not available, falling back to simple augmentation" ) return simple_mixed_augmentation(X_train, y_train, augment_ratio) @@ -600,7 +600,7 @@ def smotenc_augmentation( except Exception as e: logger.warning( - f"โš ๏ธ SMOTENC augmentation failed: {e}, falling back to simple augmentation" + f"SMOTENC augmentation failed: {e}, falling back to simple augmentation" ) return simple_mixed_augmentation(X_train, y_train, augment_ratio) @@ -613,7 +613,7 @@ def apply_data_augmentation( logger.info("๐Ÿ“Š Data augmentation disabled") return X_train, y_train - logger.info("๐Ÿ“Š Applying adaptive data augmentation...") + logger.info("Applying adaptive data augmentation...") original_shape = X_train.shape # Analyze data characteristics @@ -623,13 +623,13 @@ def 
apply_data_augmentation( if AugmentationConfig.AUGMENTATION_METHOD.value == AugmentationMethod.ADAPTIVE: method, augment_ratio = adaptive_augmentation_selection(characteristics) logger.info( - f" ๐ŸŽฏ Auto-selected: {method.value} with ratio {augment_ratio:.3f}" + f" Auto-selected: {method.value} with ratio {augment_ratio:.3f}" ) else: method = AugmentationConfig.AUGMENTATION_METHOD.value augment_ratio = AugmentationConfig.AUGMENTATION_RATIO.value logger.info( - f" ๐ŸŽฏ Using configured: {method.value} with ratio {augment_ratio:.3f}" + f" Using configured: {method.value} with ratio {augment_ratio:.3f}" ) # Apply augmentation with timeout @@ -677,12 +677,12 @@ def timeout_handler(_signum, _frame): except TimeoutError: logger.warning( - f"โš ๏ธ Augmentation timeout after {AugmentationConfig.MAX_AUGMENTATION_TIME_SECONDS.value}s, using SMOTENC fallback" + f"Augmentation timeout after {AugmentationConfig.MAX_AUGMENTATION_TIME_SECONDS.value}s, using SMOTENC fallback" ) augmented_X, augmented_y = smotenc_augmentation(X_train, y_train, 0.03) signal.alarm(0) except Exception as e: - logger.warning(f"โš ๏ธ Augmentation failed: {e}, using original data") + logger.warning(f"Augmentation failed: {e}, using original data") signal.alarm(0) return X_train, y_train @@ -704,7 +704,7 @@ def timeout_handler(_signum, _frame): if AugmentationConfig.ENABLE_DIVERSITY_CHECK.value and not diversity_check( X_train, augmented_X, AugmentationConfig.DIVERSITY_THRESHOLD.value ): - logger.warning(" โš ๏ธ Low diversity detected, reducing synthetic samples") + logger.warning(" Low diversity detected, reducing synthetic samples") # Keep only top 50% most diverse samples keep_ratio = 0.5 keep_count = int(len(augmented_X) * keep_ratio) @@ -717,18 +717,18 @@ def timeout_handler(_signum, _frame): y_combined = pd.concat([y_train, augmented_y], ignore_index=True) logger.info( - f" โœ… Added {len(augmented_X)} high-quality synthetic samples" + f" Added {len(augmented_X)} high-quality synthetic samples" 
) - logger.info(f" ๐Ÿ“ˆ Data shape: {original_shape} โ†’ {X_combined.shape}") + logger.info(f" Data shape: {original_shape} โ†’ {X_combined.shape}") # Log class balance improvement orig_balance = y_train.value_counts().min() / y_train.value_counts().max() new_balance = ( y_combined.value_counts().min() / y_combined.value_counts().max() ) - logger.info(f" โš–๏ธ Class balance: {orig_balance:.3f} โ†’ {new_balance:.3f}") + logger.info(f" Class balance: {orig_balance:.3f} โ†’ {new_balance:.3f}") return X_combined, y_combined - logger.warning("โš ๏ธ No synthetic samples generated, using original data") + logger.warning("No synthetic samples generated, using original data") return X_train, y_train diff --git a/src/modules/data_loader.py b/src/modules/data_loader.py index 07aba89..57167f3 100644 --- a/src/modules/data_loader.py +++ b/src/modules/data_loader.py @@ -16,7 +16,7 @@ def load_data_with_external_merge(): Returns: tuple: (df_tr, df_te, submission) - training data, test data, and submission template """ - logger.info("๐Ÿ“Š Loading data with advanced merge strategy...") + logger.info("Loading data with advanced merge strategy...") # Use Paths enum from config.py for all file paths df_tr = pd.read_csv(Paths.TRAIN_CSV.value) @@ -62,10 +62,10 @@ def load_data_with_external_merge(): test_matches = df_te["match_p"].notna().sum() logger.info( - f"โœ… Successfully matched {train_matches}/{len(df_tr)} training samples with external data" + f"Successfully matched {train_matches}/{len(df_tr)} training samples with external data" ) logger.info( - f"โœ… Successfully matched {test_matches}/{len(df_te)} test samples with external data" + f"Successfully matched {test_matches}/{len(df_te)} test samples with external data" ) # Print match distribution for training data @@ -77,10 +77,10 @@ def load_data_with_external_merge(): except FileNotFoundError: logger.warning( - "โš ๏ธ personality_datasert.csv not found, adding empty match_p column" + "personality_datasert.csv not found, 
adding empty match_p column" ) df_tr["match_p"] = None df_te["match_p"] = None - logger.info("โœ… Data loading with external merge completed") + logger.info("Data loading with external merge completed") return df_tr, df_te, submission diff --git a/src/modules/preprocessing.py b/src/modules/preprocessing.py index 980d683..5aea9e8 100644 --- a/src/modules/preprocessing.py +++ b/src/modules/preprocessing.py @@ -28,7 +28,7 @@ def prep( Returns: Tuple of (X_train, X_test, y_train, label_encoder) """ - logger.info("๐Ÿ”ง Preprocessing data with advanced competitive approach...") + logger.info("Preprocessing data with advanced competitive approach...") # Define feature groups before any processing # Keep original column types for proper categorization @@ -52,7 +52,7 @@ def prep( df_te = df_te.drop(columns=[idx]) # Use advanced correlation-based imputation - logger.info("๐Ÿ”„ Performing TOP-4 correlation-based imputation...") + logger.info("Performing TOP-4 correlation-based imputation...") # Extract and encode target variable BEFORE combining data le_tgt = LabelEncoder() @@ -143,7 +143,7 @@ def fill_missing_by_quantile_group( all_data[col] = all_data[col].fillna("Unknown") # Apply one-hot encoding for categorical features using OneHotEncoder (advanced approach) - logger.info("๐Ÿ”„ Applying one-hot encoding for categorical features...") + logger.info("Applying one-hot encoding for categorical features...") # Identify categorical columns that exist in the data existing_categorical_cols = [ @@ -173,13 +173,13 @@ def fill_missing_by_quantile_group( all_data = pd.concat([all_data, encoded_df], axis=1) logger.info( - f" โœ… Encoded {len(existing_categorical_cols)} categorical features into {len(feature_names)} binary features" + f" Encoded {len(existing_categorical_cols)} categorical features into {len(feature_names)} binary features" ) else: - logger.warning(" โš ๏ธ No categorical features found to encode") + logger.warning(" No categorical features found to encode") # Fill 
any remaining missing values - logger.info("๐Ÿ”„ Filling any remaining missing values...") + logger.info("Filling any remaining missing values...") # For numerical columns, use median num_cols = all_data.select_dtypes(include=[np.number]).columns @@ -200,7 +200,7 @@ def fill_missing_by_quantile_group( logger.info(f"Final train shape: {df_tr.shape}") logger.info(f"Final test shape: {df_te.shape}") - logger.info("โœ… Preprocessing completed with advanced competitive approach") + logger.info("Preprocessing completed with advanced competitive approach") return df_tr, df_te, ytr, le_tgt @@ -259,13 +259,13 @@ def add_pseudo_labeling_conservative( n_high_conf = np.sum(high_conf_mask) max_pseudo_samples = int(len(X_full) * max_pseudo_ratio) - logger.info(f" ๐Ÿ“Š Found {n_high_conf} high-confidence predictions") - logger.info(f" ๐Ÿ“ Maximum allowed pseudo-samples: {max_pseudo_samples}") + logger.info(f" Found {n_high_conf} high-confidence predictions") + logger.info(f" Maximum allowed pseudo-samples: {max_pseudo_samples}") if n_high_conf > 0: # Limit pseudo-samples to avoid overfitting if n_high_conf > max_pseudo_samples: - logger.info(f" โœ‚๏ธ Limiting to {max_pseudo_samples} most confident samples") + logger.info(f" Limiting to {max_pseudo_samples} most confident samples") # Get confidence scores and select most confident samples conf_scores = np.maximum(ensemble_proba, 1 - ensemble_proba) high_conf_indices = np.where(high_conf_mask)[0] @@ -299,16 +299,16 @@ def add_pseudo_labeling_conservative( "final_size": len(X_combined), } - logger.info(f" โœ… Added {len(y_pseudo)} pseudo-labels to training data") + logger.info(f" Added {len(y_pseudo)} pseudo-labels to training data") logger.info( - f" ๐Ÿ“Š Pseudo-label distribution: Class 0: {pseudo_stats['pseudo_class_0']}, Class 1: {pseudo_stats['pseudo_class_1']}" + f" Pseudo-label distribution: Class 0: {pseudo_stats['pseudo_class_0']}, Class 1: {pseudo_stats['pseudo_class_1']}" ) - logger.info(f" ๐ŸŽฏ Mean confidence: 
{pseudo_stats['mean_confidence']:.4f}") - logger.info(f" 📈 Training data: {len(X_full)} → {len(X_combined)} samples") + logger.info(f" Mean confidence: {pseudo_stats['mean_confidence']:.4f}") + logger.info(f" Training data: {len(X_full)} → {len(X_combined)} samples") return X_combined, y_combined, pseudo_stats else: - logger.info(" ⚠️ No high-confidence predictions found") + logger.info(" No high-confidence predictions found") pseudo_stats = { "n_pseudo_added": 0, "original_size": len(X_full), @@ -340,7 +340,7 @@ def create_domain_balanced_dataset( Returns: Tuple of (combined_dataframe, sample_weights) """ - logger.info("🎯 Computing domain weights for distribution alignment...") + logger.info("Computing domain weights for distribution alignment...") # Combine dataframes with domain labels combined_data = [] @@ -440,7 +440,7 @@ def create_domain_balanced_dataset( weights[domain_mask] = domain_weights / (np.mean(domain_weights) + 1e-8) # Print summary - logger.info("📊 Domain weighting summary:") + logger.info("Domain weighting summary:") logger.info(" Reference domain: 0 (first dataframe)") for domain_idx in range(len(dataframes)): domain_mask = domain_labels == domain_idx @@ -481,7 +481,7 @@ def create_domain_balanced_dataset( total_count = np.sum(domain_mask) if removed_count > 0: logger.info( - f" 🚫 Filtered {removed_count}/{total_count} ({removed_count / total_count * 100:.1f}%) " + f" Filtered {removed_count}/{total_count} ({removed_count / total_count * 100:.1f}%) " f"low-quality samples from domain {domain_idx}" ) @@ -490,7 +490,7 @@ def create_domain_balanced_dataset( weights = weights[keep_mask] logger.info( - f" ✅ Kept {len(combined_df)} high-quality samples after filtering" + f" Kept {len(combined_df)} high-quality samples after filtering" ) logger.info(f"Created domain-balanced dataset with {len(combined_df)} samples") diff --git a/tests/README.md b/tests/README.md index 51504a8..6f86cb1 100644 --- a/tests/README.md +++
b/tests/README.md @@ -2,7 +2,7 @@ This comprehensive testing framework covers all components of the personality classification pipeline including data processing, model building, and MLOps infrastructure. -## ๐Ÿ“ Test Structure +## Test Structure ``` tests/ @@ -18,7 +18,7 @@ tests/ โ””โ”€โ”€ fixtures/ # Test data and fixtures ``` -## ๐Ÿงช Test Categories +## Test Categories ### **Unit Tests** (`@pytest.mark.unit`) - Test individual functions and classes in isolation @@ -35,9 +35,9 @@ tests/ - Large dataset processing - Model training with multiple iterations -## ๐Ÿš€ Running Tests +## Running Tests -### **Quick Start** +### Quick Start ```bash # Run all tests python run_tests.py @@ -49,7 +49,7 @@ python run_tests.py --type all python run_tests.py --type fast ``` -### **Test Categories** +### Test Categories ```bash # Run only unit tests python run_tests.py --type unit @@ -64,7 +64,7 @@ python run_tests.py --type modules python run_tests.py --type mlops ``` -### **Specific Test Execution** +### Specific Test Execution ```bash # Run specific test file python run_tests.py --test tests/modules/test_data_loader.py @@ -76,7 +76,7 @@ python run_tests.py --test tests/modules/test_data_loader.py::TestDataLoader python run_tests.py --test tests/modules/test_data_loader.py::TestDataLoader::test_init ``` -### **Coverage Options** +### Coverage Options ```bash # Run without coverage (faster) python run_tests.py --no-coverage @@ -85,7 +85,7 @@ python run_tests.py --no-coverage python run_tests.py --verbose ``` -### **Direct Pytest Usage** +### Direct Pytest Usage ```bash # Run with pytest directly pytest tests/ @@ -97,9 +97,9 @@ pytest -m "unit and not slow" tests/ pytest --cov=src --cov-report=html tests/ ``` -## ๐Ÿ”ง Configuration +## Configuration -### **pytest.ini** +### pytest.ini ```ini [tool.pytest.ini_options] minversion = "6.0" @@ -121,7 +121,7 @@ markers = [ ] ``` -## ๐ŸŽฏ Test Fixtures +## Test Fixtures The test suite includes comprehensive fixtures for different 
testing scenarios: @@ -139,7 +139,7 @@ The test suite includes comprehensive fixtures for different testing scenarios: - `assert_data_shape()`: Check DataFrame dimensions - `assert_no_missing_values()`: Verify data quality -## ๐Ÿ“Š Coverage Reports +## Coverage Reports Test coverage reports are generated in multiple formats: @@ -162,7 +162,7 @@ Open `htmlcov/index.html` in your browser for detailed coverage analysis. ### **XML Report** `coverage.xml` for CI/CD integration. -## ๐Ÿ” Test Examples +## Test Examples ### **Data Processing Tests** ```python @@ -193,9 +193,9 @@ def test_train_model(self, sample_data): assert hasattr(trained_model, "predict") ``` -## ๐Ÿ› Debugging Tests +## Debugging Tests -### **Running in Debug Mode** +### Running in Debug Mode ```bash # Run with verbose output and show local variables pytest -vvv --tb=long tests/ @@ -204,7 +204,7 @@ pytest -vvv --tb=long tests/ pytest -s tests/modules/test_data_loader.py::TestDataLoader::test_init ``` -### **Using Print Statements** +### Using Print Statements ```python def test_debug_example(self, sample_data): print(f"Data shape: {sample_data.shape}") @@ -212,14 +212,14 @@ def test_debug_example(self, sample_data): # ... test logic ``` -### **Using Debugger** +### Using Debugger ```python def test_with_debugger(self, sample_data): import pdb; pdb.set_trace() # ... test logic ``` -## ๐Ÿ”„ Continuous Integration +## Continuous Integration ### **GitHub Actions Example** ```yaml @@ -248,28 +248,28 @@ jobs: uses: codecov/codecov-action@v1 ``` -## ๐Ÿ“ Best Practices +## Best Practices -### **Writing Tests** -1. **Descriptive Names**: Use clear, descriptive test names -2. **Single Responsibility**: Each test should verify one specific behavior -3. **Independent Tests**: Tests should not depend on each other -4. **Use Fixtures**: Leverage pytest fixtures for setup and teardown -5. **Mock External Dependencies**: Use mocks for external services +### Writing Tests +1. 
Descriptive Names: Use clear, descriptive test names +2. Single Responsibility: Each test should verify one specific behavior +3. Independent Tests: Tests should not depend on each other +4. Use Fixtures: Leverage pytest fixtures for setup and teardown +5. Mock External Dependencies: Use mocks for external services -### **Test Organization** -1. **Group Related Tests**: Use test classes to group related functionality -2. **Use Markers**: Tag tests appropriately for selective execution -3. **Parametrize Tests**: Use `@pytest.mark.parametrize` for multiple scenarios -4. **Document Complex Tests**: Add docstrings explaining test purpose +### Test Organization +1. Group Related Tests: Use test classes to group related functionality +2. Use Markers: Tag tests appropriately for selective execution +3. Parametrize Tests: Use `@pytest.mark.parametrize` for multiple scenarios +4. Document Complex Tests: Add docstrings explaining test purpose -### **Performance** -1. **Fast Unit Tests**: Keep unit tests fast and focused -2. **Mark Slow Tests**: Use `@pytest.mark.slow` for time-consuming tests -3. **Use Smaller Datasets**: Create minimal datasets for testing -4. **Parallel Execution**: Consider pytest-xdist for parallel test execution +### Performance +1. Fast Unit Tests: Keep unit tests fast and focused +2. Mark Slow Tests: Use `@pytest.mark.slow` for time-consuming tests +3. Use Smaller Datasets: Create minimal datasets for testing +4. 
Parallel Execution: Consider pytest-xdist for parallel test execution -## 🛠️ Dependencies +## Dependencies The testing framework requires: @@ -290,15 +290,15 @@ dash>=2.14.0 pytest-xdist>=3.0.0 ``` -## 📈 Metrics and Reporting +## Metrics and Reporting -### **Test Metrics** -- **Test Count**: Total number of tests -- **Pass Rate**: Percentage of passing tests -- **Coverage**: Code coverage percentage -- **Execution Time**: Test suite runtime +### Test Metrics +- Test Count: Total number of tests +- Pass Rate: Percentage of passing tests +- Coverage: Code coverage percentage +- Execution Time: Test suite runtime -### **Quality Gates** +### Quality Gates - Minimum 90% code coverage - All tests must pass - No critical security vulnerabilities @@ -306,7 +306,7 @@ pytest-xdist>=3.0.0 --- -## 🚀 Quick Commands Reference +## Quick Commands Reference ```bash # Essential commands From c6241e0411203f96759a7aa2db9c60181068317b Mon Sep 17 00:00:00 2001 From: Jeremy Vachier <89128100+jvachier@users.noreply.github.com> Date: Sun, 30 Nov 2025 12:39:28 +0100 Subject: [PATCH 2/2] Fixing linting issue.
--- src/main_modular.py | 4 +--- src/modules/data_augmentation.py | 8 ++------ src/modules/preprocessing.py | 4 +--- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/src/main_modular.py b/src/main_modular.py index 42ab466..ae28a0d 100755 --- a/src/main_modular.py +++ b/src/main_modular.py @@ -464,9 +464,7 @@ def apply_pseudo_labelling( # Create new TrainingData with pseudo labels added if pseudo_stats["n_pseudo_added"] > 0: - logger.info( - f"Pseudo labelling added {pseudo_stats['n_pseudo_added']} samples" - ) + logger.info(f"Pseudo labelling added {pseudo_stats['n_pseudo_added']} samples") # Create new TrainingData object with enhanced training set enhanced_data = TrainingData( diff --git a/src/modules/data_augmentation.py b/src/modules/data_augmentation.py index d9d19be..2ee48ff 100644 --- a/src/modules/data_augmentation.py +++ b/src/modules/data_augmentation.py @@ -622,9 +622,7 @@ def apply_data_augmentation( # Select optimal method and ratio if AugmentationConfig.AUGMENTATION_METHOD.value == AugmentationMethod.ADAPTIVE: method, augment_ratio = adaptive_augmentation_selection(characteristics) - logger.info( - f" Auto-selected: {method.value} with ratio {augment_ratio:.3f}" - ) + logger.info(f" Auto-selected: {method.value} with ratio {augment_ratio:.3f}") else: method = AugmentationConfig.AUGMENTATION_METHOD.value augment_ratio = AugmentationConfig.AUGMENTATION_RATIO.value @@ -716,9 +714,7 @@ def timeout_handler(_signum, _frame): X_combined = pd.concat([X_train, augmented_X], ignore_index=True) y_combined = pd.concat([y_train, augmented_y], ignore_index=True) - logger.info( - f" Added {len(augmented_X)} high-quality synthetic samples" - ) + logger.info(f" Added {len(augmented_X)} high-quality synthetic samples") logger.info(f" Data shape: {original_shape} → {X_combined.shape}") # Log class balance improvement diff --git a/src/modules/preprocessing.py b/src/modules/preprocessing.py index 5aea9e8..282913e 100644 --- a/src/modules/preprocessing.py
+++ b/src/modules/preprocessing.py @@ -489,9 +489,7 @@ def create_domain_balanced_dataset( combined_df = combined_df[keep_mask].reset_index(drop=True) weights = weights[keep_mask] - logger.info( - f" Kept {len(combined_df)} high-quality samples after filtering" - ) + logger.info(f" Kept {len(combined_df)} high-quality samples after filtering") logger.info(f"Created domain-balanced dataset with {len(combined_df)} samples") logger.info(