From 8a20f75ddeef26ad5978afbd1c9ac7815364d045 Mon Sep 17 00:00:00 2001
From: Jeremy Vachier <89128100+jvachier@users.noreply.github.com>
Date: Sun, 30 Nov 2025 12:31:42 +0100
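Subject: [PATCH 1/2] Remove emoji from log messages, docs, and Makefile output

Strip decorative emoji from logger calls, Makefile echo lines, and the
Markdown documentation. Also fill in the LICENSE copyright line and
coerce the diversity_check result to a built-in bool.

Some emoji-prefixed log messages in src/modules/data_augmentation.py
(e.g. the CTGAN and "Data augmentation disabled" messages) are left
unchanged by this patch.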

---
 LICENSE                          |  2 +-
 Makefile                         | 22 ++++----
 README.md                        | 42 +++++++--------
 docs/README.md                   |  2 +-
 docs/data-augmentation.md        |  4 +-
 scripts/train_and_save_models.py |  6 +--
 src/main_modular.py              | 46 ++++++++---------
 src/modules/data_augmentation.py | 72 +++++++++++++-------------
 src/modules/data_loader.py       | 10 ++--
 src/modules/preprocessing.py     | 38 +++++++-------
 tests/README.md                  | 88 ++++++++++++++++----------------
 11 files changed, 166 insertions(+), 166 deletions(-)

diff --git a/LICENSE b/LICENSE
index 261eeb9..41e33b1 100644
--- a/LICENSE
+++ b/LICENSE
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2025 Jeremy Vachier
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/Makefile b/Makefile
index ed048aa..03eca9b 100644
--- a/Makefile
+++ b/Makefile
@@ -25,53 +25,53 @@ help:
 
 # Dependency management
 install:
-	@echo "📦 Installing dependencies with uv..."
+	@echo "Installing dependencies with uv..."
 	uv sync --all-extras
 
 # Code quality with Ruff
 format:
-	@echo "🎨 Formatting code with ruff..."
+	@echo "Formatting code with ruff..."
 	uv run ruff format src/ dash_app/ tests/ scripts/
 
 lint:
-	@echo "🔍 Linting code with ruff..."
+	@echo "Linting code with ruff..."
 	uv run ruff check . --fix
 	uv run ruff format --check .
 
 # Type checking
 typecheck:
-	@echo "🔎 Type checking with mypy..."
+	@echo "Type checking with mypy..."
 	uv run mypy src/ --ignore-missing-imports
 
 # Security checking
 security:
-	@echo "🔒 Security checking with bandit..."
+	@echo "Security checking with bandit..."
 	uv run bandit -r src/ -f json
 
 # Run all quality checks
 check-all: lint typecheck security
-	@echo "✅ All code quality checks completed!"
+	@echo "All code quality checks completed!"
 
 # Testing
 test:
-	@echo "🧪 Running tests..."
+	@echo "Running tests..."
 	uv run pytest tests/ -v
 
 # Pipeline execution
 run:
-	@echo "🚀 Running modular pipeline..."
+	@echo "Running modular pipeline..."
 	uv run python src/main_modular.py
 
 # Model training
 train-models:
-	@echo "🤖 Training and saving ML models..."
+	@echo "Training and saving ML models..."
 	uv run python scripts/train_and_save_models.py
 
 # Dash application
 dash:
-	@echo "📊 Starting Dash application..."
+	@echo "Starting Dash application..."
 	uv run python dash_app/main.py --model-name ensemble
 
 stop-dash:
-	@echo "🛑 Stopping Dash application..."
+	@echo "Stopping Dash application..."
 	@lsof -ti:8050 | xargs kill -9 2>/dev/null || echo "No process found on port 8050"
diff --git a/README.md b/README.md
index 9f017f1..d3543fc 100644
--- a/README.md
+++ b/README.md
@@ -63,18 +63,18 @@ uv run python src/main_modular.py   # Run pipeline
 
 ```
 src/
-├── main_modular.py                 # 🎯 Main production pipeline (MLOps-enhanced)
-├── modules/                        # 🧩 Core modules
-│   ├── config.py                   # ⚙️ Configuration & logging
-│   ├── data_loader.py              # 📊 Data loading & external merge
-│   ├── preprocessing.py            # 🔧 Feature engineering
-│   ├── data_augmentation.py        # 🎲 Advanced synthetic data
-│   ├── model_builders.py           # 🏭 Model stack construction
-│   ├── ensemble.py                 # 🎯 Ensemble & OOF predictions
-│   ├── optimization.py             # 🔍 Optuna utilities
-│   └── utils.py                    # 🛠️ Utility functions
-
-dash_app/                           # 🖥️ Interactive Dashboard
+├── main_modular.py                 # Main production pipeline (MLOps-enhanced)
+├── modules/                        # Core modules
+│   ├── config.py                   # Configuration & logging
+│   ├── data_loader.py              # Data loading & external merge
+│   ├── preprocessing.py            # Feature engineering
+│   ├── data_augmentation.py        # Advanced synthetic data
+│   ├── model_builders.py           # Model stack construction
+│   ├── ensemble.py                 # Ensemble & OOF predictions
+│   ├── optimization.py             # Optuna utilities
+│   └── utils.py                    # Utility functions
+
+dash_app/                           # Interactive Dashboard
 ├── dashboard/                            # Application source
 │   ├── app.py                      # Main Dash application
 │   ├── layout.py                   # UI layout components
@@ -84,21 +84,21 @@ dash_app/                           # 🖥️ Interactive Dashboard
 ├── Dockerfile                      # Container configuration
 └── docker-compose.yml             # Multi-service orchestration
 
-models/                             # 🤖 Trained Models
+models/                             # Trained Models
 ├── ensemble_model.pkl              # Production ensemble model
 ├── ensemble_metadata.json         # Model metadata and labels
 ├── stack_*_model.pkl              # Individual stack models
 └── stack_*_metadata.json          # Stack-specific metadata
 
-scripts/                            # 🛠️ Utility Scripts
+scripts/                            # Utility Scripts
 └── train_and_save_models.py        # Model training and persistence
 
-data/                               # 📊 Datasets
+data/                               # Datasets
 
-docs/                               # 📝 Documentation
+docs/                               # Documentation
 └── [Generated documentation]       # Technical guides
 
-best_params/                        # 💾 Optimized parameters
+best_params/                        # Optimized parameters
 └── stack_*_best_params.json        # Per-stack best parameters
 ```
 
@@ -231,7 +231,7 @@ The pipeline employs six specialized ensemble stacks, each optimized for differe
 The pipeline is designed to achieve high accuracy through ensemble learning and advanced optimization techniques. Performance will vary based on:
 
 ```
-📊 Dataset Statistics
+Dataset Statistics
 ├── Training Samples: ~18,000+ (with augmentation)
 ├── Test Samples: ~6,000+
 ├── Original Features: 8 personality dimensions
@@ -239,11 +239,11 @@ The pipeline is designed to achieve high accuracy through ensemble learning and
 ├── Augmented Samples: Variable (adaptive, typically 5-10%)
 └── Class Balance: Extrovert/Introvert classification
 
-🔧 Technical Specifications
+Technical Specifications
 ├── Memory Usage: <4GB peak (configurable)
 ├── CPU Utilization: 4 cores (configurable)
-├── Model Persistence: ✅ Best parameters saved
-└── Reproducibility: ✅ Fixed random seeds
+├── Model Persistence: Yes - Best parameters saved
+└── Reproducibility: Yes - Fixed random seeds
 ```
 
 ## Testing & Validation
diff --git a/docs/README.md b/docs/README.md
index ac7fdb7..7535805 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -40,7 +40,7 @@ docker build -t personality-classifier .
 docker run -p 8080:8080 personality-classifier
 ```
 
-## 📚 Resources
+## Resources
 
 - Code: `src/main_modular.py`, `examples/`
 - Config templates: [Configuration Guide](configuration.md)
diff --git a/docs/data-augmentation.md b/docs/data-augmentation.md
index b437964..582bc3f 100644
--- a/docs/data-augmentation.md
+++ b/docs/data-augmentation.md
@@ -397,14 +397,14 @@ def calculate_adaptive_ratio(data_characteristics):
 
 ### When to Use Augmentation
 
-✅ **Recommended**:
+**Recommended**:
 
 - Small to medium datasets (<10K samples)
 - Class imbalanced problems
 - High-stakes applications requiring robustness
 - When overfitting is detected
 
-❌ **Not Recommended**:
+**Not Recommended**:
 
 - Very large datasets (>100K samples)
 - When computational resources are limited
diff --git a/scripts/train_and_save_models.py b/scripts/train_and_save_models.py
index aa344e7..fd1d883 100755
--- a/scripts/train_and_save_models.py
+++ b/scripts/train_and_save_models.py
@@ -174,14 +174,14 @@ def main():
     setup_logging()
     logger = get_logger(__name__)
 
-    logger.info("🚀 Starting model training and saving process...")
+    logger.info("Starting model training and saving process...")
 
     # Create models directory
     models_dir = Path("models")
     models_dir.mkdir(exist_ok=True)
 
     # Load and prepare data
-    logger.info("📊 Loading and preparing data...")
+    logger.info("Loading and preparing data...")
     df_tr, df_te, submission = load_data_with_external_merge()
 
     # Preprocess data (prep function expects target column in df_tr)
@@ -215,7 +215,7 @@ def main():
     except Exception as e:
         logger.error(f"Failed to train ensemble model: {e}")
 
-    logger.info("✅ Model training and saving complete!")
+    logger.info("Model training and saving complete!")
     logger.info(f"Models saved in: {models_dir.absolute()}")
 
 
diff --git a/src/main_modular.py b/src/main_modular.py
index f92b43e..42ab466 100755
--- a/src/main_modular.py
+++ b/src/main_modular.py
@@ -88,7 +88,7 @@ def load_and_prepare_data(
     testing_mode: bool = True, test_size: int = 1000
 ) -> TrainingData:
     """Load and prepare training data."""
-    logger.info("🎯 Six-Stack Personality Classification Pipeline (Modular)")
+    logger.info("Six-Stack Personality Classification Pipeline (Modular)")
     logger.info("=" * 60)
 
     # Load data using advanced merge strategy
@@ -97,11 +97,11 @@ def load_and_prepare_data(
     # FOR TESTING: Limit to specified samples for faster execution
     if testing_mode and len(df_tr) > test_size:
         logger.info(
-            f"🔬 TESTING MODE: Limiting dataset to {test_size} samples "
+            f"TESTING MODE: Limiting dataset to {test_size} samples "
             f"(original: {len(df_tr)})"
         )
         df_tr = df_tr.sample(n=test_size, random_state=RND).reset_index(drop=True)
-        logger.info(f"   📊 Using {len(df_tr)} samples for testing")
+        logger.info(f"   Using {len(df_tr)} samples for testing")
 
     # Preprocess data with advanced competitive approach (do this first)
     X_full, X_test, y_full, le = prep(df_tr, df_te)
@@ -235,7 +235,7 @@ def train_single_stack(config: StackConfig, data: TrainingData) -> optuna.Study:
 
 def train_all_stacks(data: TrainingData) -> dict[str, optuna.Study]:
     """Train all stacks in the ensemble."""
-    logger.info("\n🔍 Training 6 specialized stacks...")
+    logger.info("\nTraining 6 specialized stacks...")
 
     stack_configs = get_stack_configurations()
     studies = {}
@@ -250,7 +250,7 @@ def create_model_builders(
     studies: dict[str, optuna.Study], data: TrainingData
 ) -> dict[str, Callable[[], Any]]:
     """Create model builder functions for each stack."""
-    logger.info("\n📊 Creating model builders for ensemble...")
+    logger.info("\nCreating model builders for ensemble...")
 
     builders = {
         "A": lambda: build_stack(studies["A"].best_trial, seed=RND, wide_hp=False),
@@ -274,7 +274,7 @@ def generate_oof_predictions(
     builders: dict[str, Callable[[], Any]], data: TrainingData
 ) -> dict[str, pd.Series]:
     """Generate out-of-fold predictions for all stacks."""
-    logger.info("\n🔮 Generating out-of-fold predictions...")
+    logger.info("\nGenerating out-of-fold predictions...")
 
     oof_predictions = {}
 
@@ -325,7 +325,7 @@ def optimize_ensemble_blending(
     oof_predictions: dict[str, pd.Series], y_full: pd.Series
 ) -> tuple[dict[str, float], float]:
     """Optimize ensemble blending weights."""
-    logger.info("\n⚖️ Optimizing ensemble blending...")
+    logger.info("\nOptimizing ensemble blending...")
 
     study_blend = optuna.create_study(direction="maximize")
     blend_objective = create_blend_objective(oof_predictions, y_full)
@@ -345,7 +345,7 @@ def optimize_ensemble_blending(
         "F": best_weights_list[5],
     }
 
-    logger.info("\n🏆 Best ensemble weights:")
+    logger.info("\nBest ensemble weights:")
     for stack_name, weight in best_weights.items():
         logger.info(f"   Stack {stack_name}: {weight:.3f}")
     logger.info(f"Best CV score: {study_blend.best_value:.6f}")
@@ -377,7 +377,7 @@ def refit_and_predict(
     models["F"].fit(data.X_full, y_full_noisy)
 
     # Generate final predictions
-    logger.info("\n🎯 Generating final predictions...")
+    logger.info("\nGenerating final predictions...")
     probabilities = {}
     for stack_name in ["A", "B", "C", "D", "E", "F"]:
         probabilities[stack_name] = models[stack_name].predict_proba(data.X_test)[:, 1]
@@ -408,11 +408,11 @@ def apply_pseudo_labelling(
 ) -> TrainingData:
     """Apply pseudo labelling using ensemble predictions."""
     if not ENABLE_PSEUDO_LABELLING:
-        logger.info("🔮 Pseudo labelling disabled")
+        logger.info("Pseudo labelling disabled")
         return data
 
     logger.info(
-        f"\n🔮 Applying pseudo labelling (threshold={PSEUDO_CONFIDENCE_THRESHOLD}, max_ratio={PSEUDO_MAX_RATIO})..."
+        f"\nApplying pseudo labelling (threshold={PSEUDO_CONFIDENCE_THRESHOLD}, max_ratio={PSEUDO_MAX_RATIO})..."
     )
 
     # First train models to get test predictions for pseudo labelling
@@ -465,7 +465,7 @@ def apply_pseudo_labelling(
     # Create new TrainingData with pseudo labels added
     if pseudo_stats["n_pseudo_added"] > 0:
         logger.info(
-            f"✅ Pseudo labelling added {pseudo_stats['n_pseudo_added']} samples"
+            f"Pseudo labelling added {pseudo_stats['n_pseudo_added']} samples"
         )
 
         # Create new TrainingData object with enhanced training set
@@ -478,14 +478,14 @@ def apply_pseudo_labelling(
         )
         return enhanced_data
     else:
-        logger.info("⚠️ No pseudo labels added, using original data")
+        logger.info("No pseudo labels added, using original data")
         return data
 
 
 def main():
     """Main execution function for the Six-Stack Personality Classification Pipeline."""
 
-    logger.info("🚀 Starting Six-Stack Personality Classification Pipeline")
+    logger.info("Starting Six-Stack Personality Classification Pipeline")
 
     try:
         # Load and prepare data
@@ -494,7 +494,7 @@ def main():
         )
 
         logger.info(
-            f"📊 Loaded data: {len(data.X_full)} training samples, {len(data.X_test)} test samples"
+            f"Loaded data: {len(data.X_full)} training samples, {len(data.X_test)} test samples"
         )
 
         # Train all stacks
@@ -503,7 +503,7 @@ def main():
         # Log stack optimization results
         for stack_name, study in studies.items():
             logger.info(
-                f"📈 Stack {stack_name}: Best score = {study.best_value:.6f} ({len(study.trials)} trials)"
+                f"Stack {stack_name}: Best score = {study.best_value:.6f} ({len(study.trials)} trials)"
             )
 
         # Create model builders
@@ -517,8 +517,8 @@ def main():
             oof_predictions, data.y_full
         )
 
-        logger.info(f"🎯 Best ensemble CV score: {best_cv_score:.6f}")
-        logger.info(f"⚖️ Ensemble weights: {best_weights}")
+        logger.info(f"Best ensemble CV score: {best_cv_score:.6f}")
+        logger.info(f"Ensemble weights: {best_weights}")
 
         # Apply pseudo labelling using ensemble predictions
         enhanced_data = apply_pseudo_labelling(builders, best_weights, data)
@@ -534,12 +534,12 @@ def main():
         )
 
         # Print final results
-        logger.info(f"\n✅ Predictions saved to '{output_file}'")
-        logger.info(f"📊 Final submission shape: {submission_df.shape}")
-        logger.info("🎉 Six-stack ensemble pipeline completed successfully!")
+        logger.info(f"\nPredictions saved to '{output_file}'")
+        logger.info(f"Final submission shape: {submission_df.shape}")
+        logger.info("Six-stack ensemble pipeline completed successfully!")
 
         # Print summary
-        logger.info("\n📋 Summary:")
+        logger.info("\nSummary:")
         logger.info(f"   - Training samples: {len(enhanced_data.X_full):,}")
         logger.info(f"   - Test samples: {len(enhanced_data.X_test):,}")
         logger.info(f"   - Features: {enhanced_data.X_full.shape[1]}")
@@ -551,7 +551,7 @@ def main():
         logger.info("   - Modular architecture")
 
     except Exception as e:
-        logger.error(f"❌ Pipeline failed: {e}")
+        logger.error(f"Pipeline failed: {e}")
         raise
 
 
diff --git a/src/modules/data_augmentation.py b/src/modules/data_augmentation.py
index 8c504e4..d9d19be 100644
--- a/src/modules/data_augmentation.py
+++ b/src/modules/data_augmentation.py
@@ -65,7 +65,7 @@ def analyze_data_characteristics(X_train: pd.DataFrame, y_train: pd.Series) -> d
         "is_highly_categorical": categorical_ratio > 0.3,
     }
 
-    logger.info(f"📊 Data characteristics: {characteristics}")
+    logger.info(f"Data characteristics: {characteristics}")
     return characteristics
 
 
@@ -82,14 +82,14 @@ def adaptive_augmentation_selection(
     max_ratio = AugmentationConfig.MAX_AUGMENTATION_RATIO.value
 
     logger.info(
-        f"   📊 Adaptive selection using config ratios: base={base_ratio}, min={min_ratio}, max={max_ratio}"
+        f"   Adaptive selection using config ratios: base={base_ratio}, min={min_ratio}, max={max_ratio}"
     )
 
     if characteristics["is_small_dataset"] and characteristics["is_imbalanced"]:
         # Small imbalanced dataset - use higher ratio for SMOTENC
         adaptive_ratio = min(base_ratio * 1.5, max_ratio)  # 150% of base, capped at max
         logger.info(
-            f"   🎯 Small imbalanced dataset detected → SMOTENC with ratio {adaptive_ratio:.3f} (base*1.5)"
+            f"   Small imbalanced dataset detected → SMOTENC with ratio {adaptive_ratio:.3f} (base*1.5)"
         )
         return AugmentationMethod.SMOTENC, adaptive_ratio
 
@@ -97,7 +97,7 @@ def adaptive_augmentation_selection(
         # High categorical ratio - use moderate ratio for SDV Copula
         adaptive_ratio = min(base_ratio * 0.8, max_ratio)  # 80% of base
         logger.info(
-            f"   🎯 High categorical features detected → SDV_COPULA with ratio {adaptive_ratio:.3f} (base*0.8)"
+            f"   High categorical features detected → SDV_COPULA with ratio {adaptive_ratio:.3f} (base*0.8)"
         )
         return AugmentationMethod.SDV_COPULA, adaptive_ratio
 
@@ -105,7 +105,7 @@ def adaptive_augmentation_selection(
         # Large balanced dataset - use conservative ratio for ensemble
         adaptive_ratio = max(base_ratio * 0.5, min_ratio)  # 50% of base, at least min
         logger.info(
-            f"   🎯 Large balanced dataset detected → MIXED_ENSEMBLE with ratio {adaptive_ratio:.3f} (base*0.5)"
+            f"   Large balanced dataset detected → MIXED_ENSEMBLE with ratio {adaptive_ratio:.3f} (base*0.5)"
         )
         return AugmentationMethod.MIXED_ENSEMBLE, adaptive_ratio
 
@@ -113,7 +113,7 @@ def adaptive_augmentation_selection(
         # Severe imbalance - use maximum ratio for class balancing
         adaptive_ratio = max_ratio  # Use maximum configured ratio
         logger.info(
-            f"   🎯 Severe class imbalance detected → CLASS_BALANCED with ratio {adaptive_ratio:.3f} (max ratio)"
+            f"   Severe class imbalance detected → CLASS_BALANCED with ratio {adaptive_ratio:.3f} (max ratio)"
         )
         return AugmentationMethod.CLASS_BALANCED, adaptive_ratio
 
@@ -121,7 +121,7 @@ def adaptive_augmentation_selection(
         # Default to SDV Copula with base ratio
         adaptive_ratio = base_ratio
         logger.info(
-            f"   🎯 Default case → SDV_COPULA with ratio {adaptive_ratio:.3f} (base ratio)"
+            f"   Default case → SDV_COPULA with ratio {adaptive_ratio:.3f} (base ratio)"
         )
         return AugmentationMethod.SDV_COPULA, adaptive_ratio
 
@@ -131,7 +131,7 @@ def tvae_augmentation(
 ) -> tuple[pd.DataFrame, pd.Series]:
     """TVAE-based augmentation with better stability than CTGAN."""
     if not SDV_AVAILABLE:
-        logger.warning("⚠️ SDV not available, falling back to simple augmentation")
+        logger.warning("SDV not available, falling back to simple augmentation")
         return simple_mixed_augmentation(X_train, y_train, augment_ratio)
 
     try:
@@ -175,7 +175,7 @@ def tvae_augmentation(
         return augmented_X, augmented_y
 
     except Exception as e:
-        logger.warning(f"⚠️ TVAE failed: {e}, falling back to Copula")
+        logger.warning(f"TVAE failed: {e}, falling back to Copula")
         return sdv_augmentation(X_train, y_train, "copula", augment_ratio)
 
 
@@ -196,7 +196,7 @@ def class_balanced_augmentation(
     samples_needed = max(0, target_minority_count - minority_count)
 
     if samples_needed == 0:
-        logger.info("   ✅ Classes already balanced, no augmentation needed")
+        logger.info("   Classes already balanced, no augmentation needed")
         return pd.DataFrame(), pd.Series(dtype=y_train.dtype)
 
     # Filter minority class data
@@ -229,7 +229,7 @@ def class_balanced_augmentation(
                 all_augmented_y.append(aug_y)
 
         except Exception as e:
-            logger.warning(f"⚠️ Method {method} failed: {e}")
+            logger.warning(f"Method {method} failed: {e}")
 
     if all_augmented_X:
         combined_X = pd.concat(all_augmented_X, ignore_index=True)
@@ -266,7 +266,7 @@ def mixed_ensemble_augmentation(
                 all_augmented_y.append(aug_y)
 
         except Exception as e:
-            logger.warning(f"⚠️ Method {method} failed: {e}")
+            logger.warning(f"Method {method} failed: {e}")
 
     if all_augmented_X:
         combined_X = pd.concat(all_augmented_X, ignore_index=True)
@@ -284,7 +284,7 @@ def enhanced_quality_filtering(
     threshold: float = 0.75,
 ) -> tuple[pd.DataFrame, pd.Series]:
     """Multi-metric quality filtering for synthetic data."""
-    logger.info("   🔍 Enhanced quality filtering...")
+    logger.info("   Enhanced quality filtering...")
 
     if len(synthetic_X) == 0:
         return synthetic_X, synthetic_y
@@ -326,7 +326,7 @@ def enhanced_quality_filtering(
     logger.info(
         f"   ✨ Quality filtering: {len(filtered_X)}/{len(synthetic_X)} samples kept"
     )
-    logger.info(f"   📊 Avg quality score: {np.mean(quality_scores):.3f}")
+    logger.info(f"   Avg quality score: {np.mean(quality_scores):.3f}")
 
     return filtered_X, filtered_y
 
@@ -354,10 +354,10 @@ def diversity_check(
             diversity_scores.append(diversity)
 
     avg_diversity = np.mean(diversity_scores) if diversity_scores else 1.0
-    is_diverse = avg_diversity >= threshold
+    is_diverse = bool(avg_diversity >= threshold)
 
     logger.info(
-        f"   🌈 Diversity score: {avg_diversity:.3f} ({'✅ Pass' if is_diverse else '❌ Fail'})"
+        f"   Diversity score: {avg_diversity:.3f} ({'Pass' if is_diverse else 'Fail'})"
     )
     return is_diverse
 
@@ -435,11 +435,11 @@ def sdv_augmentation(
 ) -> tuple[pd.DataFrame, pd.Series]:
     """High-quality synthetic data generation using SDV with improved CTGAN handling."""
     if not SDV_AVAILABLE:
-        logger.warning("⚠️ SDV not available, falling back to simple augmentation")
+        logger.warning("SDV not available, falling back to simple augmentation")
         return simple_mixed_augmentation(X_train, y_train, augment_ratio)
 
     try:
-        logger.info(f"   🔧 Using {method} synthesizer...")
+        logger.info(f"   Using {method} synthesizer...")
 
         # Combine features and target for SDV
         train_data = X_train.copy()
@@ -471,7 +471,7 @@ def sdv_augmentation(
                 logger.warning(
                     "   ⚠️ Apple Silicon detected - CTGAN has compatibility issues"
                 )
-                logger.info("   🔄 Falling back to GaussianCopula for stability...")
+                logger.info("   Falling back to GaussianCopula for stability...")
                 synthesizer = GaussianCopulaSynthesizer(metadata)
             else:
                 logger.info("   🧠 Training CTGAN (slow but high quality)...")
@@ -521,7 +521,7 @@ def timeout_handler(_signum, _frame):
         try:
             synthesizer.fit(train_data)
             training_time = time.time() - start_time
-            logger.info(f"   ✅ Training completed in {training_time:.1f}s")
+            logger.info(f"   Training completed in {training_time:.1f}s")
         finally:
             if method == "ctgan":
                 signal.alarm(0)
@@ -531,22 +531,22 @@ def timeout_handler(_signum, _frame):
         if n_synthetic == 0:
             return pd.DataFrame(), pd.Series(dtype=y_train.dtype)
 
-        logger.info(f"   🎲 Generating {n_synthetic} synthetic samples...")
+        logger.info(f"   Generating {n_synthetic} synthetic samples...")
         synthetic_data = synthesizer.sample(num_rows=n_synthetic)
 
         # Split back to features and target
         synthetic_X = synthetic_data.drop("Personality", axis=1)
         synthetic_y = synthetic_data["Personality"]
 
-        logger.info(f"   ✅ Generated {len(synthetic_X)} synthetic samples")
+        logger.info(f"   Generated {len(synthetic_X)} synthetic samples")
         return synthetic_X, synthetic_y
 
     except TimeoutError:
-        logger.warning("⚠️ Training timed out, falling back to simple augmentation")
+        logger.warning("Training timed out, falling back to simple augmentation")
         return simple_mixed_augmentation(X_train, y_train, augment_ratio)
     except Exception as e:
         logger.warning(
-            f"⚠️ SDV augmentation failed: {e}, falling back to simple augmentation"
+            f"SDV augmentation failed: {e}, falling back to simple augmentation"
         )
         return simple_mixed_augmentation(X_train, y_train, augment_ratio)
 
@@ -557,7 +557,7 @@ def smotenc_augmentation(
     """SMOTE for mixed numerical/categorical data."""
     if not IMBLEARN_AVAILABLE:
         logger.warning(
-            "⚠️ imbalanced-learn not available, falling back to simple augmentation"
+            "imbalanced-learn not available, falling back to simple augmentation"
         )
         return simple_mixed_augmentation(X_train, y_train, augment_ratio)
 
@@ -600,7 +600,7 @@ def smotenc_augmentation(
 
     except Exception as e:
         logger.warning(
-            f"⚠️ SMOTENC augmentation failed: {e}, falling back to simple augmentation"
+            f"SMOTENC augmentation failed: {e}, falling back to simple augmentation"
         )
         return simple_mixed_augmentation(X_train, y_train, augment_ratio)
 
@@ -613,7 +613,7 @@ def apply_data_augmentation(
         logger.info("📊 Data augmentation disabled")
         return X_train, y_train
 
-    logger.info("📊 Applying adaptive data augmentation...")
+    logger.info("Applying adaptive data augmentation...")
     original_shape = X_train.shape
 
     # Analyze data characteristics
@@ -623,13 +623,13 @@ def apply_data_augmentation(
     if AugmentationConfig.AUGMENTATION_METHOD.value == AugmentationMethod.ADAPTIVE:
         method, augment_ratio = adaptive_augmentation_selection(characteristics)
         logger.info(
-            f"   🎯 Auto-selected: {method.value} with ratio {augment_ratio:.3f}"
+            f"   Auto-selected: {method.value} with ratio {augment_ratio:.3f}"
         )
     else:
         method = AugmentationConfig.AUGMENTATION_METHOD.value
         augment_ratio = AugmentationConfig.AUGMENTATION_RATIO.value
         logger.info(
-            f"   🎯 Using configured: {method.value} with ratio {augment_ratio:.3f}"
+            f"   Using configured: {method.value} with ratio {augment_ratio:.3f}"
         )
 
     # Apply augmentation with timeout
@@ -677,12 +677,12 @@ def timeout_handler(_signum, _frame):
 
     except TimeoutError:
         logger.warning(
-            f"⚠️ Augmentation timeout after {AugmentationConfig.MAX_AUGMENTATION_TIME_SECONDS.value}s, using SMOTENC fallback"
+            f"Augmentation timeout after {AugmentationConfig.MAX_AUGMENTATION_TIME_SECONDS.value}s, using SMOTENC fallback"
         )
         augmented_X, augmented_y = smotenc_augmentation(X_train, y_train, 0.03)
         signal.alarm(0)
     except Exception as e:
-        logger.warning(f"⚠️ Augmentation failed: {e}, using original data")
+        logger.warning(f"Augmentation failed: {e}, using original data")
         signal.alarm(0)
         return X_train, y_train
 
@@ -704,7 +704,7 @@ def timeout_handler(_signum, _frame):
         if AugmentationConfig.ENABLE_DIVERSITY_CHECK.value and not diversity_check(
             X_train, augmented_X, AugmentationConfig.DIVERSITY_THRESHOLD.value
         ):
-            logger.warning("   ⚠️ Low diversity detected, reducing synthetic samples")
+            logger.warning("   Low diversity detected, reducing synthetic samples")
             # Keep only top 50% most diverse samples
             keep_ratio = 0.5
             keep_count = int(len(augmented_X) * keep_ratio)
@@ -717,18 +717,18 @@ def timeout_handler(_signum, _frame):
             y_combined = pd.concat([y_train, augmented_y], ignore_index=True)
 
             logger.info(
-                f"   ✅ Added {len(augmented_X)} high-quality synthetic samples"
+                f"   Added {len(augmented_X)} high-quality synthetic samples"
             )
-            logger.info(f"   📈 Data shape: {original_shape} → {X_combined.shape}")
+            logger.info(f"   Data shape: {original_shape} → {X_combined.shape}")
 
             # Log class balance improvement
             orig_balance = y_train.value_counts().min() / y_train.value_counts().max()
             new_balance = (
                 y_combined.value_counts().min() / y_combined.value_counts().max()
             )
-            logger.info(f"   ⚖️ Class balance: {orig_balance:.3f} → {new_balance:.3f}")
+            logger.info(f"   Class balance: {orig_balance:.3f} → {new_balance:.3f}")
 
             return X_combined, y_combined
 
-    logger.warning("⚠️ No synthetic samples generated, using original data")
+    logger.warning("No synthetic samples generated, using original data")
     return X_train, y_train
diff --git a/src/modules/data_loader.py b/src/modules/data_loader.py
index 07aba89..57167f3 100644
--- a/src/modules/data_loader.py
+++ b/src/modules/data_loader.py
@@ -16,7 +16,7 @@ def load_data_with_external_merge():
     Returns:
         tuple: (df_tr, df_te, submission) - training data, test data, and submission template
     """
-    logger.info("📊 Loading data with advanced merge strategy...")
+    logger.info("Loading data with advanced merge strategy...")
 
     # Use Paths enum from config.py for all file paths
     df_tr = pd.read_csv(Paths.TRAIN_CSV.value)
@@ -62,10 +62,10 @@ def load_data_with_external_merge():
         test_matches = df_te["match_p"].notna().sum()
 
         logger.info(
-            f"✅ Successfully matched {train_matches}/{len(df_tr)} training samples with external data"
+            f"Successfully matched {train_matches}/{len(df_tr)} training samples with external data"
         )
         logger.info(
-            f"✅ Successfully matched {test_matches}/{len(df_te)} test samples with external data"
+            f"Successfully matched {test_matches}/{len(df_te)} test samples with external data"
         )
 
         # Print match distribution for training data
@@ -77,10 +77,10 @@ def load_data_with_external_merge():
 
     except FileNotFoundError:
         logger.warning(
-            "⚠️ personality_datasert.csv not found, adding empty match_p column"
+            "personality_datasert.csv not found, adding empty match_p column"
         )
         df_tr["match_p"] = None
         df_te["match_p"] = None
 
-    logger.info("✅ Data loading with external merge completed")
+    logger.info("Data loading with external merge completed")
     return df_tr, df_te, submission
diff --git a/src/modules/preprocessing.py b/src/modules/preprocessing.py
index 980d683..5aea9e8 100644
--- a/src/modules/preprocessing.py
+++ b/src/modules/preprocessing.py
@@ -28,7 +28,7 @@ def prep(
     Returns:
         Tuple of (X_train, X_test, y_train, label_encoder)
     """
-    logger.info("🔧 Preprocessing data with advanced competitive approach...")
+    logger.info("Preprocessing data with advanced competitive approach...")
 
     # Define feature groups before any processing
     # Keep original column types for proper categorization
@@ -52,7 +52,7 @@ def prep(
         df_te = df_te.drop(columns=[idx])
 
     # Use advanced correlation-based imputation
-    logger.info("🔄 Performing TOP-4 correlation-based imputation...")
+    logger.info("Performing TOP-4 correlation-based imputation...")
 
     # Extract and encode target variable BEFORE combining data
     le_tgt = LabelEncoder()
@@ -143,7 +143,7 @@ def fill_missing_by_quantile_group(
             all_data[col] = all_data[col].fillna("Unknown")
 
     # Apply one-hot encoding for categorical features using OneHotEncoder (advanced approach)
-    logger.info("🔄 Applying one-hot encoding for categorical features...")
+    logger.info("Applying one-hot encoding for categorical features...")
 
     # Identify categorical columns that exist in the data
     existing_categorical_cols = [
@@ -173,13 +173,13 @@ def fill_missing_by_quantile_group(
         all_data = pd.concat([all_data, encoded_df], axis=1)
 
         logger.info(
-            f"   ✅ Encoded {len(existing_categorical_cols)} categorical features into {len(feature_names)} binary features"
+            f"   Encoded {len(existing_categorical_cols)} categorical features into {len(feature_names)} binary features"
         )
     else:
-        logger.warning("   ⚠️ No categorical features found to encode")
+        logger.warning("   No categorical features found to encode")
 
     # Fill any remaining missing values
-    logger.info("🔄 Filling any remaining missing values...")
+    logger.info("Filling any remaining missing values...")
 
     # For numerical columns, use median
     num_cols = all_data.select_dtypes(include=[np.number]).columns
@@ -200,7 +200,7 @@ def fill_missing_by_quantile_group(
     logger.info(f"Final train shape: {df_tr.shape}")
     logger.info(f"Final test shape: {df_te.shape}")
 
-    logger.info("✅ Preprocessing completed with advanced competitive approach")
+    logger.info("Preprocessing completed with advanced competitive approach")
     return df_tr, df_te, ytr, le_tgt
 
 
@@ -259,13 +259,13 @@ def add_pseudo_labeling_conservative(
     n_high_conf = np.sum(high_conf_mask)
     max_pseudo_samples = int(len(X_full) * max_pseudo_ratio)
 
-    logger.info(f"   📊 Found {n_high_conf} high-confidence predictions")
-    logger.info(f"   📏 Maximum allowed pseudo-samples: {max_pseudo_samples}")
+    logger.info(f"   Found {n_high_conf} high-confidence predictions")
+    logger.info(f"   Maximum allowed pseudo-samples: {max_pseudo_samples}")
 
     if n_high_conf > 0:
         # Limit pseudo-samples to avoid overfitting
         if n_high_conf > max_pseudo_samples:
-            logger.info(f"   ✂️ Limiting to {max_pseudo_samples} most confident samples")
+            logger.info(f"   Limiting to {max_pseudo_samples} most confident samples")
             # Get confidence scores and select most confident samples
             conf_scores = np.maximum(ensemble_proba, 1 - ensemble_proba)
             high_conf_indices = np.where(high_conf_mask)[0]
@@ -299,16 +299,16 @@ def add_pseudo_labeling_conservative(
             "final_size": len(X_combined),
         }
 
-        logger.info(f"   ✅ Added {len(y_pseudo)} pseudo-labels to training data")
+        logger.info(f"   Added {len(y_pseudo)} pseudo-labels to training data")
         logger.info(
-            f"   📊 Pseudo-label distribution: Class 0: {pseudo_stats['pseudo_class_0']}, Class 1: {pseudo_stats['pseudo_class_1']}"
+            f"   Pseudo-label distribution: Class 0: {pseudo_stats['pseudo_class_0']}, Class 1: {pseudo_stats['pseudo_class_1']}"
         )
-        logger.info(f"   🎯 Mean confidence: {pseudo_stats['mean_confidence']:.4f}")
-        logger.info(f"   📈 Training data: {len(X_full)} → {len(X_combined)} samples")
+        logger.info(f"   Mean confidence: {pseudo_stats['mean_confidence']:.4f}")
+        logger.info(f"   Training data: {len(X_full)} → {len(X_combined)} samples")
 
         return X_combined, y_combined, pseudo_stats
     else:
-        logger.info("   ⚠️ No high-confidence predictions found")
+        logger.info("   No high-confidence predictions found")
         pseudo_stats = {
             "n_pseudo_added": 0,
             "original_size": len(X_full),
@@ -340,7 +340,7 @@ def create_domain_balanced_dataset(
     Returns:
         Tuple of (combined_dataframe, sample_weights)
     """
-    logger.info("🎯 Computing domain weights for distribution alignment...")
+    logger.info("Computing domain weights for distribution alignment...")
 
     # Combine dataframes with domain labels
     combined_data = []
@@ -440,7 +440,7 @@ def create_domain_balanced_dataset(
             weights[domain_mask] = domain_weights / (np.mean(domain_weights) + 1e-8)
 
     # Print summary
-    logger.info("📊 Domain weighting summary:")
+    logger.info("Domain weighting summary:")
     logger.info("   Reference domain: 0 (first dataframe)")
     for domain_idx in range(len(dataframes)):
         domain_mask = domain_labels == domain_idx
@@ -481,7 +481,7 @@ def create_domain_balanced_dataset(
             total_count = np.sum(domain_mask)
             if removed_count > 0:
                 logger.info(
-                    f"   🚫 Filtered {removed_count}/{total_count} ({removed_count / total_count * 100:.1f}%) "
+                    f"   Filtered {removed_count}/{total_count} ({removed_count / total_count * 100:.1f}%) "
                     f"low-quality samples from domain {domain_idx}"
                 )
 
@@ -490,7 +490,7 @@ def create_domain_balanced_dataset(
         weights = weights[keep_mask]
 
         logger.info(
-            f"   ✅ Kept {len(combined_df)} high-quality samples after filtering"
+            f"   Kept {len(combined_df)} high-quality samples after filtering"
         )
 
     logger.info(f"Created domain-balanced dataset with {len(combined_df)} samples")
diff --git a/tests/README.md b/tests/README.md
index 51504a8..6f86cb1 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -2,7 +2,7 @@
 
 This comprehensive testing framework covers all components of the personality classification pipeline including data processing, model building, and MLOps infrastructure.
 
-## 📁 Test Structure
+## Test Structure
 
 ```
 tests/
@@ -18,7 +18,7 @@ tests/
 └── fixtures/                      # Test data and fixtures
 ```
 
-## 🧪 Test Categories
+## Test Categories
 
 ### **Unit Tests** (`@pytest.mark.unit`)
 - Test individual functions and classes in isolation
@@ -35,9 +35,9 @@ tests/
 - Large dataset processing
 - Model training with multiple iterations
 
-## 🚀 Running Tests
+## Running Tests
 
-### **Quick Start**
+### Quick Start
 ```bash
 # Run all tests
 python run_tests.py
@@ -49,7 +49,7 @@ python run_tests.py --type all
 python run_tests.py --type fast
 ```
 
-### **Test Categories**
+### Test Categories
 ```bash
 # Run only unit tests
 python run_tests.py --type unit
@@ -64,7 +64,7 @@ python run_tests.py --type modules
 python run_tests.py --type mlops
 ```
 
-### **Specific Test Execution**
+### Specific Test Execution
 ```bash
 # Run specific test file
 python run_tests.py --test tests/modules/test_data_loader.py
@@ -76,7 +76,7 @@ python run_tests.py --test tests/modules/test_data_loader.py::TestDataLoader
 python run_tests.py --test tests/modules/test_data_loader.py::TestDataLoader::test_init
 ```
 
-### **Coverage Options**
+### Coverage Options
 ```bash
 # Run without coverage (faster)
 python run_tests.py --no-coverage
@@ -85,7 +85,7 @@ python run_tests.py --no-coverage
 python run_tests.py --verbose
 ```
 
-### **Direct Pytest Usage**
+### Direct Pytest Usage
 ```bash
 # Run with pytest directly
 pytest tests/
@@ -97,9 +97,9 @@ pytest -m "unit and not slow" tests/
 pytest --cov=src --cov-report=html tests/
 ```
 
-## 🔧 Configuration
+## Configuration
 
-### **pytest.ini**
+### pytest configuration (pyproject.toml)
 ```ini
 [tool.pytest.ini_options]
 minversion = "6.0"
@@ -121,7 +121,7 @@ markers = [
 ]
 ```
 
-## 🎯 Test Fixtures
+## Test Fixtures
 
 The test suite includes comprehensive fixtures for different testing scenarios:
 
@@ -139,7 +139,7 @@ The test suite includes comprehensive fixtures for different testing scenarios:
 - `assert_data_shape()`: Check DataFrame dimensions
 - `assert_no_missing_values()`: Verify data quality
 
-## 📊 Coverage Reports
+## Coverage Reports
 
 Test coverage reports are generated in multiple formats:
 
@@ -162,7 +162,7 @@ Open `htmlcov/index.html` in your browser for detailed coverage analysis.
 ### **XML Report**
 `coverage.xml` for CI/CD integration.
 
-## 🔍 Test Examples
+## Test Examples
 
 ### **Data Processing Tests**
 ```python
@@ -193,9 +193,9 @@ def test_train_model(self, sample_data):
     assert hasattr(trained_model, "predict")
 ```
 
-## 🐛 Debugging Tests
+## Debugging Tests
 
-### **Running in Debug Mode**
+### Running in Debug Mode
 ```bash
 # Run with verbose output and show local variables
 pytest -vvv --tb=long tests/
@@ -204,7 +204,7 @@ pytest -vvv --tb=long tests/
 pytest -s tests/modules/test_data_loader.py::TestDataLoader::test_init
 ```
 
-### **Using Print Statements**
+### Using Print Statements
 ```python
 def test_debug_example(self, sample_data):
     print(f"Data shape: {sample_data.shape}")
@@ -212,14 +212,14 @@ def test_debug_example(self, sample_data):
     # ... test logic
 ```
 
-### **Using Debugger**
+### Using Debugger
 ```python
 def test_with_debugger(self, sample_data):
     import pdb; pdb.set_trace()
     # ... test logic
 ```
 
-## 🔄 Continuous Integration
+## Continuous Integration
 
 ### **GitHub Actions Example**
 ```yaml
@@ -248,28 +248,28 @@ jobs:
       uses: codecov/codecov-action@v1
 ```
 
-## 📝 Best Practices
+## Best Practices
 
-### **Writing Tests**
-1. **Descriptive Names**: Use clear, descriptive test names
-2. **Single Responsibility**: Each test should verify one specific behavior
-3. **Independent Tests**: Tests should not depend on each other
-4. **Use Fixtures**: Leverage pytest fixtures for setup and teardown
-5. **Mock External Dependencies**: Use mocks for external services
+### Writing Tests
+1. Descriptive Names: Use clear, descriptive test names
+2. Single Responsibility: Each test should verify one specific behavior
+3. Independent Tests: Tests should not depend on each other
+4. Use Fixtures: Leverage pytest fixtures for setup and teardown
+5. Mock External Dependencies: Use mocks for external services
 
-### **Test Organization**
-1. **Group Related Tests**: Use test classes to group related functionality
-2. **Use Markers**: Tag tests appropriately for selective execution
-3. **Parametrize Tests**: Use `@pytest.mark.parametrize` for multiple scenarios
-4. **Document Complex Tests**: Add docstrings explaining test purpose
+### Test Organization
+1. Group Related Tests: Use test classes to group related functionality
+2. Use Markers: Tag tests appropriately for selective execution
+3. Parametrize Tests: Use `@pytest.mark.parametrize` for multiple scenarios
+4. Document Complex Tests: Add docstrings explaining test purpose
 
-### **Performance**
-1. **Fast Unit Tests**: Keep unit tests fast and focused
-2. **Mark Slow Tests**: Use `@pytest.mark.slow` for time-consuming tests
-3. **Use Smaller Datasets**: Create minimal datasets for testing
-4. **Parallel Execution**: Consider pytest-xdist for parallel test execution
+### Performance
+1. Fast Unit Tests: Keep unit tests fast and focused
+2. Mark Slow Tests: Use `@pytest.mark.slow` for time-consuming tests
+3. Use Smaller Datasets: Create minimal datasets for testing
+4. Parallel Execution: Consider pytest-xdist for parallel test execution
 
-## 🛠️ Dependencies
+## Dependencies
 
 The testing framework requires:
 
@@ -290,15 +290,15 @@ dash>=2.14.0
 pytest-xdist>=3.0.0
 ```
 
-## 📈 Metrics and Reporting
+## Metrics and Reporting
 
-### **Test Metrics**
-- **Test Count**: Total number of tests
-- **Pass Rate**: Percentage of passing tests
-- **Coverage**: Code coverage percentage
-- **Execution Time**: Test suite runtime
+### Test Metrics
+- Test Count: Total number of tests
+- Pass Rate: Percentage of passing tests
+- Coverage: Code coverage percentage
+- Execution Time: Test suite runtime
 
-### **Quality Gates**
+### Quality Gates
 - Minimum 90% code coverage
 - All tests must pass
 - No critical security vulnerabilities
@@ -306,7 +306,7 @@ pytest-xdist>=3.0.0
 
 ---
 
-## 🚀 Quick Commands Reference
+## Quick Commands Reference
 
 ```bash
 # Essential commands

From c6241e0411203f96759a7aa2db9c60181068317b Mon Sep 17 00:00:00 2001
From: Jeremy Vachier <89128100+jvachier@users.noreply.github.com>
Date: Sun, 30 Nov 2025 12:39:28 +0100
Subject: [PATCH 2/2] Fixing linting issue.

---
 src/main_modular.py              | 4 +---
 src/modules/data_augmentation.py | 8 ++------
 src/modules/preprocessing.py     | 4 +---
 3 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/main_modular.py b/src/main_modular.py
index 42ab466..ae28a0d 100755
--- a/src/main_modular.py
+++ b/src/main_modular.py
@@ -464,9 +464,7 @@ def apply_pseudo_labelling(
 
     # Create new TrainingData with pseudo labels added
     if pseudo_stats["n_pseudo_added"] > 0:
-        logger.info(
-            f"Pseudo labelling added {pseudo_stats['n_pseudo_added']} samples"
-        )
+        logger.info(f"Pseudo labelling added {pseudo_stats['n_pseudo_added']} samples")
 
         # Create new TrainingData object with enhanced training set
         enhanced_data = TrainingData(
diff --git a/src/modules/data_augmentation.py b/src/modules/data_augmentation.py
index d9d19be..2ee48ff 100644
--- a/src/modules/data_augmentation.py
+++ b/src/modules/data_augmentation.py
@@ -622,9 +622,7 @@ def apply_data_augmentation(
     # Select optimal method and ratio
     if AugmentationConfig.AUGMENTATION_METHOD.value == AugmentationMethod.ADAPTIVE:
         method, augment_ratio = adaptive_augmentation_selection(characteristics)
-        logger.info(
-            f"   Auto-selected: {method.value} with ratio {augment_ratio:.3f}"
-        )
+        logger.info(f"   Auto-selected: {method.value} with ratio {augment_ratio:.3f}")
     else:
         method = AugmentationConfig.AUGMENTATION_METHOD.value
         augment_ratio = AugmentationConfig.AUGMENTATION_RATIO.value
@@ -716,9 +714,7 @@ def timeout_handler(_signum, _frame):
             X_combined = pd.concat([X_train, augmented_X], ignore_index=True)
             y_combined = pd.concat([y_train, augmented_y], ignore_index=True)
 
-            logger.info(
-                f"   Added {len(augmented_X)} high-quality synthetic samples"
-            )
+            logger.info(f"   Added {len(augmented_X)} high-quality synthetic samples")
             logger.info(f"   Data shape: {original_shape} → {X_combined.shape}")
 
             # Log class balance improvement
diff --git a/src/modules/preprocessing.py b/src/modules/preprocessing.py
index 5aea9e8..282913e 100644
--- a/src/modules/preprocessing.py
+++ b/src/modules/preprocessing.py
@@ -489,9 +489,7 @@ def create_domain_balanced_dataset(
         combined_df = combined_df[keep_mask].reset_index(drop=True)
         weights = weights[keep_mask]
 
-        logger.info(
-            f"   Kept {len(combined_df)} high-quality samples after filtering"
-        )
+        logger.info(f"   Kept {len(combined_df)} high-quality samples after filtering")
 
     logger.info(f"Created domain-balanced dataset with {len(combined_df)} samples")
     logger.info(