-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathloader.py
More file actions
143 lines (122 loc) · 4.23 KB
/
loader.py
File metadata and controls
143 lines (122 loc) · 4.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import sys
import joblib
# Add parent directory to path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from data.config import RANDOM_STATE
def load_data(file_path):
    """
    Load glucose level data from CSV file.

    Parameters:
    -----------
    file_path : str
        Path to the CSV file

    Returns:
    --------
    pandas.DataFrame or None
        Loaded data, or None if the file could not be read/parsed
    """
    try:
        data = pd.read_csv(file_path)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        # Catch only read/parse failures (FileNotFoundError is an OSError);
        # genuine programming errors propagate instead of being swallowed
        # by a blanket `except Exception`.
        print(f"Error loading data: {e}")
        return None
    print(f"Data loaded successfully with {data.shape[0]} rows and {data.shape[1]} columns")
    return data
def preprocess_data(df, test_size=0.2, random_state=42):
    """
    Preprocess the data for modeling.

    Parameters:
    -----------
    df : pandas.DataFrame
        The raw data to preprocess. Must contain the feature columns
        listed in `feature_names` below plus the 'GLUCOSE_LEVEL' target.
    test_size : float, optional
        Fraction of rows held out for the test split (default 0.2).
    random_state : int, optional
        Seed for the train/test split (default 42). Callers can pass the
        project-wide RANDOM_STATE from data.config for consistency; the
        default preserves the previous hard-coded behavior.

    Returns:
    --------
    tuple
        X_train, X_test, y_train, y_test, scaler, feature_names
    """
    # Rows missing WEIGHT/HEIGHT are dropped entirely; .copy() avoids
    # SettingWithCopy issues on the .loc assignments below.
    df = df.dropna(subset=['WEIGHT', 'HEIGHT']).copy()

    # Impute remaining numeric gaps with the column median,
    # and the categorical DIABETIC flag with its mode.
    for col in ('HEARTRATE', 'NIR_Reading', 'HR_IR', 'LAST_EATEN'):
        df.loc[:, col] = df[col].fillna(df[col].median())
    df.loc[:, 'DIABETIC'] = df['DIABETIC'].fillna(df['DIABETIC'].mode()[0])

    # Encode categorical features as 0/1 integers.
    df.loc[:, 'GENDER'] = df['GENDER'].map({'M': 1, 'F': 0})
    df.loc[:, 'DIABETIC'] = df['DIABETIC'].map({'Y': 1, 'N': 0})

    feature_names = [
        'AGE', 'GENDER', 'WEIGHT', 'SKIN_COLOR', 'NIR_Reading',
        'HEARTRATE', 'HEIGHT', 'LAST_EATEN', 'DIABETIC', 'HR_IR'
    ]
    target_col = 'GLUCOSE_LEVEL'
    X = df[feature_names]
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Standardize features; the fitted scaler is returned so the exact
    # same transform can be applied at inference time.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test, scaler, feature_names
def save_processed_data(X_train, X_test, y_train, y_test, output_dir='datasets/processed'):
    """
    Persist the train/test split to disk as .npy files.

    Parameters:
    -----------
    X_train : numpy.ndarray
        Training features
    X_test : numpy.ndarray
        Test features
    y_train : numpy.ndarray
        Training target
    y_test : numpy.ndarray
        Test target
    output_dir : str
        Directory to save processed data (created if it does not exist)
    """
    os.makedirs(output_dir, exist_ok=True)
    # One .npy file per array, keyed by its on-disk filename.
    arrays = {
        'X_train.npy': X_train,
        'X_test.npy': X_test,
        'y_train.npy': y_train,
        'y_test.npy': y_test,
    }
    for filename, array in arrays.items():
        np.save(os.path.join(output_dir, filename), array)
    print(f"Processed data saved to {output_dir}")
def load_processed_data(input_dir='datasets/processed'):
    """
    Load a previously saved train/test split from disk.

    Parameters:
    -----------
    input_dir : str
        Directory containing the processed .npy files

    Returns:
    --------
    tuple
        X_train, X_test, y_train, y_test — or (None, None, None, None)
        if any of the files cannot be read.
    """
    names = ('X_train', 'X_test', 'y_train', 'y_test')
    try:
        arrays = tuple(
            np.load(os.path.join(input_dir, f"{name}.npy")) for name in names
        )
    except Exception as e:
        print(f"Error loading processed data: {e}")
        return None, None, None, None
    print(f"Processed data loaded from {input_dir}")
    return arrays
def load_model(model_path):
    """
    Load a trained model from disk.

    Parameters:
    -----------
    model_path : str
        Path to the serialized model file

    Returns:
    --------
    object
        The deserialized model object
    """
    # joblib handles sklearn estimators (and large numpy arrays) efficiently.
    model = joblib.load(model_path)
    return model