diff --git a/.gitignore b/.gitignore index 620ed3b..b0882aa 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,9 @@ settings.json pk.key vk.key +# data files +model/data/ + # Editors .vscode/ .idea/ diff --git a/model/EDA.py b/model/EDA.py new file mode 100644 index 0000000..5fd6a84 --- /dev/null +++ b/model/EDA.py @@ -0,0 +1,69 @@ +# Exploratory Data Analysis + +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np +import os + +#DATA_PATH = 'data/transaction_dataset.csv' +DATA_PATH = 'data/data.csv' + +def print_info(df): + print("DataFrame Info:") + print(df.info()) + print("\nStatistical Summary:") + print(df.describe()) + print("\nMissing Values:") + print(df.isnull().sum()) + print("\nFirst 5 Rows:") + print(df.head()) + + +def save_image(fig, filename): + os.makedirs('images', exist_ok=True) + fig.savefig(os.path.join('images', filename)) + + +def inspect_flag_distribution(df, flag_column): + # Inspect target distribution + print(df[flag_column].value_counts()) + pie, ax = plt.subplots(figsize=[15,10]) + labels = ['Non-fraud', 'Fraud'] + colors = ['#f9ae35', '#f64e38'] + plt.pie(x = df[flag_column].value_counts(), autopct='%.2f%%', explode=[0.02]*2, labels=labels, pctdistance=0.5, textprops={'fontsize': 14}, colors = colors) + plt.title('Target distribution') + plt.show() + save_image(pie, "flag_distribution.png") + + +def correlation_heatmap(df, annotation=False): + # Correlation matrix + cleaned = df.iloc[:, 2:].copy() + categorical_cols = cleaned.select_dtypes(include=['object']).columns + if len(categorical_cols) > 0: + print(f"Dropping {len(categorical_cols)} categorical columns") + cleaned.drop(columns=categorical_cols, inplace=True) + + cleaned.fillna(cleaned.median(numeric_only=True), inplace=True) + + corr = cleaned.corr() + + mask = np.zeros_like(corr) + mask[np.triu_indices_from(mask)]=True + with sns.axes_style('white'): + fig, ax =plt.subplots(figsize=(20,12)) + sns.heatmap(corr, mask=mask, 
annot=annotation, cmap='coolwarm', center=0, square=True) + plt.title('Correlation Heatmap', fontsize=10) + plt.show() + save_image(fig, "correlation_heatmap.png") + + +def main(): + df = pd.read_csv(DATA_PATH) + print_info(df) + inspect_flag_distribution(df, 'FLAG') + correlation_heatmap(df, annotation=False) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/model/images/correlation_heatmap.png b/model/images/correlation_heatmap.png new file mode 100644 index 0000000..c48a4ed Binary files /dev/null and b/model/images/correlation_heatmap.png differ diff --git a/model/images/flag_distribution.png b/model/images/flag_distribution.png new file mode 100644 index 0000000..1de1dcd Binary files /dev/null and b/model/images/flag_distribution.png differ diff --git a/model/preprocessing.py b/model/preprocessing.py new file mode 100644 index 0000000..bd93cb1 --- /dev/null +++ b/model/preprocessing.py @@ -0,0 +1,81 @@ +"""Preprocessing to prepare the data.""" + +from pathlib import Path +import pandas as pd + +DATA_PATH = 'data/transaction_dataset.csv' +CLEAN_DATA_PATH = 'data/data.csv' + +DROP_COLUMNS = [ + 'total transactions (including tnx to create contract', + 'total ether sent contracts', + 'max val sent to contract', + ' ERC20 avg val rec', + ' ERC20 max val rec', + ' ERC20 min val rec', + ' ERC20 uniq rec contract addr', + 'max val sent', + ' ERC20 avg val sent', + ' ERC20 min val sent', + ' ERC20 max val sent', + ' Total ERC20 tnxs', + 'avg value sent to contract', + 'Unique Sent To Addresses', + 'Unique Received From Addresses', + 'total ether received', + ' ERC20 uniq sent token name', + 'min value received', + 'min val sent', + ' ERC20 uniq rec addr', + 'min value sent to contract', + ' ERC20 uniq sent addr.1', +] + +# Load dataset from CSV file +def load_dataset(csv_path: Path) -> pd.DataFrame: + df = pd.read_csv(csv_path, index_col=0) + print(f"Loaded dataset: {df.shape}") + return df + + +# Clean dataset +def 
clean_dataset(df: pd.DataFrame) -> pd.DataFrame: + cleaned = df.iloc[:, 2:].copy() + categorical_cols = cleaned.select_dtypes(include=['object']).columns + if len(categorical_cols) > 0: + print(f"Dropping {len(categorical_cols)} categorical columns") + cleaned.drop(columns=categorical_cols, inplace=True) + + cleaned.fillna(cleaned.median(numeric_only=True), inplace=True) + + zero_var_cols = cleaned.columns[cleaned.var(numeric_only=True) == 0] + if len(zero_var_cols) > 0: + print(f"Dropping {len(zero_var_cols)} zero-variance columns") + cleaned.drop(columns=zero_var_cols, inplace=True) + + manual_cols = [col for col in DROP_COLUMNS if col in cleaned.columns] + if manual_cols: + print(f"Dropping {len(manual_cols)} manually selected columns") + cleaned.drop(columns=manual_cols, inplace=True) + + print(f"Cleaned dataset: {cleaned.shape}") + return cleaned + + +# Save cleaned dataset to CSV file +def save_cleaned_dataset(df: pd.DataFrame, output_path: Path): + df.to_csv(output_path, index=True) + print(f"Saved cleaned dataset to {output_path}") + + +def main(): + df = load_dataset(Path(DATA_PATH)) + df = clean_dataset(df) + #print dataset info + print("Cleaned DataFrame Info:") + print(df.info()) + save_cleaned_dataset(df, Path(CLEAN_DATA_PATH)) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/model/training.py b/model/training.py new file mode 100644 index 0000000..96cd064 --- /dev/null +++ b/model/training.py @@ -0,0 +1,128 @@ +import numpy as np +import pandas as pd +from typing import Dict +import seaborn as sns +import torch +import xgboost as xgb +from hummingbird.ml import convert +from imblearn.over_sampling import SMOTE +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import classification_report, confusion_matrix +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import PowerTransformer +from pathlib import Path +import os + +DATA_PATH = Path(__file__).resolve().parent 
/ 'data' / 'data.csv' + +def load_dataset(csv_path: Path): + df = pd.read_csv(csv_path) + print(f"Loaded dataset: {df.shape}") + # Drop auto-generated index columns if present + if 'Unnamed: 0' in df.columns: + df = df.drop(columns=['Unnamed: 0']) + return df + + +def split_dataset(df: pd.DataFrame, test_size: float = 0.2, random_state: int =42): + y = df['FLAG'] + X = df.drop(columns=['FLAG']) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, random_state=random_state + ) + print(f"Train shapes: {X_train.shape}, {y_train.shape}") + print(f"Test shapes: {X_test.shape}, {y_test.shape}") + return X_train, X_test, y_train, y_test + + +def scale_features(X_train: pd.DataFrame, X_test: pd.DataFrame): + norm = PowerTransformer() + norm_train = norm.fit_transform(X_train) + norm_test = norm.transform(X_test) + print("Applied PowerTransformer normalization") + return norm_train, norm_test, norm + + +def balance_training_data(X_train: np.ndarray, y_train: pd.Series): + smote = SMOTE() + X_resampled, y_resampled = smote.fit_resample(X_train, y_train) + print( + f"SMOTE applied: before {X_train.shape}/{y_train.shape},", + f"after {X_resampled.shape}/{y_resampled.shape}" + ) + return X_resampled, y_resampled + + +def train_models(X_train: np.ndarray, y_train: np.ndarray): + models: Dict[str, object] = {} + + rf_model = RandomForestClassifier(random_state=42) + rf_model.fit(X_train, y_train) + models['random_forest'] = rf_model + print("Trained RandomForestClassifier") + + xgb_model = xgb.XGBClassifier(random_state=42) + xgb_model.fit(X_train, y_train) + models['xgboost'] = xgb_model + print("Trained XGBoost classifier") + + return models + + +def evaluate_model(name: str, model: object, X_test: np.ndarray, y_test: pd.Series): + preds = model.predict(X_test) + print(f"\n{name} classification report:\n{classification_report(y_test, preds)}") + print(f"{name} confusion matrix:\n{confusion_matrix(y_test, preds)}") + + +def 
export_model_to_onnx(model: xgb.XGBClassifier, sample: np.ndarray, export_path: Path): + os.makedirs(export_path.parent, exist_ok=True) + hb_model = convert(model, 'torch', sample) + torch_model = hb_model.model + dummy_input = torch.tensor(sample, dtype=torch.float32) + + torch.onnx.export( + torch_model, + dummy_input, + export_path.as_posix(), + export_params=True, + opset_version=10, + do_constant_folding=True, + input_names=['input'], + output_names=['output'], + dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}, + ) + + print(f"✅ Model exported to {export_path}") + return export_path + + +def main(): + print(torch.__version__) + + df = load_dataset(DATA_PATH) + + X_train, X_test, y_train, y_test = split_dataset(df) + norm_train, norm_test, _ = scale_features(X_train, X_test) + X_resampled, y_resampled = balance_training_data(norm_train, y_train) + + models = train_models(X_resampled, y_resampled) + + evaluate_model('RandomForest', models['random_forest'], norm_test, y_test) + evaluate_model('XGBoost', models['xgboost'], norm_test, y_test) + + export_model_to_onnx( + models['xgboost'], + norm_test[:1], + Path(__file__).resolve().parent / 'onnx' / 'xgboost_model.onnx' + ) + + export_model_to_onnx( + models['random_forest'], + norm_test[:1], + Path(__file__).resolve().parent / 'onnx' / 'random_forest_model.onnx' + ) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index b0fb3ca..48fb91d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,11 @@ ezkl -torch +torch==2.8.0 --index-url https://download.pytorch.org/whl/cpu numpy onnx +pandas +matplotlib +seaborn +scikit-learn +xgboost +imbalanced-learn +hummingbird-ml \ No newline at end of file