Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ settings.json
pk.key
vk.key

# data files
model/data/

# Editors
.vscode/
.idea/
Expand Down
69 changes: 69 additions & 0 deletions model/EDA.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Exploratory Data Analysis

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

#DATA_PATH = 'data/transaction_dataset.csv'
DATA_PATH = 'data/data.csv'

def print_info(df):
    """Print a quick text overview of *df*: dtypes, stats, missing values, head."""
    sections = (
        ("DataFrame Info:", lambda d: d.info()),
        ("\nStatistical Summary:", lambda d: d.describe()),
        ("\nMissing Values:", lambda d: d.isnull().sum()),
        ("\nFirst 5 Rows:", lambda d: d.head()),
    )
    for title, extract in sections:
        print(title)
        # df.info() prints directly and returns None; the others return objects.
        print(extract(df))


def save_image(fig, filename):
    """Persist *fig* under the ``images/`` folder, creating it if needed."""
    target_dir = 'images'
    os.makedirs(target_dir, exist_ok=True)
    fig.savefig(os.path.join(target_dir, filename))


def inspect_flag_distribution(df, flag_column):
    """Print and plot (pie chart) the target/fraud flag distribution.

    Fixes vs. the previous version:
    - Labels are derived from the actual ``value_counts()`` index instead of
      being hard-coded as ['Non-fraud', 'Fraud'] — value_counts sorts by
      frequency, so hard-coded labels would be wrong if fraud were ever the
      majority class.
    - The figure is saved *before* plt.show(): on some backends the canvas
      is cleared when the window closes, producing an empty image file.
    """
    counts = df[flag_column].value_counts()
    print(counts)

    # Assumes FLAG uses 0 = non-fraud, 1 = fraud; unknown values fall back
    # to their string representation.
    label_map = {0: 'Non-fraud', 1: 'Fraud'}
    labels = [label_map.get(value, str(value)) for value in counts.index]
    colors = ['#f9ae35', '#f64e38']

    fig, ax = plt.subplots(figsize=[15, 10])
    plt.pie(
        x=counts,
        autopct='%.2f%%',
        explode=[0.02] * len(counts),
        labels=labels,
        pctdistance=0.5,
        textprops={'fontsize': 14},
        colors=colors,
    )
    plt.title('Target distribution')
    save_image(fig, "flag_distribution.png")
    plt.show()


def correlation_heatmap(df, annotation=False):
    """Draw and save a lower-triangle correlation heatmap of the numeric features.

    Args:
        df: input DataFrame; the first two (identifier) columns are skipped.
        annotation: when True, write the correlation value in each cell.
            Previously this flag was accepted but ignored (annot=False was
            hard-coded); it is now forwarded to seaborn.
    """
    # Skip the first two identifier columns and keep numeric data only.
    cleaned = df.iloc[:, 2:].copy()
    categorical_cols = cleaned.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        print(f"Dropping {len(categorical_cols)} categorical columns")
        cleaned.drop(columns=categorical_cols, inplace=True)

    # Median imputation so NaNs don't silently shrink the pairwise samples.
    cleaned.fillna(cleaned.median(numeric_only=True), inplace=True)

    corr = cleaned.corr()

    # Mask the upper triangle: the matrix is symmetric, so it adds no info.
    mask = np.zeros_like(corr)
    mask[np.triu_indices_from(mask)] = True
    with sns.axes_style('white'):
        fig, ax = plt.subplots(figsize=(20, 12))
        sns.heatmap(corr, mask=mask, annot=annotation, cmap='coolwarm',
                    center=0, square=True)
        plt.title('Correlation Heatmap', fontsize=10)
        # Save before show(): some backends clear the canvas on close.
        save_image(fig, "correlation_heatmap.png")
        plt.show()


def main():
    """Run the EDA pipeline: load data, summarize, plot target + correlations."""
    frame = pd.read_csv(DATA_PATH)
    print_info(frame)
    inspect_flag_distribution(frame, 'FLAG')
    correlation_heatmap(frame, annotation=False)


if __name__ == "__main__":
    main()
Binary file added model/images/correlation_heatmap.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added model/images/flag_distribution.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
82 changes: 82 additions & 0 deletions model/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""Preprocessing to prepare the data."""

from pathlib import Path
import pandas as pd

DATA_PATH = 'data/transaction_dataset.csv'
CLEAN_DATA_PATH = 'data/data.csv'

# Columns removed during cleaning (manually selected, presumably from the EDA
# correlation heatmap — confirm with EDA.py). Several names deliberately carry
# a leading space, exactly as they appear in the raw CSV header.
# Fix: the list previously contained ' ERC20 avg val rec' twice; the
# duplicate entry has been removed.
DROP_COLUMNS = [
    'total transactions (including tnx to create contract',
    'total ether sent contracts',
    'max val sent to contract',
    ' ERC20 avg val rec',
    ' ERC20 max val rec',
    ' ERC20 min val rec',
    ' ERC20 uniq rec contract addr',
    'max val sent',
    ' ERC20 avg val sent',
    ' ERC20 min val sent',
    ' ERC20 max val sent',
    ' Total ERC20 tnxs',
    'avg value sent to contract',
    'Unique Sent To Addresses',
    'Unique Received From Addresses',
    'total ether received',
    ' ERC20 uniq sent token name',
    'min value received',
    'min val sent',
    ' ERC20 uniq rec addr',
    'min value sent to contract',
    ' ERC20 uniq sent addr.1',
]

def load_dataset(csv_path: Path) -> pd.DataFrame:
    """Read the raw CSV (first column is the index) and report its shape."""
    frame = pd.read_csv(csv_path, index_col=0)
    print(f"Loaded dataset: {frame.shape}")
    return frame


def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Return a numeric, imputed copy of *df* with redundant columns removed.

    Steps: drop the first two identifier columns, drop object-typed columns,
    median-impute missing values, drop zero-variance columns, then drop the
    manually curated DROP_COLUMNS list.
    """
    out = df.iloc[:, 2:].copy()

    object_cols = out.select_dtypes(include=['object']).columns
    if len(object_cols) > 0:
        print(f"Dropping {len(object_cols)} categorical columns")
        out = out.drop(columns=object_cols)

    # Median imputation: robust to the heavy-tailed transaction values.
    out = out.fillna(out.median(numeric_only=True))

    constant_cols = out.columns[out.var(numeric_only=True) == 0]
    if len(constant_cols) > 0:
        print(f"Dropping {len(constant_cols)} zero-variance columns")
        out = out.drop(columns=constant_cols)

    redundant = [column for column in DROP_COLUMNS if column in out.columns]
    if redundant:
        print(f"Dropping {len(redundant)} manually selected columns")
        out = out.drop(columns=redundant)

    print(f"Cleaned dataset: {out.shape}")
    return out


def save_cleaned_dataset(df: pd.DataFrame, output_path: Path):
    """Write the cleaned frame to CSV, keeping the index column."""
    df.to_csv(output_path, index=True)
    print(f"Saved cleaned dataset to {output_path}")


def main():
    """Load the raw dataset, clean it, and persist the result."""
    frame = load_dataset(Path(DATA_PATH))
    frame = clean_dataset(frame)
    # Show a summary of what survived the cleaning.
    print("Cleaned DataFrame Info:")
    print(frame.info())
    save_cleaned_dataset(frame, Path(CLEAN_DATA_PATH))


if __name__ == "__main__":
    main()
128 changes: 128 additions & 0 deletions model/training.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import numpy as np
import pandas as pd
from pyparsing import Dict
import seaborn as sns
import torch
import xgboost as xgb
from hummingbird.ml import convert
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from pathlib import Path
import os

# Cleaned dataset produced by preprocessing.py, resolved relative to this file.
DATA_PATH = Path(__file__).resolve().parent / 'data' / 'data.csv'

def load_dataset(csv_path: Path):
    """Read the cleaned CSV and strip any auto-generated index column."""
    frame = pd.read_csv(csv_path)
    print(f"Loaded dataset: {frame.shape}")
    if 'Unnamed: 0' in frame.columns:
        frame = frame.drop(columns=['Unnamed: 0'])
    return frame


def split_dataset(df: pd.DataFrame, test_size: float = 0.2, random_state: int =42):
    """Split *df* into train/test features and the FLAG target.

    Returns (X_train, X_test, y_train, y_test).
    """
    target = df['FLAG']
    features = df.drop(columns=['FLAG'])
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    print(f"Train shapes: {X_train.shape}, {y_train.shape}")
    print(f"Test shapes: {X_test.shape}, {y_test.shape}")
    return X_train, X_test, y_train, y_test


def scale_features(X_train: pd.DataFrame, X_test: pd.DataFrame):
    """Fit a PowerTransformer on the train split and apply it to both splits.

    Returns (scaled_train, scaled_test, fitted_transformer).
    """
    transformer = PowerTransformer()
    # Fit on training data only so no test-set statistics leak into scaling.
    train_scaled = transformer.fit_transform(X_train)
    test_scaled = transformer.transform(X_test)
    print("Applied PowerTransformer normalization")
    return train_scaled, test_scaled, transformer


def balance_training_data(X_train: np.ndarray, y_train: pd.Series, random_state: int = 42):
    """Oversample the minority (fraud) class with SMOTE.

    Args:
        X_train: scaled training feature matrix.
        y_train: training labels.
        random_state: seed for SMOTE's synthetic sampling. Previously SMOTE
            was unseeded, making runs non-reproducible even though every
            other stochastic step in this file uses random_state=42.

    Returns:
        The resampled (X, y) pair with balanced classes.
    """
    smote = SMOTE(random_state=random_state)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    print(
        f"SMOTE applied: before {X_train.shape}/{y_train.shape},",
        f"after {X_resampled.shape}/{y_resampled.shape}"
    )
    return X_resampled, y_resampled


def train_models(X_train: np.ndarray, y_train: np.ndarray):
    """Fit the two candidate classifiers on the (resampled) training data.

    Returns:
        dict mapping model name ('random_forest', 'xgboost') to the fitted
        estimator.

    Fix: the annotation previously used ``Dict`` imported from *pyparsing*
    (a parser-element class, not a typing alias); the builtin generic
    ``dict`` is the correct annotation and removes that bogus dependency.
    """
    models: dict[str, object] = {}

    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train, y_train)
    models['random_forest'] = rf_model
    print("Trained RandomForestClassifier")

    xgb_model = xgb.XGBClassifier(random_state=42)
    xgb_model.fit(X_train, y_train)
    models['xgboost'] = xgb_model
    print("Trained XGBoost classifier")

    return models


def evaluate_model(name: str, model: object, X_test: np.ndarray, y_test: pd.Series):
    """Print the classification report and confusion matrix for *model* on the test split."""
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)
    print(f"\n{name} classification report:\n{report}")
    print(f"{name} confusion matrix:\n{matrix}")


def export_model_to_onnx(model: xgb.XGBClassifier, sample: np.ndarray, export_path: Path):
    """Convert a fitted tree model to Torch via hummingbird and export it as ONNX.

    *sample* is a single-row input used both for the hummingbird conversion
    and as the dummy tracing input; the batch axis is declared dynamic so the
    exported graph accepts any batch size.
    """
    os.makedirs(export_path.parent, exist_ok=True)

    # Hummingbird rewrites the tree ensemble as a torch.nn.Module.
    torch_model = convert(model, 'torch', sample).model
    dummy_input = torch.tensor(sample, dtype=torch.float32)

    torch.onnx.export(
        torch_model,
        dummy_input,
        export_path.as_posix(),
        export_params=True,
        opset_version=10,
        do_constant_folding=True,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}},
    )

    print(f"✅ Model exported to {export_path}")
    return export_path


def main():
    """End-to-end training: load, split, scale, balance, train, evaluate, export."""
    print(torch.__version__)

    frame = load_dataset(DATA_PATH)

    X_train, X_test, y_train, y_test = split_dataset(frame)
    train_scaled, test_scaled, _ = scale_features(X_train, X_test)
    X_balanced, y_balanced = balance_training_data(train_scaled, y_train)

    models = train_models(X_balanced, y_balanced)

    evaluate_model('RandomForest', models['random_forest'], test_scaled, y_test)
    evaluate_model('XGBoost', models['xgboost'], test_scaled, y_test)

    # Export both models next to this file; one test row drives the tracing.
    onnx_dir = Path(__file__).resolve().parent / 'onnx'
    export_model_to_onnx(models['xgboost'], test_scaled[:1], onnx_dir / 'xgboost_model.onnx')
    export_model_to_onnx(models['random_forest'], test_scaled[:1], onnx_dir / 'random_forest_model.onnx')


if __name__ == '__main__':
    main()
5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
ezkl
torch
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.8.0
numpy
onnx
pandas
matplotlib
seaborn