-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathModel_development.py
More file actions
108 lines (85 loc) · 3.16 KB
/
Model_development.py
File metadata and controls
108 lines (85 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"Created by Himanshu swamy"
import datetime
import pandas as pd
import numpy as np
import pymysql
import pymysql.cursors
from os import getenv
import sqlalchemy
from google.cloud import storage
from sklearn.externals import joblib
from google.cloud import storage
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
import googleapiclient.discovery
from googleapiclient.discovery import build
from oauth2client.client import GoogleCredentials
from sqlalchemy import create_engine
from sklearn.preprocessing import LabelEncoder
from urllib.parse import quote

# Name of the Google Cloud Storage bucket that receives the trained model.
BUCKET_NAME = 'raw_data_scripts'

# TODO(developer): specify SQL connection details
# NOTE(review): CONNECTION_NAME is read from the environment but nothing in
# this script consumes it; both connections below go to DB_HOST directly.
CONNECTION_NAME = getenv(
    'INSTANCE_CONNECTION_NAME',
    'credit-default:us-central1:credit-work')
DB_USER = getenv('MYSQL_USER', 'root')
# NOTE(review): the fallback is a well-known default password; set
# MYSQL_PASSWORD in the environment for anything beyond local experiments.
DB_PASSWORD = getenv('MYSQL_PASSWORD', 'root')
DB_NAME = getenv('MYSQL_DATABASE', 'credit_risk')
# Single source of truth for the database host. The default is the address
# that was previously hard-coded in two places; MYSQL_HOST overrides it.
DB_HOST = getenv('MYSQL_HOST', '104.154.35.150')

# Keyword arguments for the raw PyMySQL connection.
mysql_config = {
    'host': DB_HOST,
    'user': DB_USER,
    'password': DB_PASSWORD,
    'db': DB_NAME,
    'charset': 'utf8mb4',
    # Rows come back as dicts keyed by column name.
    'cursorclass': pymysql.cursors.DictCursor,
    'autocommit': True,
}

# SQLAlchemy engine for the same database. The user name and password are
# percent-encoded so that characters such as '@', '/' or ':' in a credential
# cannot corrupt the URL; plain alphanumeric values are left unchanged.
database_connection = sqlalchemy.create_engine(
    'mysql+mysqlconnector://{0}:{1}@{2}/{3}'.format(
        quote(DB_USER, safe=''),
        quote(DB_PASSWORD, safe=''),
        DB_HOST,
        DB_NAME))

# Both connections are opened eagerly at import time.
connection = pymysql.connect(**mysql_config)
connection1 = database_connection.connect()
# ---------------------------------------------------------------------------
# Load the `train_test_data` table, fit a logistic-regression model on it,
# serialize the model to a local file and upload that file to the
# BUCKET_NAME bucket in Google Cloud Storage.
# ---------------------------------------------------------------------------

# Fetch every row of the training table. Only the database work sits inside
# the try block, so the handles are released as soon as the data is in memory
# instead of staying open during training and the upload.
try:
    with connection.cursor() as cursor:
        sql = "SELECT * FROM train_test_data"
        cursor.execute(sql)
        sql_data = pd.DataFrame(cursor.fetchall())
finally:
    try:
        connection.close()
    finally:
        # The SQLAlchemy connection opened during setup is not used by this
        # script; release it and the engine's pool so nothing is leaked.
        connection1.close()
        database_connection.dispose()

if sql_data.empty:
    # Fail with a clear message rather than a KeyError from the drops below.
    raise ValueError('train_test_data returned no rows; nothing to train on')

app_train = sql_data

# Data cleaning: label-encode object-typed columns that hold at most two
# distinct values, replacing them in place with integer codes.
# NOTE(review): object columns with more than two distinct values are left
# untouched (no one-hot encoding is applied here). If the table contains any,
# the fit below will presumably reject them -- confirm against the schema.
le = LabelEncoder()
for col in app_train:
    if app_train[col].dtype == 'object' and len(app_train[col].unique()) <= 2:
        app_train[col] = le.fit_transform(app_train[col])

# Build the feature matrix and label vector.
# SK_ID_CURR is removed before training (presumably a row identifier --
# verify), TARGET_var is the label column, missing values are replaced with
# 0, and CODE_GENDER is excluded from the features.
app_train = app_train.drop(columns=['SK_ID_CURR'])
train_labels = app_train['TARGET_var']
train = app_train.drop(columns=['TARGET_var'])
train = train.fillna(0)
train = train.drop(columns=['CODE_GENDER'])

# C is the inverse regularization strength in scikit-learn, so this small
# value applies strong regularization.
classifier = LogisticRegression(C=0.0001)
classifier.fit(train, train_labels)

# Wrap the already-fitted classifier in a single-step Pipeline so the
# serialized artifact is a Pipeline object.
pipeline = Pipeline([
    ('classifier', classifier),
])

model = 'model.joblib'
joblib.dump(pipeline, model)

# Upload the model to GCS under a folder named after the current local time,
# i.e. "credit_YYYYMMDD_HHMMSS/model.joblib".
bucket = storage.Client().bucket(BUCKET_NAME)
blob = bucket.blob('{}/{}'.format(
    datetime.datetime.now().strftime('credit_%Y%m%d_%H%M%S'),
    model))
blob.upload_from_filename(model)
print(model)