-
Notifications
You must be signed in to change notification settings - Fork 75
Expand file tree
/
Copy pathAssignment+3.py
More file actions
280 lines (201 loc) · 8.37 KB
/
Assignment+3.py
File metadata and controls
280 lines (201 loc) · 8.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# coding: utf-8
# ---
#
# _You are currently looking at **version 1.1** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._
#
# ---
# # Assignment 3 - Evaluation
#
# In this assignment you will train several models and evaluate how effectively they predict instances of fraud using data based on [this dataset from Kaggle](https://www.kaggle.com/dalpozz/creditcardfraud).
#
# Each row in `fraud_data.csv` corresponds to a credit card transaction. Features include confidential variables `V1` through `V28` as well as `Amount` which is the amount of the transaction.
#
# The target is stored in the `class` column, where a value of 1 corresponds to an instance of fraud and 0 corresponds to an instance of not fraud.
# In[45]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits  # NOTE(review): unused here — likely left over from a template
# Load the transaction data once at module level; the target column is 'Class'
# (1 = fraud, 0 = not fraud, per the notebook header above).
data_frame = pd.read_csv('fraud_data.csv')
X, y = data_frame.drop('Class', axis=1), data_frame.Class;
# ### Question 1
# Import the data from `fraud_data.csv`. What percentage of the observations in the dataset are instances of fraud?
#
# *This function should return a float between 0 and 1.*
# In[54]:
def answer_one():
    """Return the fraction of observations that are instances of fraud.

    Reads fraud_data.csv and computes the proportion of rows whose
    'Class' value is 1.

    Returns
    -------
    float
        A value between 0 and 1.
    """
    data_frame = pd.read_csv('fraud_data.csv')
    y = data_frame['Class']
    # The mean of a 0/1 indicator column IS the positive-class fraction;
    # this replaces the redundant len(y[y==1]) / (len(y[y==1]) + len(y[y==0]))
    # and drops the unused load_digits import from the original.
    return float(y.mean())
# In[56]:
# Use X_train, X_test, y_train, y_test for all of the following questions
from sklearn.model_selection import train_test_split

# Shared train/test split used by all the answer_* functions below.
# The last column of fraud_data.csv is the 'Class' target; everything
# before it ('V1'..'V28', 'Amount') is a feature.
df = pd.read_csv('fraud_data.csv')
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
# Fixed random_state so the grader sees the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# ### Question 2
#
# Using `X_train`, `X_test`, `y_train`, and `y_test` (as defined above), train a dummy classifier that classifies everything as the majority class of the training data. What is the accuracy of this classifier? What is the recall?
#
# *This function should return a tuple with two floats, i.e. `(accuracy score, recall score)`.*
# In[74]:
def answer_two():
    """Train a majority-class dummy classifier and evaluate it.

    Uses the module-level X_train/X_test/y_train/y_test split.

    Returns
    -------
    tuple of (float, float)
        (accuracy score, recall score) on the test set.
    """
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import recall_score

    # strategy='most_frequent' predicts the majority class of the training
    # data, as the question requires.  The bare DummyClassifier() default
    # is NOT majority-class and has changed across sklearn versions
    # ('stratified' historically, 'prior' now), so it must be explicit.
    clf = DummyClassifier(strategy='most_frequent')
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)
    # Distinct local name: the original rebound `recall_score`, shadowing
    # the imported function after the first call.
    recall = recall_score(y_test, predictions)
    return accuracy, recall
# In[75]:
answer_two()
# ### Question 3
#
# Using X_train, X_test, y_train, y_test (as defined above), train a SVC classifier using the default parameters. What is the accuracy, recall, and precision of this classifier?
#
# *This function should return a tuple with three floats, i.e. `(accuracy score, recall score, precision score)`.*
# In[78]:
def answer_three():
    """Train an SVC with default parameters and evaluate it.

    Uses the module-level X_train/X_test/y_train/y_test split.

    Returns
    -------
    tuple of (float, float, float)
        (accuracy score, recall score, precision score) on the test set.
    """
    from sklearn.metrics import recall_score, precision_score
    from sklearn.svm import SVC

    clf = SVC()
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    accuracy = clf.score(X_test, y_test)
    # Distinct local names: the original rebound recall_score and
    # precision_score, shadowing the imported metric functions.
    recall = recall_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    return (accuracy, recall, precision)
# In[80]:
answer_three()
# ### Question 4
#
# Using the SVC classifier with parameters `{'C': 1e9, 'gamma': 1e-07}`, what is the confusion matrix when using a threshold of -220 on the decision function. Use X_test and y_test.
#
# *This function should return a confusion matrix, a 2x2 numpy array with 4 integers.*
# In[88]:
def answer_four():
    """Confusion matrix for SVC(C=1e9, gamma=1e-07) at threshold -220.

    Thresholds the decision function at -220 (instead of the default 0)
    to form predictions, then builds the confusion matrix on the test set.

    Returns
    -------
    numpy.ndarray
        2x2 confusion matrix of integers.
    """
    from sklearn.metrics import confusion_matrix
    from sklearn.svm import SVC

    clf = SVC(C=1e9, gamma=1e-07)
    clf.fit(X_train, y_train)
    # Boolean predictions: positive where the decision value exceeds -220.
    y_pred = clf.decision_function(X_test) > -220
    # Distinct local name: the original rebound `confusion_matrix`,
    # shadowing the imported function after the first call.
    return confusion_matrix(y_test, y_pred)
# In[89]:
answer_four()
# ### Question 5
#
# Train a logistic regression classifier with default parameters using X_train and y_train.
#
# For the logistic regression classifier, create a precision recall curve and a roc curve using y_test and the probability estimates for X_test (probability it is fraud).
#
# Looking at the precision recall curve, what is the recall when the precision is `0.75`?
#
# Looking at the roc curve, what is the true positive rate when the false positive rate is `0.16`?
#
# *This function should return a tuple with two floats, i.e. `(recall, true positive rate)`.*
# In[ ]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import precision_recall_curve, roc_curve, auc
# %matplotlib notebook
# import seaborn as sns
# import matplotlib.pyplot as plt
# lr = LogisticRegression().fit(X_train, y_train)
# lr_predicted = lr.predict(X_test)
# precision, recall, thresholds = precision_recall_curve(y_test, lr_predicted)
# fpr_lr, tpr_lr, _ = roc_curve(y_test, lr_predicted)
# closest_zero = np.argmin(np.abs(thresholds))
# closest_zero_p = precision[closest_zero]
# closest_zero_r = recall[closest_zero]
# plt.figure()
# plt.xlim([0.0, 1.01])
# plt.ylim([0.0, 1.01])
# plt.plot(precision, recall, label='Precision-Recall Curve')
# plt.plot(closest_zero_p, closest_zero_r, 'o', markersize = 12, fillstyle = 'none', c='r', mew=3)
# plt.xlabel('Precision', fontsize=16)
# plt.ylabel('Recall', fontsize=16)
# plt.axes().set_aspect('equal')
# plt.show()
# roc_auc_lr = auc(fpr_lr, tpr_lr)
# plt.figure()
# plt.xlim([-0.01, 1.00])
# plt.ylim([-0.01, 1.01])
# plt.plot(fpr_lr, tpr_lr, lw=3, label='LogRegr ROC curve (area = {:0.2f})'.format(roc_auc_lr))
# plt.xlabel('False Positive Rate', fontsize=16)
# plt.ylabel('True Positive Rate', fontsize=16)
# plt.title('ROC curve (1-of-10 digits classifier)', fontsize=16)
# plt.legend(loc='lower right', fontsize=13)
# plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
# plt.axes().set_aspect('equal')
# plt.show()
# In[ ]:
def answer_five():
    """Return (recall at precision 0.75, TPR at FPR 0.16).

    The two values were read off the precision-recall and ROC curves of
    a default logistic regression (see the commented-out exploration
    code above); they are returned as hard-coded constants.
    """
    recall_at_precision_075 = 0.83
    tpr_at_fpr_016 = 0.94
    return recall_at_precision_075, tpr_at_fpr_016
# ### Question 6
#
# Perform a grid search over the parameters listed below for a Logistic Regression classifier, using recall for scoring and the default 3-fold cross validation.
#
# `'penalty': ['l1', 'l2']`
#
# `'C':[0.01, 0.1, 1, 10, 100]`
#
# From `.cv_results_`, create an array of the mean test scores of each parameter combination. i.e.
#
# | | `l1` | `l2` |
# |:----: |---- |---- |
# | **`0.01`** | ? | ? |
# | **`0.1`** | ? | ? |
# | **`1`** | ? | ? |
# | **`10`** | ? | ? |
# | **`100`** | ? | ? |
#
# <br>
#
# *This function should return a 5 by 2 numpy array with 10 floats.*
#
# *Note: do not return a DataFrame, just the values denoted by '?' above in a numpy array.*
# In[137]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Scratch/exploration version of the Question 6 grid search (same logic as
# answer_six below).  solver='liblinear' is required for penalty='l1':
# modern sklearn defaults to 'lbfgs', which raises an error on the l1
# penalty, whereas liblinear was the default when this notebook was written.
clf = LogisticRegression(solver='liblinear')
grid_values = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(clf, param_grid=grid_values, scoring='recall')
grid_search.fit(X_train, y_train)
cv_result = grid_search.cv_results_
# One mean test score per (C, penalty) combination -> reshape to a 5x2
# grid: rows are the 5 C values, columns are the l1/l2 penalties.
mean_test_score = cv_result['mean_test_score']
result = np.array(mean_test_score).reshape(5, 2)
# In[138]:
def answer_six():
    """Grid-search a logistic regression over C and penalty, scored by recall.

    Searches C in [0.01, 0.1, 1, 10, 100] crossed with penalty in
    ['l1', 'l2'] using the estimator's default cross-validation.

    Returns
    -------
    numpy.ndarray
        5x2 array of mean test recall scores: rows correspond to the C
        values (in order), columns to penalties ('l1', 'l2').
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression

    # solver='liblinear' is required for penalty='l1': modern sklearn
    # defaults to 'lbfgs', which rejects the l1 penalty; liblinear was
    # the default solver when this notebook was written.
    clf = LogisticRegression(solver='liblinear')
    grid_values = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
    grid_search = GridSearchCV(clf, param_grid=grid_values, scoring='recall')
    grid_search.fit(X_train, y_train)
    # One score per (C, penalty) combination -> 5 rows (C) x 2 cols (penalty).
    mean_test_score = grid_search.cv_results_['mean_test_score']
    return np.array(mean_test_score).reshape(5, 2)
# In[139]:
answer_six()
# In[141]:
# Use the following function to help visualize results from the grid search
def GridSearch_Heatmap(scores):
    """Visualize the 5x2 grid-search recall scores as a labelled heatmap.

    Parameters
    ----------
    scores : array-like
        10 mean test scores, reshaped to 5 rows (C values) by 2 columns
        (penalties), matching the layout returned by answer_six().
    """
    # run_line_magic replaces the deprecated get_ipython().magic() API.
    get_ipython().run_line_magic('matplotlib', 'notebook')
    import seaborn as sns
    import matplotlib.pyplot as plt
    plt.figure()
    sns.heatmap(scores.reshape(5,2), xticklabels=['l1','l2'], yticklabels=[0.01, 0.1, 1, 10, 100])
    plt.yticks(rotation=0);
# GridSearch_Heatmap(answer_six())