-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path modelNN.py
More file actions
95 lines (77 loc) · 2.75 KB
/
modelNN.py
File metadata and controls
95 lines (77 loc) · 2.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# CHAPTER 8 TOPIC 2
# Training Data in NN
import pandas as pd
import numpy as np
import pickle
import re
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Load the train/valid/test splits and merge them into one DataFrame
# with a fresh 0..n-1 index.
df = pd.concat(
    [pd.read_csv('train.csv'), pd.read_csv('valid.csv'), pd.read_csv('test.csv')],
    ignore_index=True,
)
print("head:", df.head())
print("shape:", df.shape)
print("df.label.value_counts()", df.label.value_counts())
def cleansing(sent):
    """Lowercase *sent* and replace every non-alphanumeric character with a space."""
    return re.sub(r'[^a-zA-Z0-9]', ' ', sent.lower())
# Clean the raw text column before feature extraction.
df['text_clean'] = df.text.apply(cleansing)
print(df.head())
# before we perform feature extraction
data_preprocessed = df.text_clean.tolist()
# FEATURE EXTRACTION; the fitted vectorizer is persisted with pickle so the
# exact same vocabulary can be reused at prediction time.
count_vect = CountVectorizer()
X = count_vect.fit_transform(data_preprocessed)  # fit + transform in one pass
print("Feature Extraction Completed")
# Context manager guarantees the file is flushed and closed (the original
# open(...) call leaked the handle).
with open("feature.p", "wb") as f:
    pickle.dump(count_vect, f)
# split into 80% training data and 20% testing data
classes = df.label
X_train, X_test, y_train, y_test = train_test_split(X, classes, test_size=0.2)
# training and storing
model = MLPClassifier(early_stopping=True, validation_fraction=0.1)  # Enable early stopping
model.fit(X_train, y_train)
print("Training Completed")
# Context manager guarantees the model file is closed even if dump fails
# (the original open(...) call leaked the handle).
with open("model.p", "wb") as f:
    pickle.dump(model, f)
# evaluation with Accuracy, Precision, Recall, and F-1 Score
y_pred = model.predict(X_test)
print("Testing Completed")
print(classification_report(y_test, y_pred))
# cross-validation
kf = KFold(n_splits=5, random_state=42,shuffle=True)
accuracies = []
y = classes
for iteration, data in enumerate(kf.split(X), start=1):
data_train = X[data[0]]
target_train = y[data[0]]
data_test = X[data[1]]
target_test = y[data[1]]
clf = MLPClassifier()
clf.fit(data_train, target_train)
preds = clf.predict(data_test)
accuracy = accuracy_score(target_test, preds)
print("Training number", iteration)
print(classification_report(target_test, preds))
print("====================================================================")
accuracies.append(accuracy)
average_accuracy = np.mean(accuracies)
print()
print()
print()
print("Average accuracy:",average_accuracy)
# prediction
original_text = '''
Aku suka kamu
'''
# Vectorize the cleaned sample with the already-fitted CountVectorizer,
# then classify it with the trained model.
sample_features = count_vect.transform([cleansing(original_text)])
result = model.predict(sample_features)[0]
print("Sentiment:", result)