-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrandomforest.py
More file actions
143 lines (124 loc) · 5.38 KB
/
randomforest.py
File metadata and controls
143 lines (124 loc) · 5.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import numpy as np
import math
class RandomForest():
    """Bagged ensemble of regression DecisionTrees.

    Each tree is fit on a random row subsample and a random subset of
    feature columns; predictions are the mean over all trees.
    """
    def __init__(self, x, y, n_trees, n_features, sample_sz, depth=10, min_leaf=5):
        # Fixed seed so the row/feature subsampling is reproducible run-to-run.
        np.random.seed(12)
        # Resolve the per-tree feature count from the usual shorthand names.
        if n_features == 'sqrt':
            self.n_features = int(np.sqrt(x.shape[1]))
        elif n_features == 'log2':
            self.n_features = int(np.log2(x.shape[1]))
        else:
            self.n_features = n_features
        self.x = x
        self.y = y
        self.sample_sz = sample_sz
        self.depth = depth
        self.min_leaf = min_leaf
        self.trees = [self.create_tree() for _ in range(n_trees)]

    def create_tree(self):
        """Fit one DecisionTree on a random row sample and feature subset."""
        row_idxs = np.random.permutation(len(self.y))[:self.sample_sz]
        feat_idxs = np.random.permutation(self.x.shape[1])[:self.n_features]
        return DecisionTree(self.x[row_idxs], self.y[row_idxs], self.n_features,
                            feat_idxs, idxs=np.arange(self.sample_sz),
                            depth=self.depth, min_leaf=self.min_leaf)

    def predict(self, x):
        """Return the per-row average of every tree's prediction on x."""
        return np.mean([tree.predict(x) for tree in self.trees], axis=0)
def std_agg(cnt, s1, s2): return math.sqrt((s2/cnt) - (s1/cnt)**2)
class DecisionTree():
    """One node of a regression tree (recursively builds its children).

    Splits greedily on the (feature, threshold) pair minimizing the
    weighted sum of child standard deviations; leaves predict the mean
    target of their rows.
    """
    def __init__(self, x, y, n_features, f_idxs,idxs,depth=10, min_leaf=5):
        # x, y: full arrays shared by all nodes; idxs selects this node's rows.
        # f_idxs: feature columns this node is allowed to split on.
        self.x, self.y, self.idxs, self.min_leaf, self.f_idxs = x, y, idxs, min_leaf, f_idxs
        self.depth = depth
        #print(f_idxs)
        # print(self.depth)
        self.n_features = n_features
        self.n, self.c = len(idxs), x.shape[1]
        # Leaf prediction: mean target over this node's rows.
        self.val = np.mean(y[idxs])
        # 'inf' means "no split found yet"; is_leaf keys off this sentinel.
        self.score = float('inf')
        self.find_varsplit()
    def find_varsplit(self):
        """Search allowed features for the best split, then recurse into children."""
        for i in self.f_idxs: self.find_better_split(i)
        if self.is_leaf: return
        x = self.split_col
        # Partition this node's rows by the chosen threshold.
        lhs = np.nonzero(x<=self.split)[0]
        rhs = np.nonzero(x>self.split)[0]
        # Each child draws its own fresh random feature subset.
        lf_idxs = np.random.permutation(self.x.shape[1])[:self.n_features]
        rf_idxs = np.random.permutation(self.x.shape[1])[:self.n_features]
        self.lhs = DecisionTree(self.x, self.y, self.n_features, lf_idxs, self.idxs[lhs], depth=self.depth-1, min_leaf=self.min_leaf)
        self.rhs = DecisionTree(self.x, self.y, self.n_features, rf_idxs, self.idxs[rhs], depth=self.depth-1, min_leaf=self.min_leaf)
    def find_better_split(self, var_idx):
        """Scan feature var_idx for a threshold beating self.score.

        Sorts rows by the feature once, then sweeps candidate splits while
        maintaining running count/sum/sum-of-squares for both sides, so each
        candidate's std-dev score is O(1) via std_agg.
        """
        x, y = self.x[self.idxs,var_idx], self.y[self.idxs]
        sort_idx = np.argsort(x)
        sort_y,sort_x = y[sort_idx], x[sort_idx]
        # Everything starts on the right; rows move left one at a time.
        rhs_cnt,rhs_sum,rhs_sum2 = self.n, sort_y.sum(), (sort_y**2).sum()
        lhs_cnt,lhs_sum,lhs_sum2 = 0,0.,0.
        # NOTE(review): bound n - min_leaf - 1 looks one short of the usual
        # n - min_leaf sweep (the last admissible split is never scored) —
        # confirm whether this off-by-one is intended.
        for i in range(0,self.n-self.min_leaf-1):
            xi,yi = sort_x[i],sort_y[i]
            lhs_cnt += 1; rhs_cnt -= 1
            lhs_sum += yi; rhs_sum -= yi
            lhs_sum2 += yi**2; rhs_sum2 -= yi**2
            # Skip splits leaving fewer than min_leaf rows on the left, and
            # ties (can't split between equal feature values).
            if i<self.min_leaf or xi==sort_x[i+1]:
                continue
            lhs_std = std_agg(lhs_cnt, lhs_sum, lhs_sum2)
            rhs_std = std_agg(rhs_cnt, rhs_sum, rhs_sum2)
            # Lower weighted std-dev sum = more homogeneous children.
            curr_score = lhs_std*lhs_cnt + rhs_std*rhs_cnt
            if curr_score<self.score:
                self.var_idx,self.score,self.split = var_idx,curr_score,xi
    @property
    def split_name(self):
        # NOTE(review): assumes self.x is a DataFrame with .columns; with the
        # numpy arrays used in this script this would raise AttributeError.
        # Appears unused here.
        return self.x.columns[self.var_idx]
    @property
    def split_col(self):
        # Values of the chosen split feature for this node's rows.
        return self.x[self.idxs,self.var_idx]
    @property
    def is_leaf(self):
        # Leaf if no split improved on the 'inf' sentinel, or depth exhausted.
        return self.score == float('inf') or self.depth <= 0
    def predict(self, x):
        """Predict a value for every row of x."""
        return np.array([self.predict_row(xi) for xi in x])
    def predict_row(self, xi):
        """Route one sample down the tree to a leaf and return its mean."""
        if self.is_leaf: return self.val
        t = self.lhs if xi[self.var_idx]<=self.split else self.rhs
        return t.predict_row(xi)
import pandas as pd

# Load the price data. NOTE(review): read_csv's `usecols` keeps the CSV
# file's own column order, not the order of this list — the positional
# slices below depend on that file order; confirm against dataset.csv.
dataset = pd.read_csv('dataset.csv', usecols=['Date', 'Open', 'Close', 'High', 'Low', 'Adj Close'])
dates = dataset.iloc[:, 0].values      # first column: Date
x = dataset.iloc[:, 1:-2].values       # feature columns
y = dataset.iloc[:, -2].values         # target column (closing price)

# Normalize features and target by a fixed scale factor. Vectorized numpy
# division replaces the original element-wise Python loops — same values,
# one C-level pass instead of O(rows*cols) interpreter iterations.
x = x / 15000
y = y / 15000

# Chronological split: first 200 rows train, the remainder test.
x_train, y_train, x_test, y_test = x[:200], y[:200], x[200:], y[200:]
rf = RandomForest(x_train, y_train, 10, 'log2', 200)
y_res = rf.predict(x_test)

# Error metrics on the (normalized) test set, computed vectorized instead
# of the original Python accumulation loops (identical results):
#   RMSE = sqrt(mean((y - yhat)^2))
#   MBE  = mean(y - yhat)
# NOTE(review): the "MAPE" here is the mean SIGNED relative error — the
# original loop takes no abs(), so positive and negative errors cancel;
# preserved as-is, but confirm that is intended.
errors = y_test - y_res
rmse_score = math.sqrt(np.mean(errors ** 2))
mape_score = np.mean(errors / y_test) * 100
mbe_score = np.mean(errors)
rmse_score_percentage = rmse_score * 100
print("The percentage RMSE score is: ", rmse_score_percentage)
print("The MAPE score is: ", mape_score)
print("The MBE score is: ", mbe_score)
import matplotlib.pyplot as plt

# Denormalize predictions and actuals back to price units for plotting
# (comprehensions replace the original append loops).
plot_x = dates[200:]
plot_pred_y = [v * 15000 for v in y_res]
plot_orig_y = [v * 15000 for v in y_test]

plt.xlabel("Date")
plt.ylabel("Price")
plt.title('Closing price prediction')
plt.plot(plot_x, plot_pred_y, marker='o', color="blue")   # predicted
plt.plot(plot_x, plot_orig_y, marker='o', color="red")    # actual
# Thin out the date ticks so the x axis stays readable.
ax = plt.gca()
start, end = ax.get_xlim()
stepsize = 10
ax.xaxis.set_ticks(np.arange(start, end, stepsize))
plt.show()