-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrandomforest.py
More file actions
143 lines (124 loc) · 5.38 KB
/
randomforest.py
File metadata and controls
143 lines (124 loc) · 5.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import numpy as np
import math
class RandomForest():
    """Bagged ensemble of regression DecisionTrees.

    Each tree is fit on a random row subsample and a random subset of
    feature columns; predictions are the mean over all trees.
    """
    def __init__(self, x, y, n_trees, n_features, sample_sz, depth=10, min_leaf=5):
        # Fixed seed so the row/feature subsampling is reproducible run-to-run.
        np.random.seed(12)
        # Resolve the per-tree feature count from the usual shorthand names.
        if n_features == 'sqrt':
            self.n_features = int(np.sqrt(x.shape[1]))
        elif n_features == 'log2':
            self.n_features = int(np.log2(x.shape[1]))
        else:
            self.n_features = n_features
        self.x = x
        self.y = y
        self.sample_sz = sample_sz
        self.depth = depth
        self.min_leaf = min_leaf
        self.trees = [self.create_tree() for _ in range(n_trees)]

    def create_tree(self):
        """Fit one DecisionTree on a random row sample and feature subset."""
        row_idxs = np.random.permutation(len(self.y))[:self.sample_sz]
        feat_idxs = np.random.permutation(self.x.shape[1])[:self.n_features]
        return DecisionTree(self.x[row_idxs], self.y[row_idxs], self.n_features,
                            feat_idxs, idxs=np.arange(self.sample_sz),
                            depth=self.depth, min_leaf=self.min_leaf)

    def predict(self, x):
        """Return the per-row average of every tree's prediction on x."""
        return np.mean([tree.predict(x) for tree in self.trees], axis=0)
def std_agg(cnt, s1, s2): return math.sqrt((s2/cnt) - (s1/cnt)**2)
class DecisionTree():
    """One node of a regression tree (recursively builds its children).

    Splits greedily on the (feature, threshold) pair minimizing the
    weighted sum of child standard deviations; leaves predict the mean
    target of their rows.
    """
    def __init__(self, x, y, n_features, f_idxs,idxs,depth=10, min_leaf=5):
        # x, y: full arrays shared by all nodes; idxs selects this node's rows.
        # f_idxs: feature columns this node is allowed to split on.
        self.x, self.y, self.idxs, self.min_leaf, self.f_idxs = x, y, idxs, min_leaf, f_idxs
        self.depth = depth
        #print(f_idxs)
        # print(self.depth)
        self.n_features = n_features
        self.n, self.c = len(idxs), x.shape[1]
        # Leaf prediction: mean target over this node's rows.
        self.val = np.mean(y[idxs])
        # 'inf' means "no split found yet"; is_leaf keys off this sentinel.
        self.score = float('inf')
        self.find_varsplit()
    def find_varsplit(self):
        """Search allowed features for the best split, then recurse into children."""
        for i in self.f_idxs: self.find_better_split(i)
        if self.is_leaf: return
        x = self.split_col
        # Partition this node's rows by the chosen threshold.
        lhs = np.nonzero(x<=self.split)[0]
        rhs = np.nonzero(x>self.split)[0]
        # Each child draws its own fresh random feature subset.
        lf_idxs = np.random.permutation(self.x.shape[1])[:self.n_features]
        rf_idxs = np.random.permutation(self.x.shape[1])[:self.n_features]
        self.lhs = DecisionTree(self.x, self.y, self.n_features, lf_idxs, self.idxs[lhs], depth=self.depth-1, min_leaf=self.min_leaf)
        self.rhs = DecisionTree(self.x, self.y, self.n_features, rf_idxs, self.idxs[rhs], depth=self.depth-1, min_leaf=self.min_leaf)
    def find_better_split(self, var_idx):
        """Scan feature var_idx for a threshold beating self.score.

        Sorts rows by the feature once, then sweeps candidate splits while
        maintaining running count/sum/sum-of-squares for both sides, so each
        candidate's std-dev score is O(1) via std_agg.
        """
        x, y = self.x[self.idxs,var_idx], self.y[self.idxs]
        sort_idx = np.argsort(x)
        sort_y,sort_x = y[sort_idx], x[sort_idx]
        # Everything starts on the right; rows move left one at a time.
        rhs_cnt,rhs_sum,rhs_sum2 = self.n, sort_y.sum(), (sort_y**2).sum()
        lhs_cnt,lhs_sum,lhs_sum2 = 0,0.,0.
        # NOTE(review): bound n - min_leaf - 1 looks one short of the usual
        # n - min_leaf sweep (the last admissible split is never scored) —
        # confirm whether this off-by-one is intended.
        for i in range(0,self.n-self.min_leaf-1):
            xi,yi = sort_x[i],sort_y[i]
            lhs_cnt += 1; rhs_cnt -= 1
            lhs_sum += yi; rhs_sum -= yi
            lhs_sum2 += yi**2; rhs_sum2 -= yi**2
            # Skip splits leaving fewer than min_leaf rows on the left, and
            # ties (can't split between equal feature values).
            if i<self.min_leaf or xi==sort_x[i+1]:
                continue
            lhs_std = std_agg(lhs_cnt, lhs_sum, lhs_sum2)
            rhs_std = std_agg(rhs_cnt, rhs_sum, rhs_sum2)
            # Lower weighted std-dev sum = more homogeneous children.
            curr_score = lhs_std*lhs_cnt + rhs_std*rhs_cnt
            if curr_score<self.score:
                self.var_idx,self.score,self.split = var_idx,curr_score,xi
    @property
    def split_name(self):
        # NOTE(review): assumes self.x is a DataFrame with .columns; with the
        # numpy arrays used in this script this would raise AttributeError.
        # Appears unused here.
        return self.x.columns[self.var_idx]
    @property
    def split_col(self):
        # Values of the chosen split feature for this node's rows.
        return self.x[self.idxs,self.var_idx]
    @property
    def is_leaf(self):
        # Leaf if no split improved on the 'inf' sentinel, or depth exhausted.
        return self.score == float('inf') or self.depth <= 0
    def predict(self, x):
        """Predict a value for every row of x."""
        return np.array([self.predict_row(xi) for xi in x])
    def predict_row(self, xi):
        """Route one sample down the tree to a leaf and return its mean."""
        if self.is_leaf: return self.val
        t = self.lhs if xi[self.var_idx]<=self.split else self.rhs
        return t.predict_row(xi)
import pandas as pd

# Load the price data. NOTE(review): read_csv's `usecols` keeps the CSV
# file's own column order, not the order of this list — the positional
# slices below depend on that file order; confirm against dataset.csv.
dataset = pd.read_csv('dataset.csv', usecols=['Date', 'Open', 'Close', 'High', 'Low', 'Adj Close'])
dates = dataset.iloc[:, 0].values      # first column: Date
x = dataset.iloc[:, 1:-2].values       # feature columns
y = dataset.iloc[:, -2].values         # target column (closing price)

# Normalize features and target by a fixed scale factor. Vectorized numpy
# division replaces the original element-wise Python loops — same values,
# one C-level pass instead of O(rows*cols) interpreter iterations.
x = x / 15000
y = y / 15000

# Chronological split: first 200 rows train, the remainder test.
x_train, y_train, x_test, y_test = x[:200], y[:200], x[200:], y[200:]
rf = RandomForest(x_train, y_train, 10, 'log2', 200)
y_res = rf.predict(x_test)

# Error metrics on the (normalized) test set, computed vectorized instead
# of the original Python accumulation loops (identical results):
#   RMSE = sqrt(mean((y - yhat)^2))
#   MBE  = mean(y - yhat)
# NOTE(review): the "MAPE" here is the mean SIGNED relative error — the
# original loop takes no abs(), so positive and negative errors cancel;
# preserved as-is, but confirm that is intended.
errors = y_test - y_res
rmse_score = math.sqrt(np.mean(errors ** 2))
mape_score = np.mean(errors / y_test) * 100
mbe_score = np.mean(errors)
rmse_score_percentage = rmse_score * 100
print("The percentage RMSE score is: ", rmse_score_percentage)
print("The MAPE score is: ", mape_score)
print("The MBE score is: ", mbe_score)
import matplotlib.pyplot as plt

# Denormalize predictions and actuals back to price units for plotting
# (comprehensions replace the original append loops).
plot_x = dates[200:]
plot_pred_y = [v * 15000 for v in y_res]
plot_orig_y = [v * 15000 for v in y_test]

plt.xlabel("Date")
plt.ylabel("Price")
plt.title('Closing price prediction')
plt.plot(plot_x, plot_pred_y, marker='o', color="blue")   # predicted
plt.plot(plot_x, plot_orig_y, marker='o', color="red")    # actual
# Thin out the date ticks so the x axis stays readable.
ax = plt.gca()
start, end = ax.get_xlim()
stepsize = 10
ax.xaxis.set_ticks(np.arange(start, end, stepsize))
plt.show()