# cart_pole_saturday.py — vanilla policy-gradient agent for CartPole-v0
# (146 lines, ~99 loc; TensorFlow 1.x tutorial-style script)
import numpy as np
try:
import cPickle as pickle
except:
import pickle
import tensorflow as tf
import matplotlib.pyplot as plt
import math
import tensorflow.contrib.slim as slim
try:
xrange = xrange
except:
xrange = range
import gym
# Environment: CartPole-v0 — balance a pole on a cart; observation is a
# 4-vector, action is binary (push cart left / push right).
env = gym.make('CartPole-v0')
# Hyperparameters
H = 10  # hidden-layer width
batch_size = 5  # episodes per gradient update
learning_rate = 1e-2  # Adam step size
gamma = 0.99  # reward discount factor
D = 4 # input dimensions (size of the CartPole observation)
# ---- TF1 policy network: obs -> ReLU hidden -> sigmoid P(action == 1) ----
tf.reset_default_graph()
observations = tf.placeholder(tf.float32, [None,D], name='observation')
W1 = tf.get_variable("W1", shape=[D,H], initializer=tf.contrib.layers.xavier_initializer())
layer_1 = tf.nn.relu(tf.matmul(observations,W1))
W2 = tf.get_variable("W2", shape=[H,1], initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer_1, W2)
probability = tf.nn.sigmoid(score)  # P(action == 1) for each observation
tvars = tf.trainable_variables()
# Placeholders for the "fake label" (opposite of the sampled action) and the
# per-step advantage (normalized discounted return) fed in at train time.
input_y = tf.placeholder(tf.float32,[None,1], name="actions")
advantages = tf.placeholder(tf.float32,name='reward_signal')
# Log-likelihood of the action actually taken, expressed via the fake label:
# evaluates to log(probability) when input_y == 0 (action 1 was taken) and
# to log(1 - probability) when input_y == 1 (action 0 was taken).
loglik = tf.log(input_y*(input_y - probability) + (1-input_y)*(input_y + probability))
loss = -tf.reduce_mean(loglik*advantages)  # REINFORCE policy-gradient loss
newGrads = tf.gradients(loss,tvars)
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
# Gradients are accumulated across a batch of episodes outside the graph and
# fed back through these placeholders before being applied.
w1Grad = tf.placeholder(tf.float32,name='batch_grad1')
w2Grad = tf.placeholder(tf.float32,name='batch_grad2')
batch_grad = [w1Grad, w2Grad]
updateGrads = adam.apply_gradients(zip(batch_grad, tvars))
def discount_rewards(r, gamma=0.99):
    """Compute discounted returns for a 1D (or column) array of rewards.

    Args:
        r: numpy array of per-timestep rewards.
        gamma: discount factor in [0, 1). Defaults to 0.99, matching the
            module-level ``gamma`` hyperparameter, so existing one-argument
            calls behave exactly as before.

    Returns:
        Float array with the same shape as ``r`` where entry ``t`` holds
        ``sum_{k >= t} gamma**(k - t) * r[k]``.
    """
    # Force a float buffer so integer reward arrays are not silently truncated.
    discounted_r = np.zeros_like(r, dtype=np.float64)
    running_add = 0.0
    # Walk backwards so each step folds in the discounted return of its tail.
    for t in reversed(range(r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
# Per-episode buffers: observations, hidden activations, log-prob grads,
# rewards, fake labels, sampled probabilities (hs/dlogps/tfps are unused
# legacy slots kept from the original tutorial; name unified with the reset
# below, which previously used the inconsistent spelling `dlogs`).
xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []
running_reward = None  # exponentially smoothed batch reward
reward_sum = 0         # reward accumulated over the current batch of episodes
episode_number = 1
total_episodes = 10000
init = tf.global_variables_initializer()
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    observation = env.reset()
    # Gradient accumulator, one zeroed array per trainable variable.
    gradBuffer = sess.run(tvars)
    for indx, grad in enumerate(gradBuffer):
        gradBuffer[indx] = grad * 0
    while episode_number <= total_episodes:
        # NOTE(review): CartPole-v0 caps an episode at 200 reward, so this
        # threshold looks unreachable until rendering is toggled — confirm.
        if reward_sum > 975 or rendering == True:
            env.render()
            rendering = True
        # Sample an action from the policy's Bernoulli output.
        x = np.reshape(observation, [1, D])
        tfprob = sess.run(probability, feed_dict={observations: x})
        action = 1 if np.random.uniform() < tfprob else 0
        xs.append(x)
        # Fake label is the opposite of the sampled action; combined with the
        # loglik formula it selects log-prob of the action actually taken.
        y = 1 if action == 0 else 0
        ys.append(y)
        observation, reward, done, info = env.step(action)
        reward_sum += reward
        drs.append(reward)
        if done:
            episode_number += 1
            epx = np.vstack(xs)   # stacked observations for the episode
            epy = np.vstack(ys)   # stacked fake labels
            epr = np.vstack(drs)  # stacked rewards
            tfp = tfps
            xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []
            # Standardize discounted returns to zero mean / unit variance.
            discounted_epr = discount_rewards(epr)
            discounted_epr -= np.mean(discounted_epr)
            # BUG FIX: use true division; the original floor division (//=)
            # quantized the normalized advantages to whole numbers and
            # destroyed the policy-gradient learning signal.
            discounted_epr /= np.std(discounted_epr)
            tGrad = sess.run(newGrads, feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
            for ix, grad in enumerate(tGrad):
                gradBuffer[ix] += grad
            # Apply the accumulated gradients once per batch, then reset.
            if episode_number % batch_size == 0:
                sess.run(updateGrads, feed_dict={w1Grad: gradBuffer[0], w2Grad: gradBuffer[1]})
                for indx, grad in enumerate(gradBuffer):
                    gradBuffer[indx] = grad * 0
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                # BUG FIX: report true (float) per-batch averages instead of
                # floor-divided ones.
                print('Average reward for episode %f. Total average reward %f.' % (reward_sum / batch_size, running_reward / batch_size))
                # NOTE(review): with a 200-reward cap the average can never
                # exceed 200, so this success check may be unreachable.
                if reward_sum / batch_size > 200:
                    print("Task solved in", episode_number, "episodes!")
                    break
                reward_sum = 0
            observation = env.reset()
print(episode_number, 'episodes completed.')