-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdqn_wednesday.py
More file actions
146 lines (103 loc) · 4.09 KB
/
dqn_wednesday.py
File metadata and controls
146 lines (103 loc) · 4.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# Policy-gradient (REINFORCE) CartPole agent — despite the "dqn" filename,
# this script learns a stochastic policy, not Q-values.
import numpy as np
# Python 2/3 compatibility: cPickle is the Python-2 C-accelerated pickle.
# NOTE(review): pickle, plt, math and slim are imported but unused in this file.
try:
    import cPickle as pickle
except:
    import pickle
import tensorflow as tf  # written against TF 1.x (placeholders, tf.contrib)
import matplotlib.pyplot as plt
import math
import tensorflow.contrib.slim as slim
# Python 2/3 compatibility shim: make xrange available on Python 3.
try:
    xrange = xrange
except:
    xrange = range
import gym
# CartPole-v0: 4-dim observation, 2 discrete actions, +1 reward per step.
env = gym.make('CartPole-v0')
def discount_rewards(r, gamma=0.99):
    """Compute discounted cumulative future rewards for a 1-D float array.

    Args:
        r: numpy float array of per-step rewards (1-D, or a column vector
           as produced by ``np.vstack``).
        gamma: discount factor; defaults to 0.99, matching the module-level
           setting, so existing single-argument calls behave identically.

    Returns:
        Array of the same shape where element t equals
        ``sum_{k >= t} gamma**(k - t) * r[k]``.
    """
    discounted_r = np.zeros_like(r)
    running_add = 0.0
    # Walk backwards so each step folds in the discounted future return.
    for t in reversed(range(r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
# --- Hyperparameters ---
H = 10                 # hidden-layer width
batch_size = 5         # episodes accumulated per gradient update
learning_rate = 1e-2
gamma = 0.99           # reward discount factor
D = 4                  # observation dimensionality (CartPole state)

tf.reset_default_graph()

# --- Policy network: D -> H (ReLU) -> 1 (sigmoid) ---
observations = tf.placeholder(tf.float32, [None,D], name="input_x")
w1 = tf.get_variable("w1", shape=[D,H], initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations,w1))
w2 = tf.get_variable("w2", shape=[H,1], initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1, w2)
tf.summary.histogram('weight', w1)
# probability = P(action == 1) under the current policy.
probability = tf.nn.sigmoid(score)
tf.summary.histogram('probability', probability)

tvars = tf.trainable_variables()
input_y = tf.placeholder(tf.float32,[None,1],name="input_y")
advantages = tf.placeholder(tf.float32,name='reward_signal')
# log-probability of the action actually taken: the loop feeds input_y = 1
# when action 0 was chosen, so this expression reduces to
# log(1 - probability) for action 0 and log(probability) for action 1.
loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)* (input_y + probability))
# Policy-gradient surrogate loss, weighted by the advantage signal.
loss = -tf.reduce_mean(loglik * advantages)
newGrads =tf.gradients(loss,tvars)

# Gradients are accumulated outside the graph (in gradBuffer) and applied
# in batches through these two placeholders.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
w1Grad = tf.placeholder(tf.float32,name='batch_grad1')
w2Grad = tf.placeholder(tf.float32,name='batch_grad2')
batchGrad = [w1Grad, w2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad,tvars))

# Per-episode buffers and bookkeeping shared with the training loop below.
# NOTE(review): hs, dlogps and tfps are never filled with data here.
xs,hs,dlogps,drs,ys,tfps = [],[],[],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 1
total_episodes = 10000
init = tf.global_variables_initializer()
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    observation = env.reset()

    # Zeroed gradient accumulator, one slot per trainable variable.
    gradBuffer = sess.run(tvars)
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = 0

    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter('tensorboard/cart-pole/1')
    writer.add_graph(sess.graph)

    while episode_number <= total_episodes:
        # NOTE(review): CartPole-v0 caps episode reward at 200, so the 975
        # threshold can never trigger rendering on its own — TODO confirm intent.
        if reward_sum > 975 or rendering == True:
            env.render()
            # BUGFIX: was `render = True`, a dead variable — the flag that is
            # actually checked above is `rendering`, so rendering never stuck.
            rendering = True

        x = np.reshape(observation, [1, D])

        # Sample an action from the stochastic policy; tfprob = P(action == 1).
        tfprob = sess.run(probability, feed_dict={observations: x})
        action = 1 if np.random.uniform() < tfprob else 0

        xs.append(x)
        # "Fake label" convention: y = 1 when action 0 was taken, so the
        # graph's loglik node evaluates to log P(taken action).
        y = 1 if action == 0 else 0
        ys.append(y)

        observation, reward, done, info = env.step(action)
        reward_sum += reward
        drs.append(reward)

        if done:
            episode_number += 1
            epx = np.vstack(xs)   # states
            epy = np.vstack(ys)   # fake labels
            epr = np.vstack(drs)  # rewards
            xs, drs, ys, tfps = [], [], [], []  # reset episode memory

            # Standardize discounted returns into an advantage signal.
            discounted_epr = discount_rewards(epr)
            discounted_epr -= np.mean(discounted_epr)
            # BUGFIX: was `//=` (floor division), which truncated every
            # normalized advantage to a whole number and destroyed the signal.
            discounted_epr /= np.std(discounted_epr)

            tGrad, sums = sess.run(
                [newGrads, merged_summary],
                feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
            writer.add_summary(sums, episode_number)
            for ix, grad in enumerate(tGrad):
                gradBuffer[ix] += grad

            if episode_number % batch_size == 0:
                sess.run(updateGrads, feed_dict={w1Grad: gradBuffer[0], w2Grad: gradBuffer[1]})
                for ix, grad in enumerate(gradBuffer):
                    gradBuffer[ix] = 0

                # Exponential moving average of episode reward. The weights
                # 0.99/0.01 were previously written as `gamma`/0.01 — identical
                # numerically (gamma == 0.99) but decoupled here on purpose.
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                # BUGFIX: was floor division (`//`), which hid fractional averages.
                print('Average reward for episode %f. Total average reward %f.' % (reward_sum / batch_size, running_reward / batch_size))
                if reward_sum / batch_size > 200:
                    # BUGFIX: message typo — was "Task olved In".
                    print("Task solved in", episode_number, "episodes!")
                    break

            reward_sum = 0
            observation = env.reset()

print(episode_number, "Episodes completed")