-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
115 lines (97 loc) · 4.85 KB
/
main.py
File metadata and controls
115 lines (97 loc) · 4.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from unityagents import UnityEnvironment
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from utils.ddpg_agent import Agent
import argparse
import os
import os.path as op
# Code based on https://github.com/udacity/deep-reinforcement-learning.git
def ddpg(dir_, n_episodes=500, max_t=1000, learn_every=20, num_learning=10, goal=30.0):
    """Run the training loop for the module-level ``agent`` on the Unity ``env``.

    Relies on the globals ``env``, ``brain_name``, ``agent`` and ``num_agents``,
    which the script entry point defines before calling this function.

    Args:
        dir_: Directory that receives ``checkpoint_actor.pth`` and
            ``checkpoint_critic.pth``.
        n_episodes: Maximum number of training episodes.
        max_t: Maximum number of timesteps per episode.
        learn_every: ``agent.step()`` is triggered on every timestep ``t``
            where ``t % learn_every == 0``.
        num_learning: Number of ``agent.step()`` calls per trigger.
        goal: Mean score over the last (up to) 100 episodes at which training
            stops early.

    Returns:
        A list holding, for every finished episode, the score averaged over
        all agents.
    """
    scores_window = deque(maxlen=100)  # rolling window of the last 100 episode scores
    scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        agent.reset()
        score = np.zeros(num_agents)
        for t in range(max_t):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]    # send the action to the environment
            next_state = env_info.vector_observations  # get the next state
            reward = env_info.rewards                  # get the reward
            done = env_info.local_done                 # see if episode has finished
            # Store one transition per agent.
            for s, a, r, n_s, d in zip(state, action, reward, next_state, done):
                agent.add_memory(s, a, r, n_s, d)
            state = next_state
            score += reward
            if t % learn_every == 0:
                for _ in range(num_learning):
                    agent.step()
            if np.any(done):
                break
        mean_score = np.mean(score)
        scores_window.append(mean_score)
        scores.append(mean_score)
        window_mean = np.mean(scores_window)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(i_episode, window_mean, mean_score), end="")
        if i_episode % 100 == 0:
            _save_checkpoints(dir_)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, window_mean))
        # The extra episode floor keeps an early lucky streak from counting as solved.
        if window_mean >= goal and i_episode >= 120:
            print('\nEnvironment solved after {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, window_mean))
            # Same file names as the periodic checkpoints, so the final weights
            # replace them instead of ending up in a differently named file.
            _save_checkpoints(dir_)
            break
    return scores


def _save_checkpoints(dir_):
    """Write the local actor and critic weights of the global ``agent`` into ``dir_``."""
    torch.save(agent.actor_local.state_dict(), op.join(dir_, 'checkpoint_actor.pth'))
    torch.save(agent.critic_local.state_dict(), op.join(dir_, 'checkpoint_critic.pth'))
def create_directory(logdir):
    """Create ``logdir`` (including missing parents) if it does not exist yet.

    An already existing directory is left untouched.

    Args:
        logdir: Path of the directory to create.

    Raises:
        FileExistsError: If ``logdir`` exists but is not a directory.
    """
    # exist_ok only tolerates an existing *directory*; a regular file at the
    # same path is reported instead of being silently accepted.
    os.makedirs(logdir, exist_ok=True)
if __name__ == '__main__':
    # pandas is only needed for the CSV export at the end. Importing it first
    # makes a missing install fail before training rather than after it.
    import pandas as pd

    # Inputs for the main function
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--n_episodes', default=500, type=int, help='max number of training episodes')
    parser.add_argument('--max_t', default=1000, type=int, help='max. number of timesteps per episode')
    parser.add_argument('--learn_every', default=20, type=int, help='number of timesteps to wait until updating network')
    parser.add_argument('--num_learning', default=10, type=int, help='number of updates')
    parser.add_argument('--goal', default=30.0, type=float, help='reward goal that considers the problem solved')
    parser.add_argument('--seed', default=7, type=int, help='training seed')
    parser.add_argument('--dest', default='runs', type=str, help='experiment dir')
    parser.add_argument('--env_file', default='Reacher_single.app', type=str, help='location of the Unity environment')
    args = parser.parse_args()

    # Output directory for checkpoints, scores and the plot.
    path = args.dest
    create_directory(path)

    # Pass --env_file to match the location of the Unity environment.
    env = UnityEnvironment(file_name=args.env_file)
    try:
        # get the default brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]

        # reset the environment
        env_info = env.reset(train_mode=True)[brain_name]

        # number of agents
        num_agents = len(env_info.agents)
        print('Number of agents:', num_agents)

        # size of each action
        action_size = brain.vector_action_space_size
        print('Size of each action:', action_size)

        # examine the state space
        states = env_info.vector_observations
        state_size = states.shape[1]
        print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
        print('The state for the first agent looks like:', states[0])

        # ddpg() reads env, brain_name, agent and num_agents as module globals.
        agent = Agent(state_size=state_size, action_size=action_size, random_seed=args.seed)
        scores = ddpg(dir_=path, n_episodes=args.n_episodes, max_t=args.max_t, learn_every=args.learn_every, num_learning=args.num_learning, goal=args.goal)
    finally:
        # Close the environment even when setup or training raises.
        env.close()

    # store the raw scores
    df_scores = pd.DataFrame(scores)
    df_scores.to_csv(op.join(path, 'scores_values.csv'))

    # plot scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(np.arange(len(scores)), scores)
    ax.set_ylabel('Score')
    ax.set_xlabel('Episode #')
    fig.savefig(op.join(path, 'score.png'))