From 453f6c9c7d976e1410b17dd2ae97b2d54aab4155 Mon Sep 17 00:00:00 2001
From: jinho
Date: Tue, 7 Aug 2018 15:24:04 +0900
Subject: [PATCH] fixed the typo in 'experience.py' and removed deprecated
 features 'torch.autograd.Variable' from /samples/rainbow/lib/common

---
 ptan/experience.py            |  4 ++--
 samples/rainbow/lib/common.py | 13 ++++++-------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/ptan/experience.py b/ptan/experience.py
index 373ecb2..a58cbee 100644
--- a/ptan/experience.py
+++ b/ptan/experience.py
@@ -493,8 +493,8 @@ def simple_dqn(model, **kwargs):
         return QLearningPreprocessor(model=model, target_model=None, use_double_dqn=False, **kwargs)
 
     @staticmethod
-    def target_dqn(model, target_model, **kwards):
-        return QLearningPreprocessor(model, target_model, use_double_dqn=False, **kwards)
+    def target_dqn(model, target_model, **kwargs):
+        return QLearningPreprocessor(model, target_model, use_double_dqn=False, **kwargs)
 
     @staticmethod
     def double_dqn(model, target_model, **kwargs):
diff --git a/samples/rainbow/lib/common.py b/samples/rainbow/lib/common.py
index a354215..39adc0d 100644
--- a/samples/rainbow/lib/common.py
+++ b/samples/rainbow/lib/common.py
@@ -85,10 +85,10 @@ def unpack_batch(batch):
 def calc_loss_dqn(batch, net, tgt_net, gamma, cuda=False):
     states, actions, rewards, dones, next_states = unpack_batch(batch)
 
-    states_v = Variable(torch.from_numpy(states))
-    next_states_v = Variable(torch.from_numpy(next_states), volatile=True)
-    actions_v = Variable(torch.from_numpy(actions))
-    rewards_v = Variable(torch.from_numpy(rewards))
+    states_v = torch.from_numpy(states)
+    next_states_v = torch.from_numpy(next_states)
+    actions_v = torch.from_numpy(actions)
+    rewards_v = torch.from_numpy(rewards)
     done_mask = torch.ByteTensor(dones)
     if cuda:
         states_v = states_v.cuda()
@@ -100,10 +100,9 @@ def calc_loss_dqn(batch, net, tgt_net, gamma, cuda=False):
     state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
     next_state_values = tgt_net(next_states_v).max(1)[0]
     next_state_values[done_mask] = 0.0
-    next_state_values.volatile = False
 
-    expected_state_action_values = next_state_values * gamma + rewards_v
-    return nn.MSELoss()(state_action_values, expected_state_action_values)
+    expected_state_action_values = (next_state_values * gamma + rewards_v).detach()
+    return nn.functional.mse_loss(state_action_values, expected_state_action_values)
 
 
 class RewardTracker:
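
Note (reviewer sketch, not part of the patch): since PyTorch 0.4 merged Variable into Tensor, the volatile flag is a no-op, which is why the patch drops it and detaches the TD target instead. An equivalent way to express the same thing is to evaluate the target network under torch.no_grad(), so no graph is built on the target side and no detach() is needed. A minimal sketch of calc_loss_dqn in that style, assuming the module's existing torch/nn imports and the unpack_batch helper shown above:

    def calc_loss_dqn(batch, net, tgt_net, gamma, cuda=False):
        states, actions, rewards, dones, next_states = unpack_batch(batch)

        states_v = torch.from_numpy(states)
        actions_v = torch.from_numpy(actions)
        rewards_v = torch.from_numpy(rewards)
        done_mask = torch.ByteTensor(dones)  # kept as ByteTensor for parity with the patch
        if cuda:
            states_v = states_v.cuda()
            actions_v = actions_v.cuda()
            rewards_v = rewards_v.cuda()
            done_mask = done_mask.cuda()

        # Q(s, a) for the actions actually taken; gradients flow through net only
        state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

        # target side: no autograd graph is built here, so detach() is unnecessary
        with torch.no_grad():
            next_states_v = torch.from_numpy(next_states)
            if cuda:
                next_states_v = next_states_v.cuda()
            next_state_values = tgt_net(next_states_v).max(1)[0]
            next_state_values[done_mask] = 0.0
            expected_state_action_values = next_state_values * gamma + rewards_v

        return nn.functional.mse_loss(state_action_values, expected_state_action_values)

Either form keeps gradients from flowing into tgt_net; the no_grad() variant additionally skips building the graph for the target-side forward pass, which saves a little memory during training.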