From 6c26f54f689b766f67950a69db954502e3fc95b3 Mon Sep 17 00:00:00 2001
From: Yoni
Date: Wed, 6 May 2020 21:26:47 -0400
Subject: [PATCH 1/2] minor bug fixes

---
 .gitignore   |  2 ++
 generator.py | 13 +++++++++----
 model.py     | 18 ++++++++++++------
 util.py      | 12 ++++++------
 4 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/.gitignore b/.gitignore
index b6e4761..bd8c640 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,5 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+associative-retrieval.pkl
diff --git a/generator.py b/generator.py
index f095cf2..731644b 100644
--- a/generator.py
+++ b/generator.py
@@ -1,6 +1,9 @@
 import numpy as np
 import random
-import cPickle as pickle
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
 
 num_train = 60000
 num_val = 10000
@@ -36,7 +39,7 @@ def generate_one():
 
     for i in range(0, step_num):
         c = random.randint(0, 25)
-        while d.has_key(c):
+        while c in d:
             c = random.randint(0, 25)
         b = random.randint(0, 9)
         d[c] = b
@@ -45,14 +48,16 @@ def generate_one():
         a[i*2] = get_one_hot(s)
         a[i*2+1] = get_one_hot(t)
 
-    s = random.choice(d.keys())
+    s = random.choice(list(d.keys()))
     t = chr(s + ord('a'))
     r = chr(d[s] + ord('0'))
     a[step_num * 2] = get_one_hot('?')
     a[step_num * 2 + 1] = get_one_hot('?')
     a[step_num * 2 + 2] = get_one_hot(t)
     st += '??' + t + r
+    e = get_one_hot(r)
+
     return a, e
 
 if __name__ == '__main__':
@@ -74,4 +79,4 @@ def generate_one():
         'y_val': y_val
     }
     with open('associative-retrieval.pkl', 'wb') as f:
-        pickle.dump(d, f, protocol=2)
\ No newline at end of file
+        pickle.dump(d, f, protocol=2)
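With the Python 2/3 pickle shim in place, the generated file can be loaded back for a quick sanity check. A minimal sketch; apart from 'y_val' (visible in the hunk above), the key names are assumptions inferred from the num_train/num_val split:

    import pickle

    with open('associative-retrieval.pkl', 'rb') as f:
        data = pickle.load(f)

    # Assumed key set; only 'y_val' is confirmed by the diff above.
    for key in ('x_train', 'y_train', 'x_val', 'y_val'):
        if key in data:
            print(key, data[key].shape)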
diff --git a/model.py b/model.py
index e5d8d58..3788a21 100644
--- a/model.py
+++ b/model.py
@@ -5,7 +5,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 from config import cfg
-from tensorboardX import SummaryWriter
+from torch.utils.tensorboard import SummaryWriter
 from torch.autograd import Variable
 import time
 from retrieval import read_data
@@ -25,17 +25,22 @@ class fast_weights_model(nn.Module):
     """docstring for fast_weights_model"""
     def __init__(self, batch_size, step_num, elem_num, hidden_num):
         super(fast_weights_model, self).__init__()
+        # Inputs
         self.x = Variable(torch.randn(batch_size, step_num, elem_num).type(torch.float32))
+        # Targets
         self.y = Variable(torch.randn(batch_size, elem_num).type(torch.float32))
+        # Learning Rate
         self.l = torch.tensor([0.9], dtype=torch.float32)
+        # Decay Rate
         self.e = torch.tensor([0.5], dtype=torch.float32)
 
+        # Input Weights
         self.w1 = Variable(torch.empty(elem_num, 50).uniform_(-np.sqrt(0.02), np.sqrt(0.02)))
         self.b1 = Variable(torch.zeros([1, 50]).type(torch.float32))
         self.w2 = Variable(torch.empty(500, 100).uniform_(-np.sqrt(0.01), np.sqrt(0.01)))
         self.b2 = Variable(torch.zeros([1, 100]).type(torch.float32))
         self.w3 = Variable(torch.empty(hidden_num, 100).uniform_(-np.sqrt(0.01), np.sqrt(0.01)))
-        self.b3 = Variable(torch.zeros([1, 100]).type(torch.float32)) 
+        self.b3 = Variable(torch.zeros([1, 100]).type(torch.float32))
         self.w4 = Variable(torch.empty(100, elem_num).uniform_(-np.sqrt(1.0 / elem_num), np.sqrt(1.0 / elem_num)))
         self.b4 = Variable(torch.zeros([1, elem_num]).type(torch.float32))
@@ -46,7 +51,7 @@ def __init__(self, batch_size, step_num, elem_num, hidden_num):
         self.g = Variable(torch.ones([1, hidden_num]).type(torch.float32))
         self.b = Variable(torch.ones([1, hidden_num]).type(torch.float32))
 
-    def forward(self, bx, by)
+    def forward(self, bx, by):
         self.x = bx
         self.y = by
         a = torch.zeros([batch_size, hidden_num, hidden_num]).type(torch.float32)
@@ -85,12 +90,13 @@ def forward(self, bx, by)
         return self.loss, self.acc
 
-def train(self, save = 0, verbose = 0):
-    model = fast_weights_model(STEP_NUM, ELEM_NUM, HIDDEN_NUM)
+def train(save = 0, verbose = 0):
+    BATCH_SIZE = 60000
+    model = fast_weights_model(BATCH_SIZE, STEP_NUM, ELEM_NUM, HIDDEN_NUM)
     model.train()
     batch_size = cfg.train.batch_size
     start_time = time.time()
-    optimizer = torch.optim.Adam(model.paramters(), lr=cfg.train.model_lr)
+    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.train.model_lr)
     writer = SummaryWriter(logdir=os.path.join(cfg.logdir, cfg.exp_name), flush_secs=30)
     checkpointer = Checkpointer(os.path.join(cfg.checkpointdir, cfg.exp_name))
     start_epoch = 0
diff --git a/util.py b/util.py
index 63dfe74..ef6ea25 100644
--- a/util.py
+++ b/util.py
@@ -6,7 +6,7 @@ import torch
 
 from torch import nn
 from torch import optim
-from tensorboardX import SummaryWriter
+from torch.utils.tensorboard import SummaryWriter
 
 class Checkpointer:
     def __init__(self, path, max_num=3):
@@ -19,8 +19,8 @@ def __init__(self, path, max_num=3):
             with open(self.listfile, 'wb') as f:
                 model_list = []
                 pickle.dump(model_list, f)
-        
-        
+
+
     def save(self, model, optimizer, epoch):
         checkpoint = {
             'model': model.state_dict(),
@@ -38,10 +38,10 @@ def save(self, model, optimizer, epoch):
             model_list.append(filename)
             with open(self.listfile, 'rb+') as f:
                 pickle.dump(model_list, f)
-        
+
         with open(filename, 'wb') as f:
             torch.save(checkpoint, f)
-    
+
     def load(self, model, optimizer):
         """
         Return starting epoch
@@ -56,4 +56,4 @@ def load(self, model, optimizer):
             model.load_state_dict(checkpoint['model'])
             optimizer.load_state_dict(checkpoint['optimizer'])
             print('Load checkpoint from {}.'.format(model_list[-1]))
-            return checkpoint['epoch']
\ No newline at end of file
+            return checkpoint['epoch']
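For reference, the Checkpointer round-trip that train() relies on looks like this in isolation. A minimal sketch; the toy linear model and the './checkpoints' path are placeholders, not part of the repo:

    import torch
    from util import Checkpointer

    # Placeholder model and optimizer, purely to exercise the API.
    model = torch.nn.Linear(4, 2)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    checkpointer = Checkpointer('./checkpoints', max_num=3)
    checkpointer.save(model, optimizer, epoch=0)       # keeps at most max_num checkpoint files
    start_epoch = checkpointer.load(model, optimizer)  # restores the newest checkpoint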
From 3c7821a38d8586f3b5f80248da6795f612349a15 Mon Sep 17 00:00:00 2001
From: Yoni
Date: Wed, 6 May 2020 21:59:53 -0400
Subject: [PATCH 2/2] more flexible code based off of
 github.com/GokuMohandas/fast-weights

---
 model.py | 61 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 38 insertions(+), 23 deletions(-)

diff --git a/model.py b/model.py
index 3788a21..e685e60 100644
--- a/model.py
+++ b/model.py
@@ -1,5 +1,6 @@
 from __future__ import print_function
 
+import os
 import torch
 import numpy as np
 import torch.nn as nn
@@ -23,39 +24,53 @@ def softmax_cross_entropy_with_logits(logits, labels):
 
 class fast_weights_model(nn.Module):
     """docstring for fast_weights_model"""
-    def __init__(self, batch_size, step_num, elem_num, hidden_num):
+    def __init__(self, args):
         super(fast_weights_model, self).__init__()
+        self.batch_size = args.batch_size
+        self.hidden_size = args.hidden_size
         # Inputs
-        self.x = Variable(torch.randn(batch_size, step_num, elem_num).type(torch.float32))
+        self.X = Variable(torch.randn(args.batch_size, args.input_dim, args.num_classes).type(torch.float32))
         # Targets
-        self.y = Variable(torch.randn(batch_size, elem_num).type(torch.float32))
+        self.y = Variable(torch.randn(args.batch_size, args.num_classes).type(torch.float32))
         # Learning Rate
-        self.l = torch.tensor([0.9], dtype=torch.float32)
+        self.l = torch.tensor([args.learning_rate], dtype=torch.float32)
         # Decay Rate
-        self.e = torch.tensor([0.5], dtype=torch.float32)
+        self.e = torch.tensor([args.decay_rate], dtype=torch.float32)
 
         # Input Weights
-        self.w1 = Variable(torch.empty(elem_num, 50).uniform_(-np.sqrt(0.02), np.sqrt(0.02)))
-        self.b1 = Variable(torch.zeros([1, 50]).type(torch.float32))
-        self.w2 = Variable(torch.empty(500, 100).uniform_(-np.sqrt(0.01), np.sqrt(0.01)))
-        self.b2 = Variable(torch.zeros([1, 100]).type(torch.float32))
-        self.w3 = Variable(torch.empty(hidden_num, 100).uniform_(-np.sqrt(0.01), np.sqrt(0.01)))
-        self.b3 = Variable(torch.zeros([1, 100]).type(torch.float32))
-        self.w4 = Variable(torch.empty(100, elem_num).uniform_(-np.sqrt(1.0 / elem_num), np.sqrt(1.0 / elem_num)))
-        self.b4 = Variable(torch.zeros([1, elem_num]).type(torch.float32))
+        self.W_x = Variable(torch.empty(
+            args.num_classes,
+            args.hidden_size).uniform_(
+                -np.sqrt(2.0/args.num_classes),
+                np.sqrt(2.0/args.num_classes)
+            ).type(torch.float32))
+        self.b_x = Variable(torch.zeros(
+            [args.hidden_size]
+        ).type(torch.float32))
+
+        # Hidden weights (initialization explained in Hinton video)
+        self.W_h = Variable(torch.tensor(0.5 * np.identity(args.hidden_size),
+                                         dtype=torch.float32))
+
+        # Softmax weights
+        self.W_softmax = Variable(torch.empty(
+            args.hidden_size,
+            args.num_classes).uniform_(
+                -np.sqrt(2.0/args.hidden_size),
+                np.sqrt(2.0/args.hidden_size)
+            ).type(torch.float32))
+        self.b_softmax = Variable(torch.zeros(args.num_classes).type(torch.float32))
+
+        # Scale and shift everything for layernorm
+        self.gain = Variable(torch.ones(args.hidden_size).type(torch.float32))
+        self.bias = Variable(torch.zeros(args.hidden_size).type(torch.float32))
 
-        self.w = Variable(torch.tensor(0.05 * np.identity(hidden_num)).type(torch.float32))
-
-        self.c = Variable(torch.empty(100, hidden_num).uniform_(-np.sqrt(hidden_num), np.sqrt(hidden_num)))
-
-        self.g = Variable(torch.ones([1, hidden_num]).type(torch.float32))
-        self.b = Variable(torch.ones([1, hidden_num]).type(torch.float32))
 
     def forward(self, bx, by):
         self.x = bx
         self.y = by
-        a = torch.zeros([batch_size, hidden_num, hidden_num]).type(torch.float32)
-        h = torch.zeros([batch_size, hidden_num]).type(torch.float32)
+        a = torch.zeros([self.batch_size, self.hidden_size, self.hidden_size]).type(torch.float32)
+        h = torch.zeros([self.batch_size, self.hidden_size]).type(torch.float32)
 
         la = []
@@ -65,7 +80,7 @@ def forward(self, bx, by):
 
         h = torch.relu(torch.matmul(h, self.w) + torch.matmul(z, self.c))
 
-        hs = torch.reshape(h, [batch_size, 1, hidden_num])
+        hs = torch.reshape(h, [self.batch_size, 1, self.hidden_size])
 
         hh = hs
@@ -80,7 +95,7 @@ def forward(self, bx, by):
         sig = torch.sqrt(torch.mean(torch.pow((hs - mu), 2), 0))
         hs = torch.relu(torch.div(torch.mul(self.g, (hs - mu)), sig) + self.b)
 
-        h = torch.reshape(hs, [batch_size, hidden_num])
+        h = torch.reshape(hs, [self.batch_size, self.hidden_size])
 
         h = torch.relu(torch.matmul(h, self.w3) + self.b3)
         logits = torch.matmul(h, self.w4) + self.b4
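Both versions of model.py implement the fast associative memory of Ba et al., "Using Fast Weights to Attend to the Recent Past": decay the fast weight matrix A, add a Hebbian outer product of the current hidden state, then run a short inner loop in which A refines the pre-layernorm hidden state. A distilled sketch of that update; the hyperparameter values and the mapping of self.l/self.e onto decay vs. fast learning rate are assumptions, not taken from the diff:

    import torch

    batch, hidden, inner_steps = 4, 8, 3
    lam, eta = 0.9, 0.5                      # assumed decay / fast learning rate

    A = torch.zeros(batch, hidden, hidden)   # fast weight matrix, one per example
    h = torch.randn(batch, hidden)           # slow hidden state after the usual RNN step

    # Decay plus Hebbian outer-product update: A <- lam * A + eta * h h^T
    A = lam * A + eta * torch.bmm(h.unsqueeze(2), h.unsqueeze(1))

    # Inner loop: the fast memory repeatedly refines the next hidden state,
    # hs <- relu(layernorm(sustained_input + hs @ A)), as in the forward pass above.
    sustained = h.unsqueeze(1)               # stands in for W_h h + W_x x, shape [batch, 1, hidden]
    hs = h.unsqueeze(1)
    for _ in range(inner_steps):
        hs = sustained + torch.bmm(hs, A)
        mu = hs.mean(dim=-1, keepdim=True)
        sig = hs.std(dim=-1, keepdim=True)
        hs = torch.relu((hs - mu) / sig)     # layernorm with gain/bias omitted
    h = hs.squeeze(1)                        # refined hidden state fed to the softmax layer
    print(h.shape)                           # torch.Size([4, 8])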