
Commit cef7963

Commit message: bugs fixed
1 parent: b7731c4

6 files changed: +72 additions, -34 deletions

README.md

Lines changed: 32 additions & 0 deletions

# fast-weights-pytorch

PyTorch implementation of the paper [Using Fast Weights to Attend to the Recent Past](https://arxiv.org/abs/1610.06258).

Code for generating sequential data is forked from [jiamings/fast-weights](https://github.com/jiamings/fast-weights/tree/master).

## Dependencies

- Python >= 3.6
- PyTorch
- tensorboardX
- NumPy
- pickle (Python standard library)
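
These can typically be installed with pip; the PyPI package names below are assumed (`pickle` ships with Python itself, and GPU builds of PyTorch may need a platform-specific command from pytorch.org):

```
$ pip install torch tensorboardX numpy
```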

## Usage

Generate a dataset:

```
$ python generator.py
```

Train the fast-weights model:

```
$ python fast_weights.py
```

## Training Result

![](fig/acc.png)

![](fig/loss.png)

### References

[Using Fast Weights to Attend to the Recent Past](https://arxiv.org/abs/1610.06258). Jimmy Ba, Geoffrey Hinton, Volodymyr Mnih, Joel Z. Leibo, Catalin Ionescu.

[Layer Normalization](https://arxiv.org/abs/1607.06450). Jimmy Ba, Ryan Kiros, Geoffrey Hinton.

model.py renamed to fast_weights.py

Lines changed: 34 additions & 28 deletions

```diff
@@ -1,5 +1,6 @@
 from __future__ import print_function
+import os
 import torch
 import numpy as np
 import torch.nn as nn
```

```diff
@@ -25,42 +26,46 @@ class fast_weights_model(nn.Module):
     """docstring for fast_weights_model"""
     def __init__(self, batch_size, step_num, elem_num, hidden_num):
         super(fast_weights_model, self).__init__()
+        self.batch_size = batch_size
         self.x = Variable(torch.randn(batch_size, step_num, elem_num).type(torch.float32))
         self.y = Variable(torch.randn(batch_size, elem_num).type(torch.float32))
-        self.l = torch.tensor([0.9], dtype=torch.float32)
-        self.e = torch.tensor([0.5], dtype=torch.float32)
+        self.l = nn.Parameter(torch.tensor([0.9], dtype=torch.float32))
+        self.e = nn.Parameter(torch.tensor([0.5], dtype=torch.float32))
 
-        self.w1 = Variable(torch.empty(elem_num, 50).uniform_(-np.sqrt(0.02), np.sqrt(0.02)))
-        self.b1 = Variable(torch.zeros([1, 50]).type(torch.float32))
-        self.w2 = Variable(torch.empty(500, 100).uniform_(-np.sqrt(0.01), np.sqrt(0.01)))
-        self.b2 = Variable(torch.zeros([1, 100]).type(torch.float32))
-        self.w3 = Variable(torch.empty(hidden_num, 100).uniform_(-np.sqrt(0.01), np.sqrt(0.01)))
-        self.b3 = Variable(torch.zeros([1, 100]).type(torch.float32))
-        self.w4 = Variable(torch.empty(100, elem_num).uniform_(-np.sqrt(1.0 / elem_num), np.sqrt(1.0 / elem_num)))
-        self.b4 = Variable(torch.zeros([1, elem_num]).type(torch.float32))
+        self.w1 = nn.Parameter(torch.empty(elem_num, 50).uniform_(-np.sqrt(0.02), np.sqrt(0.02)), requires_grad=True)
+        self.b1 = nn.Parameter(torch.zeros([1, 50]).type(torch.float32), requires_grad=True)
+        self.w2 = nn.Parameter(torch.empty(50, 100).uniform_(-np.sqrt(0.01), np.sqrt(0.01)), requires_grad=True)
+        self.b2 = nn.Parameter(torch.zeros([1, 100]).type(torch.float32), requires_grad=True)
+        self.w3 = nn.Parameter(torch.empty(hidden_num, 100).uniform_(-np.sqrt(0.01), np.sqrt(0.01)), requires_grad=True)
+        self.b3 = nn.Parameter(torch.zeros([1, 100]).type(torch.float32), requires_grad=True)
+        self.w4 = nn.Parameter(torch.empty(100, elem_num).uniform_(-np.sqrt(1.0 / elem_num), np.sqrt(1.0 / elem_num)), requires_grad=True)
+        self.b4 = nn.Parameter(torch.zeros([1, elem_num]).type(torch.float32), requires_grad=True)
 
-        self.w = Variable(torch.tensor(0.05 * np.identity(hidden_num)).type(torch.float32))
+        self.w = nn.Parameter(torch.tensor(0.05 * np.identity(hidden_num)).type(torch.float32), requires_grad=True)
 
-        self.c = Variable(torch.empty(100, hidden_num).uniform_(-np.sqrt(hidden_num), np.sqrt(hidden_num)))
+        self.c = nn.Parameter(torch.empty(100, hidden_num).uniform_(-np.sqrt(hidden_num), np.sqrt(hidden_num)), requires_grad=True)
 
-        self.g = Variable(torch.ones([1, hidden_num]).type(torch.float32))
-        self.b = Variable(torch.ones([1, hidden_num]).type(torch.float32))
+        self.g = nn.Parameter(torch.ones([1, hidden_num]).type(torch.float32), requires_grad=True)
+        self.b = nn.Parameter(torch.ones([1, hidden_num]).type(torch.float32), requires_grad=True)
 
-    def forward(self, bx, by)
-        self.x = bx
-        self.y = by
-        a = torch.zeros([batch_size, hidden_num, hidden_num]).type(torch.float32)
-        h = torch.zeros([batch_size, hidden_num]).type(torch.float32)
+    def forward(self, bx, by):
+        self.x = torch.tensor(bx)
+        self.y = torch.tensor(by)
+        #print(bx.size)
+        #print(by.size)
+        a = torch.zeros([self.batch_size, HIDDEN_NUM, HIDDEN_NUM]).type(torch.float32)
+        h = torch.zeros([self.batch_size, HIDDEN_NUM]).type(torch.float32)
 
         la = []
 
-        for i in range(0, step_num):
-            s1 = torch.relu(torch.matmul(self.x[:, t, :], self.w1) + self.b1)
+        for i in range(0, STEP_NUM):
+            s1 = torch.relu(torch.matmul(self.x[:, i, :], self.w1) + self.b1)
+            #print(s1.shape, self.w2.shape)
             z = torch.relu(torch.matmul(s1, self.w2) + self.b2)
 
             h = torch.relu(torch.matmul(h, self.w) + torch.matmul(z, self.c))
 
-            hs = torch.reshape(h, [self.batch_size, 1, HIDDEN_NUM])
+            hs = torch.reshape(h, [self.batch_size, 1, HIDDEN_NUM])
 
             hh = hs
```
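
For context on the quantities this hunk introduces: `a` is the fast-weight matrix A(t), and the new learnable scalars `self.l` and `self.e` are the decay rate λ and fast learning rate η from Ba et al. (2016), whose update rule is A(t) = λ·A(t−1) + η·h(t)h(t)ᵀ. The loop body that applies it sits between this hunk and the next and is untouched by the commit; the following is only a minimal standalone sketch of that step, using this file's shapes, not the repo's exact code:

```python
import torch

def fast_weight_step(a, hs, lam, eta, inner_steps=1):
    """One fast-weight memory update, per Ba et al. (2016).

    a:   [batch, hidden, hidden] fast-weight matrix A(t-1)
    hs:  [batch, 1, hidden] current hidden state (row vector per sample)
    lam: decay rate (self.l in the model); eta: fast learning rate (self.e)
    """
    # A(t) = lam * A(t-1) + eta * h h^T  (a batched outer product)
    a = lam * a + eta * torch.matmul(torch.transpose(hs, 1, 2), hs)
    # S inner steps attend to the recent past through A(t); only the
    # A(t) term is shown here -- the real loop also re-adds the
    # slow-weight preliminary state before the nonlinearity.
    for _ in range(inner_steps):
        hs = torch.matmul(hs, a)
    return a, hs
```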


```diff
@@ -75,7 +80,7 @@ def forward(self, bx, by)
             sig = torch.sqrt(torch.mean(torch.pow((hs - mu), 2), 0))
             hs = torch.relu(torch.div(torch.mul(self.g, (hs - mu)), sig) + self.b)
 
-            h = torch.reshape(hs, [batch_size, hidden_num])
+            h = torch.reshape(hs, [self.batch_size, HIDDEN_NUM])
 
         h = torch.relu(torch.matmul(h, self.w3) + self.b3)
         logits = torch.matmul(h, self.w4) + self.b4
```
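
The `mu`/`sig`/`self.g`/`self.b` lines above implement the layer normalization of Ba, Kiros & Hinton (see References). One caveat worth noting: this code averages over dimension 0, whereas the paper normalizes each sample over its feature dimension, as in this standalone sketch (the `eps` guard is an addition here, not in the repo):

```python
import torch

def layer_norm(x, gain, bias, eps=1e-5):
    """Layer normalization (Ba, Kiros & Hinton, 2016).

    Normalizes each sample's features to zero mean and unit variance,
    then applies a learned gain and bias; eps avoids division by zero.
    """
    mu = x.mean(dim=-1, keepdim=True)
    sig = torch.sqrt(((x - mu) ** 2).mean(dim=-1, keepdim=True))
    return gain * (x - mu) / (sig + eps) + bias
```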

```diff
@@ -85,12 +90,12 @@ def forward(self, bx, by)
 
         return self.loss, self.acc
 
-def train(self, save = 0, verbose = 0):
-    model = fast_weights_model(STEP_NUM, ELEM_NUM, HIDDEN_NUM)
-    model.train()
+def train(save = 0, verbose = 0):
     batch_size = cfg.train.batch_size
+    model = fast_weights_model(batch_size, STEP_NUM, ELEM_NUM, HIDDEN_NUM)
     start_time = time.time()
-    optimizer = torch.optim.Adam(model.paramters(), lr=cfg.train.model_lr)
+    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.train.model_lr)
+    model.train()
     writer = SummaryWriter(logdir=os.path.join(cfg.logdir, cfg.exp_name), flush_secs=30)
     checkpointer = Checkpointer(os.path.join(cfg.checkpointdir, cfg.exp_name))
     start_epoch = 0
```
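
The `Variable` to `nn.Parameter` conversion is the heart of this commit: plain tensors (or the deprecated `Variable`) assigned as module attributes are not registered by `nn.Module`, so the old `model.parameters()` was empty and Adam would have had nothing to optimize (besides the `paramters` typo crashing first). A minimal sketch of the difference:

```python
import torch
import torch.nn as nn

class Broken(nn.Module):
    def __init__(self):
        super().__init__()
        self.w = torch.randn(3, 3)  # plain tensor: invisible to .parameters()

class Fixed(nn.Module):
    def __init__(self):
        super().__init__()
        self.w = nn.Parameter(torch.randn(3, 3))  # registered automatically

print(len(list(Broken().parameters())))  # 0 -> optimizer would train nothing
print(len(list(Fixed().parameters())))   # 1
```

(`requires_grad=True` is already the default for `nn.Parameter`, so the explicit flag added throughout the diff is redundant but harmless.)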

```diff
@@ -99,7 +104,8 @@ def train(save = 0, verbose = 0):
     for epoch in range(start_epoch, cfg.train.max_epochs):
         for idx in range(batch_idxs):
             global_step = epoch * cfg.num_train + idx + 1
-            bx, by = ar_data.train.next_batch(batch_size=cfg.batch_size)
+            #print(ar_data.train._x)
+            bx, by = ar_data.train.next_batch(batch_size=100)
             loss, acc = model(bx, by)
             optimizer.zero_grad()
             loss.backward()
```

fig/acc.png (new image, 54.9 KB)

fig/loss.png (new image, 54.8 KB)

generator.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 import numpy as np
 import random
-import cPickle as pickle
+import pickle
 
 num_train = 60000
 num_val = 10000
```
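
This change is all Python 3 needs: `cPickle` existed only in Python 2, and the Python 3 `pickle` module transparently delegates to the C-accelerated `_pickle` implementation when available, so nothing is lost by the plain import.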

retrieval.py

Lines changed: 5 additions & 5 deletions

```diff
@@ -1,9 +1,6 @@
 import numpy as np
 import collections
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
+import pickle
 
 
 Datasets = collections.namedtuple('Datasets', ['train', 'val', 'test'])
```

```diff
@@ -40,10 +37,13 @@ def next_batch(self, batch_size):
             start = 0
             self._index_in_epoch = batch_size
         end = self._index_in_epoch
+        #print(end)
+        #print(self._x[self.perm[start:end]], self._x[self.perm[start:end]].type)
         return self._x[self.perm[start:end]], self._y[self.perm[start:end]]
 
 
-def read_data(data_path='associative-retrieval.pkl'):
+def read_data():
+    data_path = 'associative-retrieval.pkl'
     with open(data_path, 'rb') as f:
         d = pickle.load(f)
         x_train = d['x_train']
```
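
Putting the pieces together, a usage sketch (assuming, per the `Datasets` namedtuple above and the call in fast_weights.py, that `read_data()` returns `Datasets(train, val, test)` and that generator.py has already written `associative-retrieval.pkl`):

```python
import retrieval

# Load the pickled associative-retrieval dataset produced by generator.py.
ar_data = retrieval.read_data()

# Draw one shuffled batch, exactly as the training loop in fast_weights.py does.
bx, by = ar_data.train.next_batch(batch_size=100)
print(bx.shape, by.shape)
```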
