# Source: lvq/batch_norm.py (harm-devries/lvq repository, master branch, GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# Code from Cesar Laurent
#

import logging
import numpy

from blocks.bricks import Activation, Initializable, Feedforward, Linear, Sequence
from blocks.bricks.base import Brick, application, lazy
from blocks.extensions import SimpleExtension
from blocks.extensions.monitoring import MonitoringExtension
from blocks.filter import get_brick
from blocks.graph import ComputationGraph
from blocks.monitoring.evaluators import DatasetEvaluator
from blocks.roles import add_role, WEIGHT, BIAS, PARAMETER
from blocks.utils import dict_subset

from toolz import interleave
from picklable_itertools.extras import equizip

from theano import config, shared, tensor, function

# Theano's configured float dtype; used below as the dtype of the numpy
# arrays backing the shared variables.
floatX = config.floatX
# No name is passed to getLogger(), so this is the root logger.
logger = logging.getLogger()

class MLP(Initializable, Feedforward):
    """Multi-layer perceptron with batch normalization.

    Every layer is the sequence ``Linear -> BatchNorm -> activation``.
    :meth:`apply` builds the training graph (batch statistics) while
    :meth:`inference` builds the test-time graph (population statistics).

    Parameters
    ----------
    activations : list of :class:`.Brick`, :class:`.BoundApplication`,
                  or ``None``
        A list of activations to apply after each linear transformation.
        Give ``None`` to not apply any activation. It is assumed that the
        application method to use is ``apply``. Required for
        :meth:`__init__`.
    dims : list of ints
        A list of input dimensions, as well as the output dimension of the
        last layer. Required for :meth:`~.Brick.allocate`.

    Notes
    -----
    See :class:`Initializable` for initialization parameters.

    Note that the ``weights_init``, ``biases_init`` and ``use_bias``
    configurations will overwrite those of the layers each time the
    :class:`MLP` is re-initialized. For more fine-grained control, push the
    configuration to the child layers manually before initialization.

    The children are registered in the order activations, batch norms,
    linear transformations, so address the linear layers through
    ``linear_transformations`` rather than ``children[0]``.

    >>> from blocks.initialization import IsotropicGaussian, Constant
    >>> mlp = MLP(activations=[Tanh(), None], dims=[30, 20, 10],
    ...           weights_init=IsotropicGaussian(),
    ...           biases_init=Constant(1))
    >>> mlp.push_initialization_config()  # Configure children
    >>> mlp.linear_transformations[0].weights_init = IsotropicGaussian(0.1)
    >>> mlp.initialize()

    """
    @lazy(allocation=['dims'])
    def __init__(self, activations, dims, **kwargs):
        super(MLP, self).__init__(**kwargs)
        self.activations = activations
        n_layers = len(activations)
        self.linear_transformations = [Linear(name='linear_{}'.format(i))
                                       for i in range(n_layers)]
        self.batch_norms = [BatchNorm(name='bn_{}'.format(i))
                            for i in range(n_layers)]

        # Registration order is kept as-is: activations, batch norms, linears.
        self.children.extend([a for a in self.activations if a is not None])
        self.children.extend(self.batch_norms)
        self.children.extend(self.linear_transformations)

        # With lazy allocation ``dims`` may be missing; use placeholders so
        # that the ``input_dim``/``output_dim`` setters can fill them later.
        if not dims:
            dims = [None] * (n_layers + 1)
        self.dims = dims

    @property
    def input_dim(self):
        """Dimension of the input of the first linear transformation."""
        return self.dims[0]

    @input_dim.setter
    def input_dim(self, value):
        self.dims[0] = value

    @property
    def output_dim(self):
        """Dimension of the output of the last linear transformation."""
        return self.dims[-1]

    @output_dim.setter
    def output_dim(self, value):
        self.dims[-1] = value

    def _layer_sequence(self):
        """Yield the bricks in application order, skipping ``None`` entries.

        The three lists have the same length, so interleaving them yields
        ``linear_0, bn_0, activation_0, linear_1, ...``.
        """
        for brick in interleave([self.linear_transformations,
                                 self.batch_norms, self.activations]):
            if brick is not None:
                yield brick

    @application(inputs=['input_'], outputs=['output'])
    def inference(self, input_):
        """Build the test-time graph.

        Batch-norm bricks are applied through ``inference`` (population
        statistics); every other brick through ``apply``.
        """
        out = input_
        for brick in self._layer_sequence():
            if isinstance(brick, BatchNorm):
                out = brick.inference(out)
            else:
                out = brick.apply(out)
        return out

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        """Build the training graph, normalizing with batch statistics."""
        out = input_
        for brick in self._layer_sequence():
            # NOTE(review): entries that are not Brick instances are skipped
            # here, whereas ``inference`` calls ``apply`` on them -- confirm
            # which of the two behaviours is intended.
            if isinstance(brick, Brick):
                out = brick.apply(out)
        return out

    def _push_allocation_config(self):
        """Propagate ``dims`` and ``use_bias`` to the child bricks.

        Raises
        ------
        ValueError
            If ``dims`` does not have exactly one more entry than there are
            linear transformations.
        """
        n_linear = len(self.linear_transformations)
        if len(self.dims) - 1 != n_linear:
            raise ValueError(
                'expected {} dims for {} linear transformations, got {}'
                .format(n_linear + 1, n_linear, len(self.dims)))
        for input_dim, output_dim, layer in \
                equizip(self.dims[:-1], self.dims[1:],
                        self.linear_transformations):
            layer.input_dim = input_dim
            layer.output_dim = output_dim
            layer.use_bias = self.use_bias
        # Each batch norm normalizes the output of its linear transformation.
        for dim, bn in equizip(self.dims[1:], self.batch_norms):
            bn.input_dim = dim


class BatchNorm(Activation):
    """Brick for Batch Normalization.

    It works with 2D tensors (statistics over axis 0), 3D tensors
    (statistics over axes 0 and 1) and 4D tensors (statistics over axes 0,
    2 and 3).

    The Batch Normalization paper:
    S. Ioffe, C. Szegedy, Batch Normalization: Accelerating Deep Network
    Training by Reducing Internal Covariate Shift.

    Parameters
    ----------
    input_dim : int
        The number of features (or features maps for convolutions).
    epsilon : float
        Small constant added to the variance before taking the square
        root, for numerical stability.

    Notes
    -----
    The brick remembers only the most recent call to :meth:`apply`
    (``training_input``, ``training_output``, ``batch_means`` and
    ``batch_vars``); :meth:`get_updates` and :meth:`get_replacements`
    refer to that call.

    Examples
    --------
    >>> import theano
    >>> from theano import tensor
    >>> x = tensor.matrix('x')
    Creating a network:
    >>> y = Linear(input_dim=10, output_dim=5).apply(x)
    >>> bn = BatchNorm(input_dim=5)
    >>> train_out = bn.apply(y)
    Creating both train and test computation graphs:
    >>> train_cg = ComputationGraph([train_out])
    >>> test_cg = create_inference_graph(train_cg)
    Preparing the update extension:
    >>> batch_size = 50 #The size of the batches
    >>> n_batches = 10 #The number of batches to use to update the stats.
    >>> scheme = ShuffledScheme(batch_size*n_batches, batch_size)
    >>> stream = DataStream(DATASET, iteration_scheme=scheme)
    >>> extensions.insert(0, BatchNormExtension(train_cg, stream, n_batches))
    """
    @lazy(allocation=['input_dim'])
    def __init__(self, input_dim, epsilon=1e-6, **kwargs):
        super(BatchNorm, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.epsilon = epsilon

    @property
    def gamma(self):
        """Scale parameter (first entry of ``parameters``)."""
        return self.parameters[0]

    @property
    def beta(self):
        """Shift parameter (second entry of ``parameters``)."""
        return self.parameters[1]

    def _allocate(self):
        """Create gamma/beta and the population statistics.

        gamma starts at one and beta at zero; the population means start
        at zero and the population variances at one.
        """
        gamma_val = numpy.ones(self.input_dim, dtype=floatX)
        gamma = shared(name='gamma', value=gamma_val)
        beta_val = numpy.zeros(self.input_dim, dtype=floatX)
        beta = shared(name='beta', value=beta_val)
        add_role(gamma, PARAMETER)
        add_role(beta, PARAMETER)
        self.parameters.append(gamma)
        self.parameters.append(beta)
        # Keeping track of the means and variances during the training.
        # These are not appended to ``parameters``.
        means_val = numpy.zeros(self.input_dim, dtype=floatX)
        self.pop_means = shared(name='means', value=means_val)
        vars_val = numpy.ones(self.input_dim, dtype=floatX)
        self.pop_vars = shared(name='variances', value=vars_val)

    def get_updates(self, n_batches):
        """Return the updates accumulating the population statistics.

        Each update adds ``1 / n_batches`` times the current batch
        statistic to the population statistic, so the population values
        must be reset to zero before accumulating over ``n_batches``
        batches.

        Parameters
        ----------
        n_batches : int
            The number of batches the statistics are averaged over.

        Returns
        -------
        list of (shared variable, expression) pairs
            One pair for the means and one for the variances.

        Notes
        -----
        Relies on ``batch_means`` and ``batch_vars``, which are set by
        :meth:`apply`; calling this before :meth:`apply` fails with an
        ``AttributeError``.
        """
        m_u = (self.pop_means, (self.pop_means
                                + 1./n_batches * self.batch_means))
        v_u = (self.pop_vars, (self.pop_vars
                               + 1./n_batches * self.batch_vars))
        return [m_u, v_u]

    def get_replacements(self):
        """Return the training-to-inference replacement for this brick.

        Returns
        -------
        dict
            Maps the output of the last :meth:`apply` call to the
            corresponding inference expression built on the same input,
            in the form expected by ``ComputationGraph.replace``.

        Notes
        -----
        Relies on ``training_input`` and ``training_output``, which are
        set by :meth:`apply`; calling this before :meth:`apply` fails with
        an ``AttributeError``.
        """
        return {self.training_output: self._inference(self.training_input)}

    def _normalize(self, input_, means, variances):
        """Normalize ``input_`` with the given statistics, then scale/shift.

        All per-feature vectors are broadcast against ``input_`` using
        ``self.pattern``, which :meth:`_check_input` must have set.
        """
        output = input_ - means.dimshuffle(*self.pattern)
        output /= tensor.sqrt(variances.dimshuffle(*self.pattern)
                              + self.epsilon)
        output *= self.gamma.dimshuffle(*self.pattern)
        output += self.beta.dimshuffle(*self.pattern)
        return output

    def _inference(self, input_):
        """Normalize with the stored population statistics."""
        return self._normalize(input_, self.pop_means, self.pop_vars)

    def _training(self, input_):
        """Normalize with the statistics of the current batch.

        The batch statistics are kept on the brick so that
        :meth:`get_updates` can use them.
        """
        self.batch_means = input_.mean(axis=self.axes, keepdims=False,
                                       dtype=floatX)
        self.batch_vars = input_.var(axis=self.axes, keepdims=False)
        return self._normalize(input_, self.batch_means, self.batch_vars)

    def _check_input(self, x):
        """Set the reduction axes and broadcast pattern from ``x.ndim``.

        Raises
        ------
        NotImplementedError
            If ``x`` is not a 2D, 3D or 4D tensor.
        """
        if x.ndim == 2:
            self.axes = [0]
            self.pattern = ['x', 0]
        elif x.ndim == 4:
            self.axes = [0, 2, 3]
            self.pattern = ['x', 0, 'x', 'x']
        elif x.ndim == 3:
            self.axes = [0, 1]
            self.pattern = ['x', 'x', 0]
        else:
            raise NotImplementedError(
                'BatchNorm supports 2D, 3D and 4D inputs, got {}D'
                .format(x.ndim))

    # NOTE(review): the ``@application`` decorators below are disabled in
    # the original code; the reason is not visible in this file -- confirm
    # before re-enabling them.
    #@application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        """Build the training output (normalization by batch statistics).

        The input and output are remembered for
        :meth:`get_replacements` and for ``BatchNormExtension``.
        """
        self._check_input(input_)
        self.training_input = input_
        self.training_output = self._training(input_)
        return self.training_output

    #@application(inputs=['input_'], outputs=['output'])
    def inference(self, input_):
        """Build the inference output (normalization by population stats)."""
        self._check_input(input_)
        return self._inference(input_)


class BatchNormExtension(SimpleExtension, MonitoringExtension):
    """Computes the population means and variance of the BatchNorm bricks
    in the network. This extension must be placed before any other
    monitoring.

    By default it runs before the first epoch and after every epoch; both
    can be overridden through the keyword arguments.

    Parameters
    ----------
    graph : instance of :class:`ComputationGraph`
        The training computation graph.
    data_stream : instance of :class:`DataStream`
        The data stream used to compute the population statistics on. It
        should provide n_batches only.
    n_batches: int
        The number of batches used to update the population statistics.

    Notes
    -----
    The accumulated statistics are scaled by ``1 / n_batches``, so they are
    only correct when the stream yields exactly ``n_batches`` batches per
    epoch. A warning is logged when that is not the case.
    """
    def __init__(self, graph, data_stream, n_batches, **kwargs):
        kwargs.setdefault("after_epoch", True)
        kwargs.setdefault("before_first_epoch", True)
        super(BatchNormExtension, self).__init__(**kwargs)
        self.n_batches = n_batches
        self.bricks = get_batch_norm_bricks(graph)
        self.data_stream = data_stream
        self.updates = self._get_updates()
        # The inputs needed to evaluate the batch statistics are those of
        # the graph producing the training outputs of the bricks.
        variables = [brick.training_output for brick in self.bricks]
        self._computation_graph = ComputationGraph(variables)
        # De-duplicate the inputs; the compiled function is called with
        # keyword arguments, so their order does not matter.
        self.inputs = list(set(self._computation_graph.inputs))
        self.inputs_names = [v.name for v in self.inputs]
        self._compile()

    def _get_updates(self):
        """Collect the population-statistics updates of all the bricks."""
        updates = []
        for brick in self.bricks:
            updates.extend(brick.get_updates(self.n_batches))
        return updates

    def _reset(self, x):
        """Set the shared variable ``x`` to zeros of the same shape."""
        x.set_value(numpy.zeros(x.get_value().shape, dtype=floatX))

    def _compile(self):
        """Compile the function applying the updates; it has no outputs."""
        self._fun = function(self.inputs, [], updates=self.updates,
                             on_unused_input='ignore')

    def _evaluate(self):
        """Apply the updates once for every batch of the data stream."""
        n_seen = 0
        for batch in self.data_stream.get_epoch_iterator(as_dict=True):
            # Keep only the sources the compiled function takes as inputs.
            batch = dict_subset(batch, self.inputs_names)
            self._fun(**batch)
            n_seen += 1
        if n_seen != self.n_batches:
            # The updates are pre-scaled by 1 / n_batches, so any other
            # number of batches leaves the statistics mis-scaled.
            logger.warning('Population statistics were accumulated over %d '
                           'batches but are scaled for %d batches',
                           n_seen, self.n_batches)

    def do(self, which_callback, *args):
        """Recompute the population statistics from scratch."""
        logger.info('Computation of population statistics started')
        # 1. Reset the pop means and vars
        for brick in self.bricks:
            self._reset(brick.pop_means)
            self._reset(brick.pop_vars)
        # 2. Update them
        self._evaluate()
        logger.info('Computation of population statistics finished')


def create_inference_graph(graph):
    """Derive the inference graph from a training computation graph.

    The replacements reported by every batch norm brick found in ``graph``
    (through its ``get_replacements()`` method) are merged into a single
    mapping, which is then applied to the graph in one ``replace`` call.

    Parameters
    ----------
    graph : instance of :class:`ComputationGraph`
        The training computation graph.

    Returns
    -------
    The result of ``graph.replace`` with the merged replacements.
    """
    substitutions = {}
    for bn_brick in get_batch_norm_bricks(graph):
        substitutions.update(bn_brick.get_replacements())
    return graph.replace(substitutions)


def get_batch_norm_bricks(graph):
    """Collect the distinct :class:`BatchNorm` bricks of a computation graph.

    Each variable of the graph is mapped to a brick with ``get_brick``;
    bricks that are :class:`BatchNorm` instances are kept once each, in the
    order in which they are first encountered.

    Parameters
    ----------
    graph : instance of :class:`ComputationGraph`
        The training computation graph.

    Returns
    -------
    list of :class:`BatchNorm`
        The batch norm bricks, without duplicates.
    """
    found = []
    for var in graph.variables:
        candidate = get_brick(var)
        # Skip anything that is not a batch norm brick or is already listed.
        if not isinstance(candidate, BatchNorm) or candidate in found:
            continue
        found.append(candidate)
    return found