@@ -83,7 +83,7 @@ def register_layer(
     default_args (list): The default parameters to add to the function.
     default_kwargs (dict): The default parameters to add to the function.
       Those arguments can be overwritten when calling the function.
-    use_dp (bool): Wrap the function call within a dataparalellism object if
+    use_dp (bool): Wrap the function call within a dataparallelism object if
       dp is available. Some layers (like MOE) must be called without dp.
     recompute_grad (bool): If True, recompute the function during the
       backward pass to save memory
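The recompute_grad flag documented here trades compute for memory: the wrapped function's activations are not kept for backpropagation but are recomputed when gradients are needed. A minimal sketch of that idea, assuming TF2's tf.recompute_grad rather than the registry machinery above (the names block, x, and w are illustrative):

import tensorflow as tf

x = tf.random.normal([8, 128])
w = tf.random.normal([128, 128])

@tf.recompute_grad
def block(inputs, weights):
  # Intermediate activations of this block are recomputed during the
  # backward pass instead of being stored, saving memory at the cost of
  # an extra forward pass.
  return tf.nn.relu(tf.matmul(inputs, weights))

with tf.GradientTape() as tape:
  tape.watch([x, w])
  loss = tf.reduce_sum(block(x, w))
dx, dw = tape.gradient(loss, [x, w])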
@@ -1378,7 +1378,7 @@ def _relative_attention_inner(x, y, z, transpose):
     x: Tensor with shape [batch_size, heads, length, length or depth].
     y: Tensor with shape [batch_size, heads, length, depth].
     z: Tensor with shape [length, length, depth].
-    transpose: Whether to tranpose inner matrices of y and z. Should be true if
+    transpose: Whether to transpose inner matrices of y and z. Should be true if
       last dimension of x is depth, not length.

   Returns:
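Following the shapes listed above, here is a hedged einsum sketch of what this inner computation does (the library uses batched matmuls and reshapes rather than einsum, so treat this as an illustration of the semantics only): when transpose is true, x carries queries and the contraction is over depth; otherwise x carries attention weights and the contraction is over length.

import tensorflow as tf

def relative_attention_inner_sketch(x, y, z, transpose):
  # x: [batch, heads, length, length or depth]
  # y: [batch, heads, length, depth]; z: [length, length, depth]
  if transpose:
    # x holds queries (last dim is depth): result is [batch, heads, length, length].
    return (tf.einsum("bhid,bhjd->bhij", x, y) +
            tf.einsum("bhid,ijd->bhij", x, z))
  # x holds attention weights (last dim is length): result's last dim is depth.
  return (tf.einsum("bhij,bhjd->bhid", x, y) +
          tf.einsum("bhij,ijd->bhid", x, z))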
@@ -1422,7 +1422,7 @@ def dot_product_attention_relative(q,
     k: a Tensor with shape [batch, heads, length, depth].
     v: a Tensor with shape [batch, heads, length, depth].
     bias: bias Tensor.
-    max_relative_position: an integer specifying the maxmimum distance between
+    max_relative_position: an integer specifying the maximum distance between
       inputs that unique position embeddings should be learned for.
     dropout_rate: a floating point number.
     image_shapes: optional tuple of integer scalars.
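To make max_relative_position concrete: pairwise distances are clipped to that window, so only 2 * max_relative_position + 1 distinct relative embeddings need to be learned. A small sketch of the index computation under that assumption (relative_position_indices is a hypothetical helper, not the library's function):

import tensorflow as tf

def relative_position_indices(length, max_relative_position):
  # Signed distance j - i for every (query i, key j) pair: [length, length].
  positions = tf.range(length)
  distance = positions[None, :] - positions[:, None]
  # Clip distances outside the window, then shift so indices start at 0 and
  # address an embedding table of size 2 * max_relative_position + 1.
  clipped = tf.clip_by_value(distance, -max_relative_position,
                             max_relative_position)
  return clipped + max_relative_position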
@@ -2141,7 +2141,7 @@ def gather_indices_2d(x, block_shape, block_stride):


 def make_2d_block_raster_mask(query_shape, memory_flange):
-  """creates a mask for 2d block raster scany.
+  """creates a mask for 2d block raster scan.

   The query mask can look to the left, top left, top, and top right, but
   not to the right. Inside the query, we have the standard raster scan
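As a toy illustration of the "standard raster scan" masking mentioned here (an assumption-laden sketch of just that part, not the full 2d block mask, which also has to handle the memory flange): positions flattened in raster order are masked causally, so each position may only attend to positions at or before it in that order.

import numpy as np

query_height, query_width = 2, 4
block_size = query_height * query_width
# 1.0 where attention is allowed in this toy convention (the library may use
# the opposite convention); row i is the query at raster position i.
raster_scan_mask = np.tril(np.ones((block_size, block_size), dtype=np.float32))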
@@ -2661,7 +2661,7 @@ def ffn_self_attention_layer(x,
   We use self-attention to do feedforward computations. We apply this function
   positionwise where for each position, we linearly transform the output to have
   depth filter_depth, and break up the result depth-wise into num_parts
-  contiguous parts. The parts self-attentd, we concatenate the results
+  contiguous parts. The parts self-attend, we concatenate the results
   depth-wise, and we linearly transform to a depth of output_depth. The
   goal is to get multiplicative interactions between components of a
   representation.
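Because the description above is dense, here is a rough self-contained sketch of the idea, assuming TF2/Keras layers rather than the library's own conv1d helpers: project each position to filter_depth, split that depth into num_parts parts, let the parts attend to one another within the position, concatenate them, and project to output_depth.

import tensorflow as tf

def ffn_self_attention_sketch(x, filter_depth, num_parts, output_depth):
  # x: [batch, length, input_depth]
  part_depth = filter_depth // num_parts
  batch, length = tf.shape(x)[0], tf.shape(x)[1]

  # Positionwise linear transforms to depth filter_depth, split into parts.
  q = tf.keras.layers.Dense(filter_depth)(x)
  kv = tf.keras.layers.Dense(filter_depth)(x)
  q = tf.reshape(q, [batch, length, num_parts, part_depth])
  kv = tf.reshape(kv, [batch, length, num_parts, part_depth])

  # Within each position, the num_parts parts attend to one another.
  logits = tf.einsum("blpd,blqd->blpq", q, kv)
  logits /= tf.sqrt(tf.cast(part_depth, logits.dtype))
  weights = tf.nn.softmax(logits, axis=-1)
  attended = tf.einsum("blpq,blqd->blpd", weights, kv)

  # Concatenate the parts depth-wise and project to output_depth.
  attended = tf.reshape(attended, [batch, length, filter_depth])
  return tf.keras.layers.Dense(output_depth)(attended)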
@@ -2764,7 +2764,7 @@ def parameter_attention(x,
         x, total_key_depth, use_bias=False, name="q_transform")
     if dropout_rate:
       # This is a cheaper form of attention dropout where we use to use
-      # the same dropout decisions across batch elemets and query positions,
+      # the same dropout decisions across batch elements and query positions,
       # but different decisions across heads and memory positions.
       v = tf.nn.dropout(
           v, 1.0 - dropout_rate, noise_shape=[num_heads, memory_rows, 1])
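The noise_shape in the quoted call is what implements the comment: one dropout decision is drawn per (head, memory row) and broadcast over the trailing value-depth axis, and since v has no batch or query axis the same decisions apply to every batch element and query position. A tiny illustration using TF2's rate-based signature (the TF1 code above passes keep_prob = 1.0 - dropout_rate):

import tensorflow as tf

num_heads, memory_rows, depth = 4, 6, 8
v = tf.random.normal([num_heads, memory_rows, depth])
# One Bernoulli draw per (head, memory row), broadcast across the depth axis.
v_dropped = tf.nn.dropout(v, rate=0.3, noise_shape=[num_heads, memory_rows, 1])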