@@ -1090,6 +1090,280 @@ def pad_l_and_r(x, pad_length):
     return output
 
 
+def reshape_by_blocks(x, x_shape, memory_block_size):
+  x = tf.reshape(x, [
+      x_shape[0], x_shape[1], x_shape[2] // memory_block_size,
+      memory_block_size, x_shape[3]
+  ])
+  return x
+
+
+def dilated_self_attention_1d(q,
+                              k,
+                              v,
+                              query_block_size=128,
+                              memory_block_size=128,
+                              gap_size=2,
+                              num_memory_blocks=2,
+                              name=None):
+  """Dilated self-attention.
+
+  Args:
+    q: a Tensor with shape [batch, heads, length, depth_k]
+    k: a Tensor with shape [batch, heads, length, depth_k]
+    v: a Tensor with shape [batch, heads, length, depth_v]
+    query_block_size: an integer indicating the size of a query block.
+    memory_block_size: an integer indicating the size of a memory block.
+    gap_size: an integer indicating the gap, in positions, between memory blocks.
+    num_memory_blocks: how many memory blocks to look at to the left and right.
+      Each will be separated by gap_size.
+    name: an optional string
+
+  Returns:
+    a Tensor of shape [batch, heads, length, depth_v]
+  """
+  with tf.variable_scope(
+      name, default_name="dilated_self_attention_1d", values=[q, k, v]):
+    v_list_shape = v.get_shape().as_list()
+    v_shape = tf.shape(v)
+    depth_v = v_shape[3]
+    batch_size = v_shape[0]
+    num_heads = v_shape[1]
+    original_length = tf.shape(q)[2]
+    # Make sure the sequence length is a multiple of query_block_size.
+    def pad_to_multiple(x, pad_length):
+      x_length = tf.shape(x)[2]
+      return tf.pad(x, [[0, 0], [0, 0], [0, -x_length % pad_length], [0, 0]])
+
+    def pad_l_and_r(x, pad_length):
+      return tf.pad(x, [[0, 0], [0, 0], [pad_length, pad_length], [0, 0]])
+
+    q = pad_to_multiple(q, query_block_size)
+    v = pad_to_multiple(v, query_block_size)
+    k = pad_to_multiple(k, query_block_size)
+
+    q.set_shape(v_list_shape)
+    v.set_shape(v_list_shape)
+    k.set_shape(v_list_shape)
+    # Shape of q after padding.
+    new_q_shape = tf.shape(q)
+    # Set up query blocks.
+    q = reshape_by_blocks(q, new_q_shape, query_block_size)
+    self_k_part = reshape_by_blocks(k, new_q_shape, query_block_size)
+    self_v_part = reshape_by_blocks(v, new_q_shape, query_block_size)
+
+    # Set up key and value windows.
+    k_v_padding = (gap_size + memory_block_size) * num_memory_blocks
+    k = pad_l_and_r(k, k_v_padding)
+    v = pad_l_and_r(v, k_v_padding)
+    # Get the gather indices.
+    index_length = (new_q_shape[2] - query_block_size + memory_block_size)
+    indices = tf.range(0, index_length, delta=1, name="index_range")
+    # Make the indices [1, length, 1] to apply convs.
+    indices = tf.reshape(indices, [1, -1, 1])
+    kernel = tf.expand_dims(tf.eye(memory_block_size), axis=1)
+    gather_indices = tf.nn.conv1d(
+        tf.cast(indices, tf.float32),
+        kernel,
+        query_block_size,
+        padding="VALID",
+        name="gather_conv")
+
+    gather_indices = tf.squeeze(tf.cast(gather_indices, tf.int32), axis=0)
+
+    # Get the left and right memory blocks for each query.
+    # [length, batch, heads, dim]
+    k_t = tf.transpose(k, [2, 0, 1, 3])
+    v_t = tf.transpose(v, [2, 0, 1, 3])
+    left_k = gather_dilated_memory_blocks(k_t[:-k_v_padding, :, :, :],
+                                          num_memory_blocks, gap_size,
+                                          query_block_size, memory_block_size,
+                                          gather_indices)
+    left_v = gather_dilated_memory_blocks(v_t[:-k_v_padding, :, :, :],
+                                          num_memory_blocks, gap_size,
+                                          query_block_size, memory_block_size,
+                                          gather_indices)
+
+    right_k = gather_dilated_memory_blocks(k_t[k_v_padding:, :, :, :],
+                                           num_memory_blocks, gap_size,
+                                           query_block_size, memory_block_size,
+                                           gather_indices, direction="right")
+    right_v = gather_dilated_memory_blocks(v_t[k_v_padding:, :, :, :],
+                                           num_memory_blocks, gap_size,
+                                           query_block_size, memory_block_size,
+                                           gather_indices, direction="right")
+
+    k_windows = tf.concat([left_k, self_k_part, right_k], axis=3)
+    v_windows = tf.concat([left_v, self_v_part, right_v], axis=3)
+    attention_bias = tf.expand_dims(
+        embedding_to_padding(k_windows) * -1e9, axis=-2)
+
+    output = dot_product_attention(
+        q, k_windows, v_windows, attention_bias, dropout_rate=0.,
+        name="dilated_1d", make_image_summary=False)
+    output = tf.reshape(output, [batch_size, num_heads, -1, depth_v])
+    # Remove the padding if it was introduced.
+    output = tf.slice(output, [0, 0, 0, 0], [-1, -1, original_length, -1])
+    output.set_shape(v_list_shape)
+    return output
+
+
+def gather_dilated_memory_blocks(x, num_memory_blocks, gap_size,
+                                 query_block_size, memory_block_size,
+                                 gather_indices, direction="left"):
+  """Gathers blocks with gaps in between.
+
+  Args:
+    x: a tensor of shape [length, batch, heads, depth]
+    num_memory_blocks: how many memory blocks to look at in "direction".
+      Each will be separated by gap_size.
+    gap_size: an integer indicating the gap size between memory blocks.
+    query_block_size: an integer indicating the size of a query block.
+    memory_block_size: an integer indicating the size of a memory block.
+    gather_indices: the indices to gather from.
+    direction: "left" or "right"
+  Returns:
+    a tensor of shape [batch, heads, blocks, block_length, depth]
+  """
+
+  gathered_blocks = []
+  # Gather the memory blocks.
+  for block_id in range(num_memory_blocks):
+    block_end_index = -(query_block_size +
+                        gap_size * (block_id + 1) + memory_block_size *
+                        block_id) - 1
+    block_start_index = (
+        (memory_block_size + gap_size) *
+        (num_memory_blocks - (block_id + 1))
+    )
+    if direction != "left":
+      [block_end_index, block_start_index] = [
+          -block_start_index - 1, -block_end_index + 1
+      ]
+    def gather_dilated_1d_blocks(x, gather_indices):
+      x_new = tf.gather(x, gather_indices)
+      # [batch, heads, blocks, block_length, dim]
+      return tf.transpose(x_new, [2, 3, 0, 1, 4])
+
+    gathered_blocks.append(
+        gather_dilated_1d_blocks(x[block_start_index:block_end_index],
+                                 gather_indices))
+  return tf.concat(gathered_blocks, 3)
+
+
+def masked_dilated_self_attention_1d(q,
+                                     k,
+                                     v,
+                                     query_block_size=64,
+                                     memory_block_size=64,
+                                     gap_size=2,
+                                     num_memory_blocks=2,
+                                     name=None):
+  """Masked dilated self-attention.
+
+  Args:
+    q: a Tensor with shape [batch, heads, length, depth_k]
+    k: a Tensor with shape [batch, heads, length, depth_k]
+    v: a Tensor with shape [batch, heads, length, depth_v]
+    query_block_size: an integer indicating the size of a query block.
+    memory_block_size: an integer indicating the size of a (left) memory block.
+    gap_size: an integer indicating the gap, in positions, between memory blocks.
+    num_memory_blocks: how many memory blocks to look at to the left. Each will
+      be separated by gap_size.
+    name: an optional string
+
+  Returns:
+    a Tensor of shape [batch, heads, length, depth_v]
+  """
+  with tf.variable_scope(
+      name, default_name="masked_dilated_self_attention_1d", values=[q, k, v]):
+    v_list_shape = v.get_shape().as_list()
+    v_shape = tf.shape(v)
+    depth_v = v_shape[3]
+    batch_size = v_shape[0]
+    num_heads = v_shape[1]
+    original_length = tf.shape(q)[2]
+    # Make sure the sequence length is a multiple of query_block_size.
+    def pad_to_multiple(x, pad_length):
+      x_length = tf.shape(x)[2]
+      return tf.pad(x, [[0, 0], [0, 0], [0, -x_length % pad_length], [0, 0]])
+
+    def pad_l(x, left_pad_length):
+      return tf.pad(x, [[0, 0], [0, 0], [left_pad_length, 0], [0, 0]])
+
+    q = pad_to_multiple(q, query_block_size)
+    v = pad_to_multiple(v, query_block_size)
+    k = pad_to_multiple(k, query_block_size)
+    q.set_shape(v_list_shape)
+    v.set_shape(v_list_shape)
+    k.set_shape(v_list_shape)
+    # Shape of q after padding.
+    new_q_shape = tf.shape(q)
+
+    # Set up query blocks.
+    q = reshape_by_blocks(q, new_q_shape, query_block_size)
+    self_k_part = reshape_by_blocks(k, new_q_shape, query_block_size)
+    self_v_part = reshape_by_blocks(v, new_q_shape, query_block_size)
+    # Set up key and value windows.
+    k_v_padding = (gap_size + memory_block_size) * num_memory_blocks
+    k = pad_l(k, k_v_padding)
+    v = pad_l(v, k_v_padding)
+    # Get the gather indices.
+    index_length = (new_q_shape[2] - query_block_size + memory_block_size)
+
+    indices = tf.range(0, index_length, delta=1, name="index_range")
+    # Make the indices [1, length, 1] to apply convs.
+    indices = tf.reshape(indices, [1, -1, 1])
+    kernel = tf.expand_dims(tf.eye(memory_block_size), axis=1)
+    gather_indices = tf.nn.conv1d(
+        tf.cast(indices, tf.float32),
+        kernel,
+        query_block_size,
+        padding="VALID",
+        name="gather_conv")
+    gather_indices = tf.squeeze(tf.cast(gather_indices, tf.int32), axis=0)
+
+    # Get the left memory blocks for each query.
+    # [length, batch, heads, dim]
+    k_t = tf.transpose(k, [2, 0, 1, 3])
+    v_t = tf.transpose(v, [2, 0, 1, 3])
+
+    k_unmasked_windows = gather_dilated_memory_blocks(k_t, num_memory_blocks,
+                                                      gap_size,
+                                                      query_block_size,
+                                                      memory_block_size,
+                                                      gather_indices)
+    v_unmasked_windows = gather_dilated_memory_blocks(v_t, num_memory_blocks,
+                                                      gap_size,
+                                                      query_block_size,
+                                                      memory_block_size,
+                                                      gather_indices)
+
+    # Build the attention bias: causal within blocks, padding mask for memory.
+    block_q_shape = tf.shape(q)
+    masked_attention_bias = tf.tile(
+        tf.expand_dims(attention_bias_lower_triangle(query_block_size),
+                       axis=0),
+        [block_q_shape[0], block_q_shape[1], block_q_shape[2], 1, 1])
+    padding_attention_bias = tf.expand_dims(
+        embedding_to_padding(k_unmasked_windows) * -1e9, axis=-2)
+    padding_attention_bias = tf.tile(padding_attention_bias,
+                                     [1, 1, 1, query_block_size, 1])
+    attention_bias = tf.concat([masked_attention_bias, padding_attention_bias],
+                               axis=-1)
+    # Combine memory windows.
+    k_windows = tf.concat([self_k_part, k_unmasked_windows], 3)
+    v_windows = tf.concat([self_v_part, v_unmasked_windows], 3)
+    output = dot_product_attention(
+        q, k_windows, v_windows, attention_bias, dropout_rate=0.,
+        name="dilated_1d", make_image_summary=False)
+    output = tf.reshape(output, [batch_size, num_heads, -1, depth_v])
+    # Remove the padding if it was introduced.
+    output = tf.slice(output, [0, 0, 0, 0], [-1, -1, original_length, -1])
+    output.set_shape(v_list_shape)
+    return output
+
+
 def local_attention_2d(q,
                        k,
                        v,
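Note on the gather pattern: the slicing arithmetic in gather_dilated_memory_blocks is easier to see with a small pure-Python sketch. The helper below is hypothetical (not part of this commit) and reflects my reading of the left-side gather: for a given query block, each of the num_memory_blocks left memory blocks spans memory_block_size positions, consecutive blocks are separated by gap_size positions, and negative indices fall into the zero padding.

def left_memory_ranges(block_idx, query_block_size, memory_block_size,
                       gap_size, num_memory_blocks):
  # Hypothetical helper: index ranges [start, end), in the unpadded sequence,
  # of the left memory blocks attended to by query block `block_idx`.
  q_start = block_idx * query_block_size
  return [(q_start - i * (memory_block_size + gap_size),
           q_start - i * (memory_block_size + gap_size) + memory_block_size)
          for i in range(1, num_memory_blocks + 1)]

# With the masked defaults (query/memory block 64, gap 2, 2 memory blocks),
# query block 2 yields [(62, 126), (-4, 60)]: one block ending two positions
# left of the queries, and a second block partially inside the padding.
print(left_memory_ranges(2, 64, 64, 2, 2))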
@@ -1441,6 +1715,8 @@ def multihead_attention(query_antecedent,
                         q_padding="VALID",
                         kv_padding="VALID",
                         cache=None,
+                        gap_size=0,
+                        num_memory_blocks=2,
                         name=None,
                         **kwargs):
14461722 """Multihead scaled-dot-product attention with input/output transformations.
@@ -1475,6 +1751,10 @@ def multihead_attention(query_antecedent,
       be empty Tensors of the appropriate shape.
           'k' [batch_size, 0, key_channels]
           'v' [batch_size, 0, value_channels]
+    gap_size: Integer option for dilated attention to indicate spacing between
+      memory blocks.
+    num_memory_blocks: Integer option to indicate how many memory blocks to
+      look at.
     name: an optional string
     **kwargs (dict): Params for the attention function
 
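The gap_size and num_memory_blocks options documented above are forwarded to the dilated attention functions added in the first hunk. A minimal graph-mode shape check might look like the sketch below, assuming TF 1.x and that this file is importable as tensor2tensor.layers.common_attention (path assumed; the commit itself adds no test):

import tensorflow as tf
from tensor2tensor.layers import common_attention  # import path assumed

# [batch, heads, length, depth]; length 256 is a multiple of the block size.
q = tf.random_normal([2, 4, 256, 64])
k = tf.random_normal([2, 4, 256, 64])
v = tf.random_normal([2, 4, 256, 64])
out = common_attention.dilated_self_attention_1d(
    q, k, v, query_block_size=128, memory_block_size=128,
    gap_size=2, num_memory_blocks=2)
print(out.get_shape())  # (2, 4, 256, 64): [batch, heads, length, depth_v]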
@@ -1542,13 +1822,22 @@ def multihead_attention(query_antecedent,
                                          dropout_rate, image_shapes)
     elif attention_type == "local_mask_right":
       x = masked_local_attention_1d(q, k, v, block_length=block_length)
-    else:
-      assert attention_type == "local_unmasked"
+    elif attention_type == "local_unmasked":
       x = local_attention_1d(
           q, k, v, block_length=block_length, filter_width=block_width)
+    elif attention_type == "masked_dilated_1d":
+      x = masked_dilated_self_attention_1d(q, k, v, block_length,
+                                           block_width,
+                                           gap_size,
+                                           num_memory_blocks)
+    else:
+      assert attention_type == "unmasked_dilated_1d"
+      x = dilated_self_attention_1d(q, k, v, block_length,
+                                    block_width,
+                                    gap_size,
+                                    num_memory_blocks)
     x = combine_heads(x)
     x = common_layers.conv1d(x, output_depth, 1, name="output_transform")
-
     if additional_returned_value is not None:
       return x, additional_returned_value
     return x
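At the multihead level the new code paths are selected through attention_type. A hypothetical invocation sketch follows, assuming the rest of multihead_attention's signature is unchanged from the existing function and with purely illustrative sizes (not taken from the commit):

import tensorflow as tf
from tensor2tensor.layers import common_attention  # import path assumed

x = tf.random_normal([2, 256, 512])  # [batch, length, hidden_dim]
y = common_attention.multihead_attention(
    query_antecedent=x,
    memory_antecedent=None,  # self-attention
    bias=None,               # bias is not used by the dilated branches
    total_key_depth=512,
    total_value_depth=512,
    output_depth=512,
    num_heads=8,
    dropout_rate=0.0,
    attention_type="unmasked_dilated_1d",  # or "masked_dilated_1d"
    block_length=128,        # passed through as query_block_size
    block_width=128,         # passed through as memory_block_size
    gap_size=2,
    num_memory_blocks=2)
# y has shape [2, 256, 512], i.e. [batch, length, output_depth].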