This repository was archived by the owner on Nov 15, 2022. It is now read-only.

Commit 825729c

cpuhrsch authored and facebook-github-bot committed
20210526 nestedtensor import
Reviewed By: bhosmer

Differential Revision: D28712349

fbshipit-source-id: 559595fae8f3cff2cfc63fa3c4def5d402a1877d
1 parent ea4c8fa commit 825729c

File tree

7 files changed: +81 -160 lines changed


benchmarks/mha_cuda.py

Lines changed: 29 additions & 11 deletions
@@ -4,8 +4,8 @@


 @torch.inference_mode()
-def benchmark_torch_function(iters, f, *args):
-    f(*args)
+def benchmark_torch_function(iters, f, *args, **kwargs):
+    f(*args, **kwargs)
     if torch.cuda.is_available():
         torch.cuda.synchronize()
         start_event = torch.cuda.Event(enable_timing=True)
@@ -14,7 +14,7 @@ def benchmark_torch_function(iters, f, *args):
     else:
         t0 = time.time()
     for _ in range(iters):
-        f(*args)
+        f(*args, **kwargs)
     if torch.cuda.is_available():
         end_event.record()
         torch.cuda.synchronize()
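
For reference, a minimal self-contained sketch of the CUDA-event timing pattern this helper uses; the return and unit-conversion step is an assumption, since it falls outside the hunks shown above:

import time
import torch

def benchmark_sketch(iters, f, *args, **kwargs):
    # Warm-up call outside the timed region.
    f(*args, **kwargs)
    if torch.cuda.is_available():
        # CUDA kernels launch asynchronously; events measure device time.
        torch.cuda.synchronize()
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()
    else:
        t0 = time.time()
    for _ in range(iters):
        f(*args, **kwargs)
    if torch.cuda.is_available():
        end_event.record()
        torch.cuda.synchronize()
        # elapsed_time returns milliseconds; convert to microseconds.
        return start_event.elapsed_time(end_event) * 1e3
    return (time.time() - t0) * 1e6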
@@ -38,15 +38,33 @@ def run(bdim, embedding_dim, nhead, min_t, max_t, iters, device):
     nt = nestedtensor.nested_tensor(tensors, device=device, dtype=torch.float)

     # Create MHA with self-attention in mind
-    lin = torch.nn.MultiheadAttention(embedding_dim, nhead).to(device).eval()
-    nt_time = benchmark_torch_function(iters, lin, nt, nt, nt)
-    # import sys; sys.exit(1)
+    mha = torch.nn.MultiheadAttention(embedding_dim, nhead).to(device).eval()
+
+    # Create regular padded Tensor with corresponding mask
+    data, mask = nt.to_tensor_mask(mask_dim=2)
+    # Prepare input for torch.nn.MHA, which is batch second for Tensor input
+    data = data.transpose(0, 1)
+    not_mask = torch.logical_not(mask)
+
+    # Comparison test to show correctness and API differences
+    with torch.inference_mode():
+        nt_output, _ = mha(nt, nt, nt, need_weights=False)
+        t_output, _ = mha(data, data, data, key_padding_mask=not_mask, need_weights=False)
+    nt_output_padded = nt_output.to_padded_tensor(padding=0)
+    t_output = t_output.transpose(0, 1)
+    # Fill in zero for masked-out values to enable comparison
+    t_output = t_output * mask.unsqueeze(-1)
+    # Tolerances taken from torch/testing/_core.py
+    assert torch.isclose(nt_output_padded, t_output, rtol=1e-4, atol=1e-5).all().item()
+
+    # Time NT version
+    nt_time = benchmark_torch_function(iters, mha, nt, nt, nt, need_weights=False)

-    # Created regular padded Tensor
-    data = nt.to_padded_tensor(padding=0)
     # Amount of storage used for padding only
     percentage_padded = 100 * (data.numel() - nt.numel()) / data.numel()
-    t_time = benchmark_torch_function(iters, lin, data, data, data)
+
+    # Time Tensor version
+    t_time = benchmark_torch_function(iters, mha, data, data, data, key_padding_mask=not_mask, need_weights=False)

     print(f"batch size: {bdim:4.0f}, embedding dim: {embedding_dim}, nhead: {nhead}, T mean:{lengths_mean:5.0f}, T std: {lengths_std:4.0f}", end='')
     print(f", padding: {percentage_padded:3.0f}%, NT: {nt_time/iters:4.0f}us, T: {t_time/iters:4.0f}us, Speedup: {t_time/nt_time:3.2f}x")
@@ -56,10 +74,10 @@ def run(bdim, embedding_dim, nhead, min_t, max_t, iters, device):
 if torch.cuda.is_available():
     print("CUDA device: ", torch.cuda.get_device_name(0))
     device = torch.device('cuda')
-    iters = 1000
+    iters = 10
     for nhead in [2, 4, 8]:
         print("")
-        for embed_dim in [128, 256, 512, 1024]:
+        for embed_dim in [1024, 512, 256, 128]:
             print("")
             for min_t, max_t in [(16, 128), (32, 128), (64, 128), (128, 128)]:
                 run(256, embed_dim, nhead, min_t, max_t, iters, device)
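
One subtlety in the comparison above: nt.to_tensor_mask returns a mask that is True for valid positions, while torch.nn.MultiheadAttention's key_padding_mask expects True for positions to ignore, hence the torch.logical_not. A small sketch of that convention, assuming the nestedtensor package built from this repo:

import torch
import nestedtensor

nt = nestedtensor.nested_tensor(
    [torch.randn(5, 8), torch.randn(3, 8)], dtype=torch.float)
data, mask = nt.to_tensor_mask(mask_dim=2)  # mask is True where data is real
key_padding_mask = torch.logical_not(mask)  # True where data is padding
print(key_padding_mask[1])                  # masks the two padded positions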

nestedtensor/csrc/cuda/mha.cpp

Lines changed: 21 additions & 43 deletions
@@ -19,18 +19,6 @@ using namespace at;
 namespace torch {
 namespace nested_tensor {

-at::Tensor _sequence_mask(at::Tensor lengths) {
-  int64_t batch_size = lengths.numel();
-  int64_t max_len = lengths.max().item<int64_t>();
-  at::Tensor mask = torch::arange(0, max_len, torch::kFloat);
-  mask = mask.repeat({batch_size, 1});
-  mask = mask.lt(lengths.unsqueeze(1));
-  mask = mask.to(torch::kCUDA);
-  mask = mask.view({-1, 1, 1, max_len});
-  at::Tensor m2 = mask.transpose(2, 3);
-  return mask * m2;
-}
-
 at::Tensor bt_min_mha(
     int64_t num_heads,
     int64_t head_dim,
@@ -39,12 +27,8 @@ at::Tensor bt_min_mha(
     at::Tensor query,
     at::Tensor key,
     at::Tensor value,
-    at::Tensor attr_kernel_Q,
-    at::Tensor attr_kernel_K,
-    at::Tensor attr_kernel_V,
-    at::Tensor attr_bias_Q,
-    at::Tensor attr_bias_K,
-    at::Tensor attr_bias_V,
+    at::Tensor attr_kernel,
+    at::Tensor attr_bias,
     double scaling,
     at::Tensor out_proj_weight,
     at::Tensor out_proj_bias) {
@@ -90,9 +74,8 @@ at::Tensor bt_min_mha(
   TORCH_CHECK(query_esize.height() == 1, "Query nested dim isn't 1.");
   auto query_esize_sizes = query_esize.sizes();

-  at::Tensor attr_mask = _sequence_mask(
-      at::native::select(query_esize_sizes, 1, 0).contiguous());
-  attr_mask = attr_mask.to(float_options);
+  at::Tensor attr_mask = input_mask.view({-1, 1, 1, seq_len}).to(float_options);
+  attr_mask = attr_mask * attr_mask.transpose(2, 3);

   nteffectivetransformer::exclusiveScan_kernelLauncher(
       prefix_sum_ptr,
@@ -111,31 +94,29 @@ at::Tensor bt_min_mha(
       (int32_t)(embedding_dim),
       defaultStream);

-  // std::cout << "input_mask: " << input_mask << std::endl;
-  // std::cout << "prefix_sum: " << prefix_sum << std::endl;
-  // std::cout << "batch_idx: " << batch_idx << std::endl;
-  // std::cout << "word_idx: " << word_idx << std::endl;
-
-  at::Tensor q, k, v;
-  q = at::addmm(attr_bias_Q, query, attr_kernel_Q.t());
-  k = at::addmm(attr_bias_K, key, attr_kernel_K.t());
-  v = at::addmm(attr_bias_V, value, attr_kernel_V.t());
-  at::Tensor q_buf = get_buffer(q);
-  at::Tensor k_buf = get_buffer(k);
-  at::Tensor v_buf = get_buffer(v);
-
-  int valid_word_num = prefix_sum.reshape({-1})[word_num - 1].item<int>();
-  int last_mask = input_mask.reshape({-1})[word_num - 1].item<int>();
-  if (last_mask == 1) {
-    valid_word_num++;
-  }
+  at::Tensor packed = at::matmul(query, attr_kernel.t());
+  at::Tensor packed_buf = get_buffer(packed).contiguous().reshape({-1, 3 * embedding_dim});
+  std::vector<at::Tensor> packed_chunks = packed_buf.chunk(3, -1);
+  at::Tensor q_buf = packed_chunks[0].contiguous().reshape({-1});
+  at::Tensor k_buf = packed_chunks[1].contiguous().reshape({-1});
+  at::Tensor v_buf = packed_chunks[2].contiguous().reshape({-1});
+
+  int valid_word_num = get_numel(query) / embedding_dim;

   at::Tensor query_buf = torch::zeros(
       {batch_size, head_num, seq_len, size_per_head}, float_options);
   at::Tensor key_buf = torch::zeros(
       {batch_size, head_num, seq_len, size_per_head}, float_options);
   at::Tensor val_buf = torch::zeros(
       {batch_size, head_num, seq_len, size_per_head}, float_options);
+  at::Tensor attr_out =
+      torch::zeros({valid_word_num, embedding_dim}, float_options);
+
+  std::vector<at::Tensor> bias_chunks = attr_bias.chunk(3);
+  at::Tensor attr_bias_Q = bias_chunks[0];
+  at::Tensor attr_bias_K = bias_chunks[1];
+  at::Tensor attr_bias_V = bias_chunks[2];
+
   nteffectivetransformer::cuda::add_QKV_bias_padding_kernelLauncher<float>(
       q_buf.data_ptr<float>(),
       attr_bias_Q.data_ptr<float>(),
@@ -169,8 +150,6 @@ at::Tensor bt_min_mha(

   auto attn_output = at::matmul(attn_output_weights, val_buf);

-  at::Tensor attr_out =
-      torch::zeros({valid_word_num, embedding_dim}, float_options);
   nteffectivetransformer::cuda::transpose_rm_padding_kernelLauncher<float>(
       attn_output.data_ptr<float>(),
       attr_out.data_ptr<float>(),
@@ -184,7 +163,6 @@ at::Tensor bt_min_mha(
       defaultStream);

   // TODO: Bias is variably sized, need to add support for that.
-  // result = at::addmm(out_proj_bias, attr_out, out_proj_weight.t());
   at::Tensor result = at::matmul(attr_out, out_proj_weight.t());
   result = result.reshape({-1});
   return wrap_buffer(
@@ -195,7 +173,7 @@ at::Tensor bt_min_mha(

 TORCH_LIBRARY_FRAGMENT(nestedtensor, m) {
   m.def(
-      "bt_min_mha(int num_heads, int head_dim, float dropout_p, bool training, Tensor query, Tensor key, Tensor value, Tensor attr_kernel_Q, Tensor attr_kernel_K, Tensor attr_kernel_V, Tensor attr_bias_Q, Tensor attr_bias_K, Tensor attr_bias_V, float scaling, Tensor out_proj_weight, Tensor out_proj_bias) -> Tensor");
+      "bt_min_mha(int num_heads, int head_dim, float dropout_p, bool training, Tensor query, Tensor key, Tensor value, Tensor attr_kernel, Tensor attr_bias, float scaling, Tensor out_proj_weight, Tensor out_proj_bias) -> Tensor");
   m.impl("bt_min_mha", NestedTensorKey, &bt_min_mha);
 }
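The main change to bt_min_mha above is the packed projection: one matmul against the fused [3 * embedding_dim, embedding_dim] kernel followed by a chunk, instead of three separate addmms. A plain-PyTorch sketch of the equivalence (variable names here are illustrative, not the C++ ones; bias addition happens later in the CUDA kernel):

import torch

embed_dim, num_words = 8, 4
x = torch.randn(num_words, embed_dim)
attr_kernel = torch.randn(3 * embed_dim, embed_dim)  # fused Q/K/V weight
attr_bias = torch.randn(3 * embed_dim)               # fused Q/K/V bias

packed = torch.matmul(x, attr_kernel.t())            # [num_words, 3 * embed_dim]
q, k, v = packed.chunk(3, dim=-1)
b_q, b_k, b_v = attr_bias.chunk(3)

# Matches the old per-projection addmm path.
w_q, w_k, w_v = attr_kernel.chunk(3)
assert torch.allclose(q + b_q, torch.addmm(b_q, x, w_q.t()), atol=1e-5)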

nestedtensor/csrc/matmul.cpp

Lines changed: 0 additions & 49 deletions
@@ -49,56 +49,7 @@ Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other) {
       other);
 }

-Tensor NestedTensor_addmm(
-    const Tensor& bias,
-    const Tensor& input,
-    const Tensor& weight,
-    const c10::Scalar& alpha,
-    const c10::Scalar& beta) {
-  if (!is_nested_tensor_impl(bias) && is_nested_tensor_impl(input) &&
-      !is_nested_tensor_impl(weight)) {
-    if (get_is_contiguous(input)) {
-      if (get_dim(bias) == 1 && get_dim(input) == 3 && get_dim(weight) == 2) {
-        auto input_opt_sizes = get_opt_sizes(input);
-        if (input_opt_sizes[2]) {
-          if (*input_opt_sizes[2] == weight.size(1)) {
-            Tensor input_buffer = get_buffer(input);
-            Tensor result_buffer =
-                at::addmm(
-                    bias,
-                    input_buffer.reshape({-1, weight.size(1)}),
-                    weight,
-                    alpha,
-                    beta)
-                    .reshape({-1});
-            int64_t weight_size_1 = weight.size(1);
-            EfficientSizeNode result_nested_size = map_efficient_size(
-                [weight_size_1](int64_t* data_ptr, int64_t size) {
-                  data_ptr[1] = weight_size_1;
-                },
-                get_efficient_nested_size(input));
-            EfficientSizeNode input_nested_stride =
-                get_efficient_nested_stride(input);
-            return wrap_buffer(
-                std::move(result_buffer),
-                result_nested_size,
-                input_nested_stride);
-          }
-        }
-      }
-    }
-  }
-  return map_nested_tensor(
-      [&alpha, &beta](at::Tensor bias, at::Tensor input, at::Tensor weight) {
-        return at::addmm(bias, input, weight, alpha, beta);
-      },
-      bias,
-      input,
-      weight);
-}
-
 TORCH_LIBRARY_IMPL(aten, NestedTensor, m) {
-  nt_impl(m, "addmm", NestedTensor_addmm);
   nt_impl(m, "matmul", NestedTensor_matmul);
 }
 } // namespace at
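
The deleted NestedTensor_addmm carried a fast path worth noting: when the nested tensor is contiguous and every component shares the same inner dimension, the projection can run as a single 2-D addmm over the flat buffer. A rough Python sketch of the idea (the C++ helpers like get_buffer/wrap_buffer are paraphrased here with cat/split):

import torch

weight = torch.randn(6, 4)   # out_features x in_features
bias = torch.randn(6)
components = [torch.randn(3, 4), torch.randn(5, 4)]  # ragged leading dims

# One addmm over the packed buffer instead of one per component.
flat = torch.cat([t.reshape(-1, 4) for t in components])
out = torch.addmm(bias, flat, weight.t())

# Split the result back along the ragged dimension.
outs = out.split([t.size(0) for t in components])
assert torch.allclose(outs[0], torch.addmm(bias, components[0], weight.t()))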

nestedtensor/csrc/mha.cpp

Lines changed: 12 additions & 10 deletions
@@ -39,21 +39,22 @@ at::Tensor min_mha(
   int64_t edim = *(opt_sizes[2]);

   at::Tensor q, k, v;
-  q = at::addmm(
-      at::slice(*in_proj_bias, 0, 0, edim).contiguous(),
+  q = at::matmul(
       query,
-      at::slice(in_proj_weight, 0, 0, edim).t().contiguous(),
-      scaling,
-      scaling);
-  k = at::addmm(
-      at::slice(*in_proj_bias, 0, edim, 2 * edim).contiguous(),
+      at::slice(in_proj_weight, 0, 0, edim).t().contiguous());
+  k = at::matmul(
       key,
       at::slice(in_proj_weight, 0, edim, 2 * edim).t().contiguous());
-  v = at::addmm(
-      at::slice(*in_proj_bias, 0, 2 * edim).contiguous(),
+  v = at::matmul(
       value,
       at::slice(in_proj_weight, 0, 2 * edim).t().contiguous());

+  q = q + at::slice(*in_proj_bias, 0, 0, edim).contiguous();
+  k = k + at::slice(*in_proj_bias, 0, edim, 2 * edim).contiguous();
+  v = v + at::slice(*in_proj_bias, 0, 2 * edim).contiguous();
+
+  q = q * torch::tensor(scaling);
+
   q = q.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2);
   k = k.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2);
   v = v.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2);
@@ -62,7 +63,8 @@ at::Tensor min_mha(
   attn_output_weights = at::dropout(attn_output_weights, dropout_p, training);
   auto attn_output = at::matmul(attn_output_weights, v);
   attn_output = attn_output.transpose(1, 2).reshape({-1, -1, edim});
-  attn_output = at::addmm(out_proj_bias, attn_output, out_proj_weight.t());
+  attn_output = at::matmul(attn_output, out_proj_weight.t());
+  attn_output = attn_output + out_proj_bias;
   return attn_output;
 }

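Since min_mha previously folded both the bias add and the query scaling into a single addmm (alpha and beta both set to scaling), the rewrite above preserves the math: scaling * (x @ W.t() + b) equals addmm(b, x, W.t(), beta=scaling, alpha=scaling). A quick sketch of that identity:

import torch

x, w, b = torch.randn(4, 8), torch.randn(8, 8), torch.randn(8)
scaling = 8 ** -0.5

old = torch.addmm(b, x, w.t(), beta=scaling, alpha=scaling)
new = (torch.matmul(x, w.t()) + b) * scaling
assert torch.allclose(old, new, atol=1e-6)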
nestedtensor/nn/mha.py

Lines changed: 2 additions & 23 deletions
@@ -11,21 +11,6 @@
 # NT case query, key, value have nested_dim 1 and are of shape (bsz, tgt_len, embed_dim)


-def sequence_mask(lengths, max_len=None, is_2d=True):
-    batch_size = lengths.numel()
-    max_len = max_len or lengths.max()
-    mask = (torch.arange(0, max_len, device=lengths.device)
-            .type_as(lengths)
-            .repeat(batch_size, 1)
-            .lt(lengths.unsqueeze(1)))
-    if is_2d:
-        return mask
-    else:
-        mask = mask.view(-1, 1, 1, max_len)
-        m2 = mask.transpose(2, 3)
-        return mask * m2
-
-
 def multi_head_attention_forward(query,
                                  key,
                                  value,
@@ -77,21 +62,15 @@ def multi_head_attention_forward(query,
     scaling = float(head_dim) ** -0.5

     if query is key and key is value and in_proj_weight.is_cuda:
-        w_q, w_k, w_v = in_proj_weight.chunk(3)
-        b_q, b_k, b_v = in_proj_bias.chunk(3)
         return torch.ops.nestedtensor.bt_min_mha(num_heads,
                                                  head_dim,
                                                  0.5,
                                                  False,
                                                  query,
                                                  query,
                                                  query,
-                                                 w_q.contiguous(),
-                                                 w_k.contiguous(),
-                                                 w_v.contiguous(),
-                                                 b_q.contiguous(),
-                                                 b_k.contiguous(),
-                                                 b_v.contiguous(),
+                                                 in_proj_weight,
+                                                 in_proj_bias,
                                                  scaling,
                                                  out_proj_weight,
                                                  in_proj_bias), None
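
The removed sequence_mask helper (and its C++ twin _sequence_mask, removed above) built a [batch, 1, seq_len, seq_len] attention mask as the outer product of a per-position validity mask with itself, which is the same quantity the CUDA path now derives directly from input_mask. A sketch of that mask algebra:

import torch

lengths = torch.tensor([3, 5])
max_len = int(lengths.max())
valid = (torch.arange(max_len)
         .repeat(lengths.numel(), 1)
         .lt(lengths.unsqueeze(1)))           # [batch, seq_len], True if in-bounds
m = valid.view(-1, 1, 1, max_len).float()
attn_mask = m * m.transpose(2, 3)             # 1.0 where query and key both valid
print(attn_mask[0, 0])                        # 3x3 block of ones, zero elsewhere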

nestedtensor/version.py

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
-__version__ = '0.1.4+bf12d17'
-git_version = 'bf12d17c3b7891c713cb16b7e36926b873813ceb'
+__version__ = '0.1.4+e1d384f'
+git_version = 'e1d384fea9d70a664b38a53768f82c81057a7d13'
 from nestedtensor import _C
 if hasattr(_C, 'CUDA_VERSION'):
     cuda = _C.CUDA_VERSION
