This repository was archived by the owner on Nov 15, 2022. It is now read-only.

Commit ea4c8fa

20210525 nestedtensor import

cpuhrsch authored and facebook-github-bot committed

Reviewed By: datumbox
Differential Revision: D28679810
fbshipit-source-id: 60790f532183d3cfd05ab23158bcfa0c044c62b7
1 parent 744434a commit ea4c8fa

File tree

12 files changed: +371, -85 lines

benchmarks/mha.py

Lines changed: 7 additions, 16 deletions

@@ -53,7 +53,7 @@ def from_tensor_list(cls, tensor_list):
 MODEL = torch.nn.MultiheadAttention(NDIM, NHEAD).to(DEVICE).eval()
 
 
-def run_benchmark(bsz, mean_i, mean_j, var, autograd, writer):
+def run_benchmark(bsz, mean_i, mean_j, var, writer):
     RAND_INTS = [(int(random.gauss(mean_j, var)), int(
         random.gauss(mean_i, var))) for _ in range(bsz)]
     src_ = nestedtensor.nested_tensor(
@@ -70,39 +70,31 @@ def gen_t_loop_mha(src):
         src, mask = detr_nt_src.decompose()
         src = src.flatten(2).permute(2, 0, 1).contiguous()
         mask = mask.flatten(1).contiguous()
-        if autograd:
-            src.requires_grad_()
 
         def te():
-            if autograd:
-                MODEL(src, src, src, key_padding_mask=mask,
-                      need_weights=False)[0].sum()  # .backward()
             MODEL(src, src, src, key_padding_mask=mask,
                   need_weights=False)
 
         return te
 
     def gen_nt_mha(src):
         src = nestedtensor.nested_tensor([t.flatten(1).permute(
-            1, 0) for t in src], device=DEVICE, dtype=torch.float, requires_grad=False)
+            1, 0) for t in src], device=DEVICE, dtype=torch.float)
 
         def nt():
-            if autograd:
-                MODEL(src, src, src, need_weights=False)[
-                    0].sum()  # .backward()
             MODEL(src, src, src, need_weights=False)
 
         return nt
 
     result_t = {**utils.benchmark_fn(gen_t_loop_mha(src), 5.0, cuda=True), "bsz": bsz,
-                "sparsity": sparsity, "autograd": autograd, "var": var, "mean_i": mean_i, "mean_j": mean_j}
+                "sparsity": sparsity, "var": var, "mean_i": mean_i, "mean_j": mean_j}
     result_t["numel"] = sum([x.numel() for x in src_])
     result_t["numel_div_avg_us"] = result_t["numel"] / result_t["avg_us"]
     result_t["avg_ns_div_numel"] = result_t["avg_us"] / \
         result_t["numel"] * 1000
     writer.writerow(result_t)
     result_nt = {**utils.benchmark_fn(gen_nt_mha(src), 5.0, cuda=True),
-                 "bsz": bsz, "sparsity": 0.0, "autograd": autograd, "var": var, "mean_i": mean_i, "mean_j": mean_j}
+                 "bsz": bsz, "sparsity": 0.0, "var": var, "mean_i": mean_i, "mean_j": mean_j}
     result_nt["numel"] = sum([x.numel() for x in src_])
    result_nt["numel_div_avg_us"] = result_nt["numel"] / result_nt["avg_us"]
     result_nt["avg_ns_div_numel"] = result_nt["avg_us"] / \
@@ -115,10 +107,9 @@ def nt():
 torch.manual_seed(1011)
 writer = csv.DictWriter(sys.stdout, fieldnames=[
     "name", "avg_us", "std_us", "runs", "bsz", "sparsity",
-    "autograd", "var", "mean_i", "mean_j", "numel", "numel_div_avg_us",
+    "var", "mean_i", "mean_j", "numel", "numel_div_avg_us",
     "avg_ns_div_numel"])
 writer.writeheader()
 for var in [float(i) / 10 for i in range(0, 100, 50)]:
-    for autograd in [False]:
-        for batch_size in [2, 8, 16]:
-            run_benchmark(batch_size, 30, 30, var, autograd, writer)
+    for batch_size in [2, 8, 16]:
+        run_benchmark(batch_size, 30, 30, var, writer)
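
For orientation, a minimal runnable sketch (not part of this commit; lengths and dims are illustrative) of the padded path that gen_t_loop_mha times: variable-length sequences are packed into one dense (T, B, E) tensor plus a boolean key_padding_mask, which is what MODEL(src, src, src, key_padding_mask=mask, need_weights=False) consumes above.

import torch

lengths = [3, 5, 2]                    # hypothetical per-sequence lengths
embed_dim, nhead = 8, 2
seqs = [torch.rand(t, embed_dim) for t in lengths]

# Pad into a single (T, B, E) tensor and record which slots are padding.
max_t = max(lengths)
src = torch.zeros(max_t, len(seqs), embed_dim)
key_padding_mask = torch.ones(len(seqs), max_t, dtype=torch.bool)
for b, s in enumerate(seqs):
    src[: s.size(0), b] = s
    key_padding_mask[b, : s.size(0)] = False   # False marks real tokens

mha = torch.nn.MultiheadAttention(embed_dim, nhead).eval()
with torch.inference_mode():
    out, _ = mha(src, src, src, key_padding_mask=key_padding_mask,
                 need_weights=False)
print(out.shape)  # torch.Size([5, 3, 8])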

benchmarks/mha_cuda.py

Lines changed: 65 additions, 0 deletions

@@ -0,0 +1,65 @@
+import torch
+import time
+import nestedtensor
+
+
+@torch.inference_mode()
+def benchmark_torch_function(iters, f, *args):
+    f(*args)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record()
+    else:
+        t0 = time.time()
+    for _ in range(iters):
+        f(*args)
+    if torch.cuda.is_available():
+        end_event.record()
+        torch.cuda.synchronize()
+        return start_event.elapsed_time(end_event) * 1e3
+    else:
+        return (time.time() - t0) * 1e6
+
+
+def run(bdim, embedding_dim, nhead, min_t, max_t, iters, device):
+    import random
+    random.seed(1010)
+
+    # The following is meant to emulate the lengths of randomly sampled tokenized sentences
+    lengths = [random.randint(min_t, max_t) for _ in range(bdim)]
+    lengths_mean = torch.tensor(lengths, dtype=torch.float).mean().item()
+    lengths_std = torch.tensor(lengths, dtype=torch.float).std().item()
+
+    # List of sentence embeddings
+    tensors = [torch.rand(i, embedding_dim) for i in lengths]
+    # Create packed NestedTensor
+    nt = nestedtensor.nested_tensor(tensors, device=device, dtype=torch.float)
+
+    # Create MHA with self-attention in mind
+    lin = torch.nn.MultiheadAttention(embedding_dim, nhead).to(device).eval()
+    nt_time = benchmark_torch_function(iters, lin, nt, nt, nt)
+
+    # Create a regular padded Tensor
+    data = nt.to_padded_tensor(padding=0)
+    # Amount of storage used for padding only
+    percentage_padded = 100 * (data.numel() - nt.numel()) / data.numel()
+    t_time = benchmark_torch_function(iters, lin, data, data, data)
+
+    print(f"batch size: {bdim:4.0f}, embedding dim: {embedding_dim}, nhead: {nhead}, T mean:{lengths_mean:5.0f}, T std: {lengths_std:4.0f}", end='')
+    print(f", padding: {percentage_padded:3.0f}%, NT: {nt_time/iters:4.0f}us, T: {t_time/iters:4.0f}us, Speedup: {t_time/nt_time:3.2f}x")
+
+
+device = torch.device('cpu')
+if torch.cuda.is_available():
+    print("CUDA device: ", torch.cuda.get_device_name(0))
+    device = torch.device('cuda')
+iters = 1000
+for nhead in [2, 4, 8]:
+    print("")
+    for embed_dim in [128, 256, 512, 1024]:
+        print("")
+        for min_t, max_t in [(16, 128), (32, 128), (64, 128), (128, 128)]:
+            run(256, embed_dim, nhead, min_t, max_t, iters, device)
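
One detail worth noting about benchmark_torch_function above: torch.cuda.Event.elapsed_time reports milliseconds, so the * 1e3 yields microseconds, consistent with the CPU branch's (time.time() - t0) * 1e6. A standalone sketch of the same timing pattern:

import time
import torch

if torch.cuda.is_available():
    a = torch.rand(1024, 1024, device="cuda")
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    a @ a
    end.record()
    torch.cuda.synchronize()                         # wait for the recorded events
    print(f"{start.elapsed_time(end) * 1e3:.0f}us")  # elapsed_time is in ms
else:
    a = torch.rand(1024, 1024)
    t0 = time.time()
    a @ a
    print(f"{(time.time() - t0) * 1e6:.0f}us")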

benchmarks/utils.py

Lines changed: 3 additions, 4 deletions

@@ -1,7 +1,5 @@
-from nestedtensor import torch
+import torch
 import time
-import random
-import pprint
 
 import cProfile
 import pstats
@@ -18,7 +16,8 @@ def gen_tensor():
     # return torch.tensor([globals()['SEED']])
     return torch.rand(EMBED_DIM)
 
-def benchmark_fn(fn, run_time = 5.0, use_cprofile=False, warmup=1.0, cuda=False):
+
+def benchmark_fn(fn, run_time=5.0, use_cprofile=False, warmup=1.0, cuda=False):
     times = []
     t = 0.0
     pr = cProfile.Profile()
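
For reference, benchmark_fn is consumed in benchmarks/mha.py as utils.benchmark_fn(gen_t_loop_mha(src), 5.0, cuda=True), and returns a stats dict whose keys include "avg_us", "std_us", and "runs". A hedged usage sketch (the import assumes benchmarks/ is the working directory; the workload is arbitrary):

import torch
import utils  # assumption: run from the benchmarks/ directory


def fn():
    torch.rand(256, 256) @ torch.rand(256, 256)


stats = utils.benchmark_fn(fn, run_time=1.0, cuda=False)
print(stats["avg_us"], stats["std_us"], stats["runs"])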

nestedtensor/csrc/cuda/mha.cpp

Lines changed: 26 additions, 6 deletions

@@ -19,12 +19,23 @@ using namespace at;
 namespace torch {
 namespace nested_tensor {
 
+at::Tensor _sequence_mask(at::Tensor lengths) {
+  int64_t batch_size = lengths.numel();
+  int64_t max_len = lengths.max().item<int64_t>();
+  at::Tensor mask = torch::arange(0, max_len, torch::kFloat);
+  mask = mask.repeat({batch_size, 1});
+  mask = mask.lt(lengths.unsqueeze(1));
+  mask = mask.to(torch::kCUDA);
+  mask = mask.view({-1, 1, 1, max_len});
+  at::Tensor m2 = mask.transpose(2, 3);
+  return mask * m2;
+}
+
 at::Tensor bt_min_mha(
     int64_t num_heads,
     int64_t head_dim,
     double dropout_p,
     bool training,
-    at::Tensor input_mask,
     at::Tensor query,
     at::Tensor key,
     at::Tensor value,
@@ -36,8 +47,7 @@ at::Tensor bt_min_mha(
     at::Tensor attr_bias_V,
     double scaling,
     at::Tensor out_proj_weight,
-    at::Tensor out_proj_bias,
-    at::Tensor attr_mask) {
+    at::Tensor out_proj_bias) {
   // TODO: Assert that max seq_len is 1024!
   TORCH_CHECK(get_dim(query) == 3, "query needs to be 3 dim.");
   TORCH_CHECK(get_dim(key) == 3, "key needs to be 3 dim.");
@@ -49,15 +59,17 @@ at::Tensor bt_min_mha(
   // }
   // TODO: Add explicit check that verifies query, key and value are the same
   // auto start = std::chrono::system_clock::now();
+  auto options =
+      torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
+  at::Tensor input_mask = to_mask(query, 2);
+  input_mask = input_mask.to(options);
   int64_t batch_size = input_mask.size(0);
   int64_t seq_len = input_mask.size(1);
   int64_t embedding_dim = head_dim * num_heads; //*(opt_sizes[2]);
   int64_t head_num = num_heads;
   int64_t size_per_head = embedding_dim / head_num;
   auto float_options =
       torch::TensorOptions().dtype(torch::kFloat).device(torch::kCUDA);
-  auto options =
-      torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
   at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream();
   at::cuda::setCurrentCUDAStream(defaultStream);
 
@@ -74,6 +86,14 @@ at::Tensor bt_min_mha(
 
   at::Tensor tmp = get_buffer(query);
 
+  auto query_esize = get_efficient_nested_size(query);
+  TORCH_CHECK(query_esize.height() == 1, "Query nested dim isn't 1.");
+  auto query_esize_sizes = query_esize.sizes();
+
+  at::Tensor attr_mask = _sequence_mask(
+      at::native::select(query_esize_sizes, 1, 0).contiguous());
+  attr_mask = attr_mask.to(float_options);
+
   nteffectivetransformer::exclusiveScan_kernelLauncher(
       prefix_sum_ptr,
       input_mask.data_ptr<int>(),
@@ -175,7 +195,7 @@ at::Tensor bt_min_mha(
 
 TORCH_LIBRARY_FRAGMENT(nestedtensor, m) {
   m.def(
-      "bt_min_mha(int num_heads, int head_dim, float dropout_p, bool training, Tensor input_mask, Tensor query, Tensor key, Tensor value, Tensor attr_kernel_Q, Tensor attr_kernel_K, Tensor attr_kernel_V, Tensor attr_bias_Q, Tensor attr_bias_K, Tensor attr_bias_V, float scaling, Tensor out_proj_weight, Tensor out_proj_bias, Tensor attr_mask) -> Tensor");
+      "bt_min_mha(int num_heads, int head_dim, float dropout_p, bool training, Tensor query, Tensor key, Tensor value, Tensor attr_kernel_Q, Tensor attr_kernel_K, Tensor attr_kernel_V, Tensor attr_bias_Q, Tensor attr_bias_K, Tensor attr_bias_V, float scaling, Tensor out_proj_weight, Tensor out_proj_bias) -> Tensor");
   m.impl("bt_min_mha", NestedTensorKey, &bt_min_mha);
 }
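
To make the new _sequence_mask helper concrete, here is an equivalent rendering in Python (exposition only; the .to(torch::kCUDA) step is omitted): the per-sequence lengths become a [batch, 1, 1, max_len] validity mask, and multiplying that mask by its own transpose produces the square [batch, 1, max_len, max_len] attention mask that replaces the removed attr_mask argument.

import torch

def sequence_mask(lengths: torch.Tensor) -> torch.Tensor:
    batch_size = lengths.numel()
    max_len = int(lengths.max())
    mask = torch.arange(0, max_len, dtype=torch.float)  # [max_len]
    mask = mask.repeat(batch_size, 1)                   # [batch, max_len]
    mask = mask.lt(lengths.unsqueeze(1))                # True where pos < length
    mask = mask.view(-1, 1, 1, max_len)                 # [batch, 1, 1, max_len]
    m2 = mask.transpose(2, 3)                           # [batch, 1, max_len, 1]
    return mask * m2                                    # [batch, 1, max_len, max_len]

# Entry [b, 0, i, j] is True iff both i and j are real positions of sequence b.
print(sequence_mask(torch.tensor([2, 3])))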

nestedtensor/csrc/masking.cpp

Lines changed: 138 additions, 2 deletions

@@ -79,7 +79,20 @@ std::vector<int64_t> _get_max_size(const SizeNode& size_node) {
   return result;
 }
 
-std::vector<int64_t> get_max_size(Tensor nt) {
+std::vector<int64_t> get_max_size(const Tensor& nt) {
+  if (get_nested_dim(nt) == 1) {
+    auto nt_opt_sizes = get_opt_sizes(nt);
+    if (nt_opt_sizes.size() > 0 && *nt_opt_sizes[0] > 0) {
+      auto esize = get_efficient_nested_size(nt);
+      auto sizes = esize.sizes();
+      auto max_sizes = std::get<0>(sizes.max(0));
+      std::vector<int64_t> result;
+      for (int64_t i = 0; i < max_sizes.size(0); i++) {
+        result.push_back(max_sizes[i].item<int64_t>());
+      }
+      return result;
+    }
+  }
   return _get_max_size(get_nested_size(nt));
 }
 
@@ -203,7 +216,6 @@ std::tuple<Tensor, Tensor> to_tensor_mask(
   auto nt_opt_size = get_opt_sizes(nt);
   Tensor nt_buffer = get_buffer(nt);
   if (nt_opt_size[2] && nt_buffer.is_cuda()) {
-    std::cout << "Calling efficient to_tensor_mask" << std::endl;
     Tensor nt_sizes_ =
         get_efficient_nested_size(nt).sizes().to(torch::kInt32);
     TORCH_CHECK(nt_sizes_.dim() == 2, "NestedTensor must be of nested_dim 2.")
@@ -258,6 +270,127 @@ std::tuple<Tensor, Tensor> to_tensor_mask(
   return merge_tensor_mask(res_tensor, res_mask, mask_dim);
 }
 
+Tensor merge_mask(
+    Tensor mask,
+    c10::optional<int64_t> mask_dim) {
+  if (mask_dim && get_dim(mask) == (*mask_dim)) {
+    return mask;
+  }
+
+  if (get_dim(mask) == 0) {
+    return mask;
+  }
+
+  int64_t last_size = mask.size(-1);
+  Tensor collapsed_mask = mask.sum(-1);
+  Tensor is_last_size = (collapsed_mask == last_size);
+  Tensor is_zero = (collapsed_mask == 0);
+  int64_t is_last_size_sum = is_last_size.sum().item<int64_t>();
+  int64_t is_zero_sum = is_zero.sum().item<int64_t>();
+  if ((is_last_size_sum + is_zero_sum) == get_numel(collapsed_mask)) {
+    collapsed_mask = collapsed_mask.to(torch::kBool);
+    return merge_mask(collapsed_mask, mask_dim);
+  }
+
+  if (mask_dim && mask_dim != get_dim(mask)) {
+    throw std::runtime_error(
+        "Mask dimension is too small to represent data tensor.");
+  }
+  // This is expected to be a no-op, except in rare cases.
+  mask = mask.contiguous();
+  return mask;
+}
+
+Tensor _create_nt_mask(std::vector<int64_t> sizes, std::vector<int64_t> shape) {
+  int64_t numel = 1;
+  for (size_t i = 0; i < sizes.size(); i++) {
+    numel = numel * sizes[i];
+  }
+  TORCH_CHECK(numel > 0, "Empty tensors are not yet supported.");
+  // Don't pad in case of a scalar
+  if (sizes.size() == 0) {
+    return torch::tensor(true);
+  }
+  auto options = torch::TensorOptions().dtype(torch::kByte);
+  Tensor mask = pad_tensor_to_shape(
+      torch::full(
+          IntArrayRef(sizes),
+          true,
+          options),
+      shape);
+  return mask;
+}
+
+Tensor _create_nt_mask(SizeNode nt_size, std::vector<int64_t> shape) {
+  if (nt_size.degree() == 0) {
+    return _create_nt_mask(nt_size.payload(), shape);
+  }
+
+  std::vector<Tensor> res_mask;
+  if (nt_size.degree() == 0) {
+    return torch::tensor({false}, torch::kByte);
+  } else {
+    for (auto child : nt_size.unbind()) {
+      Tensor mask = _create_nt_mask(child, shape);
+      res_mask.push_back(mask);
+    }
+  }
+
+  return at::stack(res_mask);
+}
+
+Tensor _create_nt_mask(EfficientSizeNode nt_size, std::vector<int64_t> shape) {
+  if (nt_size.height() == 1) {
+    std::vector<at::Tensor> tmp_masks;
+    auto esizes = nt_size.sizes();
+    int64_t* esizes_ptr = esizes.data_ptr<int64_t>();
+    for (int64_t i = 0; i < esizes.size(0); i++) {
+      std::vector<int64_t> tmp_sizes;
+      for (size_t j = 0; j < shape.size(); j++) {
+        tmp_sizes.push_back(esizes_ptr[i * esizes.stride(0) + j]);
+      }
+      tmp_masks.push_back(_create_nt_mask(tmp_sizes, shape));
+    }
+    return at::stack(tmp_masks);
+  }
+  return _create_nt_mask(nt_size.to_size_node(), shape);
+}
+
+Tensor to_mask(
+    Tensor nt,
+    c10::optional<int64_t> mask_dim) {
+  TORCH_CHECK(
+      !mask_dim || *mask_dim <= get_dim(nt),
+      "Requested mask dimension ",
+      *mask_dim,
+      " is bigger than dimension ",
+      get_dim(nt),
+      " of given NestedTensor.");
+
+  auto opt_sizes = get_opt_sizes(nt);
+  if (opt_sizes.size() == 1 && *opt_sizes[0] == 1) {
+    Tensor result_mask = !mask_dim || *mask_dim == 0 ? torch::tensor(true)
                                                      : torch::tensor({true});
+    return result_mask;
+  }
+
+  std::vector<int64_t> max_size;
+  if (get_nested_dim(nt) == 1 &&
+      get_dim(nt) > 1 &&
+      mask_dim &&
+      *mask_dim > 1) {
+    auto tmp_max_size = get_max_size(nt);
+    for (int64_t i = 1; i < *mask_dim; i++) {
+      max_size.push_back(tmp_max_size[i - 1]);
+    }
+    return _create_nt_mask(get_efficient_nested_size(nt), max_size);
+  }
+  max_size = get_max_size(nt);
+  at::Tensor res_mask = _create_nt_mask(get_efficient_nested_size(nt), max_size);
+  return merge_mask(res_mask, mask_dim);
+}
+
 Tensor to_padded_tensor(Tensor nt, double padding) {
 #ifdef WITH_CUDA
   if (get_dim(nt) == 3 && get_is_contiguous(nt)) {
@@ -315,6 +448,9 @@ TORCH_LIBRARY_FRAGMENT(nestedtensor, m) {
   m.def("to_tensor_mask(Tensor nt, int? mask_dim) -> (Tensor, Tensor)");
   m.impl("to_tensor_mask", NestedTensorKey, to_tensor_mask);
 
+  m.def("to_mask(Tensor nt, int? mask_dim) -> Tensor");
+  m.impl("to_mask", NestedTensorKey, to_mask);
+
   m.def("to_padded_tensor(Tensor nt, float padding) -> Tensor");
   m.impl("to_padded_tensor", NestedTensorKey, to_padded_tensor);
 }
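
Finally, the semantics of the new to_mask/merge_mask pair, sketched in Python (illustrative only, not the registered op): to_mask builds a boolean mask over the padded shape with True marking real entries, and merge_mask repeatedly collapses the trailing dimension as long as every row is uniformly all-True or all-False, returning the smallest mask that still represents the data (or exactly mask_dim dimensions when requested).

import torch

def to_mask_dense(sizes, padded_shape):
    # One boolean mask per NestedTensor component: True inside the
    # component's extent, False in the padding.
    masks = []
    for s in sizes:
        m = torch.zeros(*padded_shape, dtype=torch.bool)
        m[tuple(slice(0, d) for d in s)] = True
        masks.append(m)
    return torch.stack(masks)

def merge_mask(mask, mask_dim=None):
    if mask_dim is not None and mask.dim() == mask_dim:
        return mask
    if mask.dim() == 0:
        return mask
    collapsed = mask.sum(-1)
    last = mask.size(-1)
    # Collapse only if each row is entirely valid or entirely padding.
    if ((collapsed == last) | (collapsed == 0)).all():
        return merge_mask(collapsed.bool(), mask_dim)
    if mask_dim is not None and mask_dim != mask.dim():
        raise RuntimeError("Mask dimension is too small to represent data tensor.")
    return mask.contiguous()

full = to_mask_dense([(2, 4), (3, 4)], (3, 4))  # every row spans all 4 columns
print(merge_mask(full, mask_dim=2).shape)       # torch.Size([2, 3])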
