This repository was archived by the owner on Nov 15, 2022. It is now read-only.

Commit 744434a

cpuhrsch authored and facebook-github-bot committed

20210522 nestedtensor import

Reviewed By: astaff
Differential Revision: D28624864
fbshipit-source-id: 441ad5b8af01918903d68818cfb90824b9467e37

1 parent: fcf06d2

18 files changed: +264 −174 lines

benchmarks/embedding.py
Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
+import torch
+import time
+import nestedtensor
+
+
+@torch.inference_mode()
+def benchmark_torch_function(iters, f, *args):
+    f(*args)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record()
+    else:
+        t0 = time.time()
+    for _ in range(iters):
+        f(*args)
+    if torch.cuda.is_available():
+        end_event.record()
+        torch.cuda.synchronize()
+        return start_event.elapsed_time(end_event) * 1e3
+    else:
+        return (time.time() - t0) * 1e6
+
+
+def run(bdim, embedding_dim, vocab_size, min_t, max_t, iters, device):
+    import random
+    random.seed(1010)
+
+    # The following is meant to emulate the lengths of randomly sampled tokenized sentences
+    lengths = [random.randint(min_t, max_t) for _ in range(bdim)]
+    lengths_mean = torch.tensor(lengths, dtype=torch.float).mean().item()
+    lengths_std = torch.tensor(lengths, dtype=torch.float).std().item()
+
+    # List of sentence embeddings
+    tensors = [torch.tensor(random.randint(1, vocab_size)) for i in lengths]
+    # Create packed NestedTensor
+    nt = nestedtensor.nested_tensor(tensors, device=device, dtype=torch.int64)
+    # Created regular padded Tensor
+    data, _ = nt.to_tensor_mask()
+    data = data.to(torch.int64)
+    # Amount of storage used for padding only
+    percentage_padded = 100 * (data.numel() - nt.numel()) / data.numel()
+
+    # Projects embeddings into another space
+    lin = torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=0).to(device)
+    nt_time = benchmark_torch_function(iters, lin, nt)
+    t_time = benchmark_torch_function(iters, lin, data)
+
+    print(f"batch size: {bdim:4.0f}, embedding dim: {embedding_dim}, vocab_size: {vocab_size}, T mean:{lengths_mean:5.0f}, T std: {lengths_std:4.0f}", end='')
+    print(f", padding: {percentage_padded:3.0f}%, NT: {nt_time/iters:4.0f}us, T: {t_time/iters:4.0f}us, Speedup: {t_time/nt_time:3.2f}x")
+
+
+device = torch.device('cpu')
+if torch.cuda.is_available():
+    print("CUDA device: ", torch.cuda.get_device_name(0))
+    device = torch.device('cuda')
+iters = 100
+for vocab_size in [65536, 32768, 16384, 8192, 4096]:
+    print("")
+    for embed_dim in [4096, 2048, 1024, 512, 256]:
+        print("")
+        for min_t, max_t in [(16, 128), (32, 128), (64, 128), (128, 128)]:
+            run(256, embed_dim, vocab_size, min_t, max_t, iters, device)
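To make the padded baseline used above concrete, here is a minimal sketch (not part of the commit) of what nestedtensor.nested_tensor followed by to_tensor_mask is expected to produce for a toy batch; the sequence contents and lengths are made up for illustration:

import torch
import nestedtensor

# Two "sentences" of different lengths, kept as a list of tensors.
seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
nt = nestedtensor.nested_tensor(seqs, dtype=torch.int64)   # packed, no padding stored
data, mask = nt.to_tensor_mask()                           # padded [2, 3] tensor plus a mask
# data is expected to be [[1, 2, 3], [4, 5, 0]]; mask marks which entries are real.
padding_waste = 100 * (data.numel() - nt.numel()) / data.numel()  # same metric the benchmark prints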

nestedtensor/csrc/autograd_functions.cpp
Lines changed: 4 additions & 4 deletions

@@ -92,21 +92,21 @@ Tensor NestedTensor_batch_norm(
   int64_t n_input = *opt_sizes[1];
   if (running_mean) {
     check_dims_match_num_input_features(
-        "running_mean", n_input, running_mean->numel());
+        "running_mean", n_input, get_numel(*running_mean));
   } else if (!training) {
     AT_ERROR("running_mean must be defined in evaluation mode");
   }
   if (running_var) {
     check_dims_match_num_input_features(
-        "running_var", n_input, running_var->numel());
+        "running_var", n_input, get_numel(*running_var));
   } else if (!training) {
     AT_ERROR("running_var must be defined in evaluation mode");
   }
   if (weight) {
-    check_dims_match_num_input_features("weight", n_input, weight->numel());
+    check_dims_match_num_input_features("weight", n_input, get_numel(*weight));
   }
   if (bias) {
-    check_dims_match_num_input_features("bias", n_input, bias->numel());
+    check_dims_match_num_input_features("bias", n_input, get_numel(*bias));
   }

   auto scalar_shape = make_scalar_shape(get_dim(input), n_input);

nestedtensor/csrc/cuda/padding.cu
Lines changed: 59 additions & 1 deletion

@@ -26,7 +26,7 @@ void add_padding(
 template<typename T>
 void add_padding_kernelLauncher(
     T* input, // [batch_size x None]
-    T* output, // [batch_size x max(input.nested_size(1))]
+    T* output, // [batch_size x max(input.nested_size(1)) x inner_size]
     const int* offsets, // [batch_size]
     const int batch_size,
     const int output_stride,
@@ -53,5 +53,63 @@ template void add_padding_kernelLauncher<float>(
     const int output_stride,
     const int inner_size,
     const cudaStream_t stream);
+
+template<typename T>
+__global__
+void add_padding_mask(
+    const T* input,
+    T* output,
+    int* output_mask,
+    const int* offsets,
+    const int batch_size,
+    const int mask_stride,
+    const int output_stride,
+    const int inner_size)
+{
+  const int batch_id = blockIdx.x;
+  for (int i = 0; i < (offsets[batch_id + 1] - offsets[batch_id]); i++) {
+    output_mask[batch_id * mask_stride + i] = 1;
+  }
+  for (int i = 0; i < (offsets[batch_id + 1] - offsets[batch_id]) * inner_size; i++) {
+    output[batch_id * output_stride + i] = input[offsets[batch_id] * inner_size + i];
+  }
+}
+
+template<typename T>
+void add_padding_mask_kernelLauncher(
+    T* input, // [batch_size x None]
+    T* output, // [batch_size x max(input.nested_size(1)) x inner_size]
+    int* output_mask, // [batch_size x max(input.nested_size(1))]
+    const int* offsets, // [batch_size]
+    const int batch_size,
+    const int mask_stride,
+    const int output_stride,
+    const int inner_size,
+    const cudaStream_t stream)
+{
+  dim3 grid;
+  grid.x = batch_size;
+
+  add_padding_mask<float><<<grid, 1, 0, stream>>>(
+      input,
+      output,
+      output_mask,
+      offsets,
+      batch_size,
+      mask_stride,
+      output_stride,
+      inner_size);
+}
+
+template void add_padding_mask_kernelLauncher<float>(
+    float* input,
+    float* output,
+    int* output_mask,
+    const int* offsets,
+    const int batch_size,
+    const int mask_stride,
+    const int output_stride,
+    const int inner_size,
+    const cudaStream_t stream);
 }
 }
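The new add_padding_mask kernel copies each packed sequence into a zero-initialized padded buffer and marks its valid positions in an integer mask. As a rough host-side reference (not part of the commit; the function name and plain-Python types are only for illustration), one block's work for batch entry b corresponds to:

def add_padding_mask_reference(input_flat, offsets, batch_size,
                               mask_stride, output_stride, inner_size):
    # offsets is an exclusive prefix sum of per-sequence lengths, so
    # offsets[b + 1] - offsets[b] is the length of sequence b.
    output = [0.0] * (batch_size * output_stride)
    output_mask = [0] * (batch_size * mask_stride)
    for b in range(batch_size):
        length = offsets[b + 1] - offsets[b]
        for i in range(length):
            output_mask[b * mask_stride + i] = 1  # mark real (non-padding) positions
        for i in range(length * inner_size):
            output[b * output_stride + i] = input_flat[offsets[b] * inner_size + i]
    return output, output_mask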

nestedtensor/csrc/cuda/padding.h
Lines changed: 13 additions & 0 deletions

@@ -15,5 +15,18 @@ void add_padding_kernelLauncher(
     const int output_stride,
     const int inner_size,
     const cudaStream_t stream);
+
+template <typename T>
+void add_padding_mask_kernelLauncher(
+    T* input,
+    T* output,
+    int* output_mask,
+    const int* lengths,
+    const int batch_size,
+    const int mask_stride,
+    const int output_stride,
+    const int inner_size,
+    const cudaStream_t stream);
+
 }
 } // namespace nested_tensor

nestedtensor/csrc/functions.cpp
Lines changed: 6 additions & 3 deletions

@@ -26,9 +26,12 @@ Tensor NestedTensor_embedding(
         weight,
         indices);
   }
-  if (is_nested_tensor_impl(indices) && get_is_contiguous(indices) &&
-      !is_nested_tensor_impl(weight) && get_dim(indices) == 2 &&
-      get_nested_dim(indices) == 1) {
+  if (is_nested_tensor_impl(indices) &&
+      !is_nested_tensor_impl(weight) &&
+      get_dim(indices) == 1 &&
+      get_dim(weight) == 2 &&
+      get_is_contiguous(indices) &&
+      get_is_contiguous(weight)) {
     Tensor indices_buffer = get_buffer(indices);
     Tensor result_buffer = at::embedding(
         weight, indices_buffer, padding_idx, scale_grad_by_freq, sparse);
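The rewritten condition routes contiguous NestedTensor indices with a plain 2-d weight straight to at::embedding on the packed index buffer. The sketch below (plain PyTorch, not from the commit; the names and sizes are illustrative) shows why that is equivalent: an embedding lookup is independent per index, so embedding the concatenated buffer and re-splitting matches embedding each sequence on its own.

import torch

weight = torch.randn(10, 4)                                   # vocab_size=10, embedding_dim=4
seqs = [torch.tensor([1, 2, 3]), torch.tensor([7, 8])]        # two variable-length index sequences

per_sequence = [torch.nn.functional.embedding(s, weight) for s in seqs]
packed = torch.nn.functional.embedding(torch.cat(seqs), weight)  # one call on the packed buffer
resplit = packed.split([len(s) for s in seqs])

assert all(torch.equal(a, b) for a, b in zip(per_sequence, resplit))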

nestedtensor/csrc/masking.cpp
Lines changed: 44 additions & 9 deletions

@@ -26,7 +26,7 @@ std::tuple<Tensor, Tensor> merge_tensor_mask(
   Tensor is_zero = (collapsed_mask == 0);
   int64_t is_last_size_sum = is_last_size.sum().item<int64_t>();
   int64_t is_zero_sum = is_zero.sum().item<int64_t>();
-  if ((is_last_size_sum + is_zero_sum) == collapsed_mask.numel()) {
+  if ((is_last_size_sum + is_zero_sum) == get_numel(collapsed_mask)) {
    collapsed_mask = collapsed_mask.to(torch::kBool);
    return merge_tensor_mask(tensor, collapsed_mask, mask_dim);
  }
@@ -85,7 +85,7 @@ std::vector<int64_t> get_max_size(Tensor nt) {
 
 std::tuple<Tensor, Tensor> pad_nt(Tensor nt, std::vector<int64_t> shape) {
   if (!is_nested_tensor_impl(nt)) {
-    if (nt.numel() == 0) {
+    if (get_numel(nt) == 0) {
      TORCH_CHECK(false, "Empty tensors are not yet supported.");
    }
    // Dont pad in case of a scalar
@@ -131,7 +131,7 @@ c10::optional<Tensor> nt_from_tensor_mask(
     Tensor mask,
     int64_t nested_dim) {
   if (nested_dim == 0) {
-    if ((mask.numel() == 0) || (mask.numel() == 1 && mask.item<bool>())) {
+    if ((get_numel(mask) == 0) || (get_numel(mask) == 1 && mask.item<bool>())) {
      return tensor;
    }
 
@@ -153,7 +153,7 @@ c10::optional<Tensor> nt_from_tensor_mask(
   bool all_zero = true;
   for (int64_t i = 0; i < mask.size(0); i++) {
     Tensor tmp = *nt_from_tensor_mask(tensor[i], mask[i], nested_dim);
-    if (tmp.numel() > 0) {
+    if (get_numel(tmp) > 0) {
      all_zero = false;
      tensors.push_back(tmp);
    }
@@ -172,12 +172,12 @@ c10::optional<Tensor> nt_from_tensor_mask(
     return c10::nullopt;
   }
   std::vector<c10::optional<Tensor>> inner_tensors;
-  if ((mask.numel() == 0) || (mask.numel() == 1 && mask.item<bool>())) {
+  if ((get_numel(mask) == 0) || (get_numel(mask) == 1 && mask.item<bool>())) {
    for (int64_t i = 0; i < tensor.size(0); i++) {
      inner_tensors.push_back(
          nt_from_tensor_mask(tensor[i], mask, nested_dim - 1));
    }
-  } else if (mask.numel() == 1 && !mask.item<bool>()) {
+  } else if (get_numel(mask) == 1 && !mask.item<bool>()) {
    inner_tensors.push_back(c10::nullopt);
  } else {
    for (int64_t i = 0; i < tensor.size(0); i++) {
@@ -198,6 +198,41 @@ c10::optional<Tensor> nt_from_tensor_mask(
 std::tuple<Tensor, Tensor> to_tensor_mask(
     Tensor nt,
     c10::optional<int64_t> mask_dim) {
+#ifdef WITH_CUDA
+  if (get_dim(nt) == 3 && get_is_contiguous(nt) && mask_dim && *mask_dim == 2) {
+    auto nt_opt_size = get_opt_sizes(nt);
+    Tensor nt_buffer = get_buffer(nt);
+    if (nt_opt_size[2] && nt_buffer.is_cuda()) {
+      std::cout << "Calling efficient to_tensor_mask" << std::endl;
+      Tensor nt_sizes_ =
+          get_efficient_nested_size(nt).sizes().to(torch::kInt32);
+      TORCH_CHECK(nt_sizes_.dim() == 2, "NestedTensor must be of nested_dim 2.")
+      Tensor nt_sizes = at::native::narrow(nt_sizes_, 1, 0, 1);
+      int max_size_1 = nt_sizes.max().item<int>();
+      nt_sizes =
+          at::native::cumsum(nt_sizes, 0).to(torch::kInt32).reshape({-1});
+      nt_sizes = at::cat({torch::tensor({0}, torch::kInt32), nt_sizes});
+      Tensor output = torch::zeros(
+          {*nt_opt_size[0], max_size_1, *nt_opt_size[2]}, nt_buffer.options());
+      nt_sizes = nt_sizes.to(torch::kCUDA);
+      Tensor output_mask = torch::zeros(
+          {*nt_opt_size[0], max_size_1}, nt_buffer.options());
+      output_mask = output_mask.to(torch::kInt32);
+      at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream();
+      nested_tensor::cuda::add_padding_mask_kernelLauncher(
+          nt_buffer.data_ptr<float>(),
+          output.data_ptr<float>(),
+          output_mask.data_ptr<int>(),
+          nt_sizes.data_ptr<int>(),
+          *nt_opt_size[0],
+          output_mask.stride(0),
+          output.stride(0),
+          *nt_opt_size[2],
+          defaultStream);
+      return std::make_tuple(output, output_mask.to(torch::kBool));
+    }
+  }
+#endif
   TORCH_CHECK(
       !mask_dim || *mask_dim <= get_dim(nt),
       "Requested mask dimension ",
@@ -225,10 +260,10 @@ std::tuple<Tensor, Tensor> to_tensor_mask(
 
 Tensor to_padded_tensor(Tensor nt, double padding) {
 #ifdef WITH_CUDA
-  if (get_dim(nt) == 3) {
+  if (get_dim(nt) == 3 && get_is_contiguous(nt)) {
     auto nt_opt_size = get_opt_sizes(nt);
-    if (nt_opt_size[2]) {
-      Tensor nt_buffer = get_buffer(nt);
+    Tensor nt_buffer = get_buffer(nt);
+    if (nt_opt_size[2] && nt_buffer.is_cuda()) {
      Tensor nt_sizes_ =
          get_efficient_nested_size(nt).sizes().to(torch::kInt32);
      TORCH_CHECK(nt_sizes_.dim() == 2, "NestedTensor must be of nested_dim 2.")
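Before launching add_padding_mask_kernelLauncher, the new branch derives per-sequence lengths from the efficient nested sizes, takes their maximum as the padded length, and builds a cumulative-sum offsets tensor with a leading zero. A small sketch of that host-side preparation (not from the commit; nested_sizes stands in for get_efficient_nested_size(nt).sizes() and the values are made up):

import torch

nested_sizes = torch.tensor([[3, 256], [5, 256], [2, 256]], dtype=torch.int32)  # one row per sequence
lengths = nested_sizes.narrow(1, 0, 1)                 # the varying (sequence-length) dimension
max_size_1 = int(lengths.max())                        # padded sequence length
offsets = torch.cumsum(lengths.reshape(-1), 0).to(torch.int32)
offsets = torch.cat([torch.tensor([0], dtype=torch.int32), offsets])
# offsets == [0, 3, 8, 10]; offsets[b] and offsets[b + 1] bracket sequence b in the packed buffer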

nestedtensor/csrc/nested_tensor_impl.cpp
Lines changed: 0 additions & 7 deletions

@@ -344,18 +344,11 @@ Tensor NestedTensor_unsqueeze(const Tensor& self, int64_t dim) {
   return wrap_tensor_node(TensorNode(std::move(result_nodes)));
 }
 
-Tensor NestedTensor_serialize_nested_size(const Tensor& tensor) {
-  auto nt_impl = get_nested_tensor_impl(tensor);
-  std::vector<int64_t> out;
-  return torch::tensor(torch::nested_tensor::serialize(nt_impl->nested_size()));
-}
-
 TORCH_LIBRARY_IMPL(aten, NestedTensor, m) {
   nt_impl(m, "contiguous", NestedTensor_contiguous);
   nt_impl(m, "copy_", NestedTensor_copy_);
   nt_impl(m, "is_pinned", NestedTensor_is_pinned);
   nt_impl(m, "select.int", NestedTensor_select);
-  nt_impl(m, "serialize_nested_size", NestedTensor_serialize_nested_size);
   nt_impl(m, "size.int", NestedTensor_size_int);
   nt_impl(m, "slice.Tensor", NestedTensor_slice);
   nt_impl(m, "squeeze", NestedTensor_squeeze);

nestedtensor/csrc/nested_tensor_impl.h
Lines changed: 3 additions & 6 deletions

@@ -206,10 +206,7 @@ inline int64_t get_dim(const at::Tensor& tensor) {
 
 inline int64_t get_numel(const at::Tensor& tensor) {
   if (is_nested_tensor_impl(tensor)) {
-    return reduce(
-        [](at::Tensor leaf, int64_t input) { return input + leaf.numel(); },
-        0,
-        get_nested_tensor_structure(tensor));
+    return get_nested_tensor_impl(tensor)->get_storage()->numel();
   }
   return tensor.numel();
 }
@@ -304,8 +301,8 @@ inline Tensor NestedTensor_to_sparse_csr(Tensor tensor) {
     col_indices_.push_back(torch::arange({tensor_sizes_ptr[i]}));
   }
   at::Tensor col_indices = at::cat(col_indices_);
-  return at::native::sparse_csr_tensor(crow_indices, col_indices, values,
-      c10::nullopt, torch::kSparseCsr);
+  return at::native::sparse_csr_tensor(
+      crow_indices, col_indices, values, c10::nullopt, torch::kSparseCsr);
 }
 
 inline std::ostream& operator<<(
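The simplified get_numel reads the element count straight off the packed storage instead of folding over the leaf tensors. A tiny sketch of the invariant this relies on (not from the commit; the leaf shapes are made up): a contiguous NestedTensor keeps all leaf elements back to back in one buffer, so the buffer's numel equals the sum of the leaves' numels.

import torch

leaves = [torch.randn(3, 4), torch.randn(5, 4), torch.randn(2, 4)]
packed_buffer = torch.cat([t.reshape(-1) for t in leaves])   # roughly what the contiguous storage holds

assert packed_buffer.numel() == sum(t.numel() for t in leaves)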

nestedtensor/csrc/py_init.cpp
Lines changed: 0 additions & 16 deletions

@@ -249,22 +249,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     return _nested_helper(index, get_nested_size(self));
   });
 
-  m.def("serialize_nested_size", [](Tensor self) {
-    return serialize(get_nested_tensor_impl(self)->nested_size());
-  });
-
-  m.def("deserialize_nested_size", [](std::vector<int64_t> out) {
-    SizeNode nested_size = deserialize_size_node(out);
-    return py::cast(THPPythonNode(
-        map(
-            [](std::vector<int64_t> e) {
-              return py::reinterpret_steal<py::object>(
-                  THPSize_NewFromSizes(e.size(), e.data()));
-            },
-            nested_size),
-        "NestedSize"));
-  });
-
   m.def("nested_stride", [](Tensor self, c10::optional<int64_t> index_) {
     if (!index_) {
       return py::cast(THPPythonNode(

nestedtensor/csrc/storage/EfficientSizeNode.h
Lines changed: 3 additions & 0 deletions

@@ -135,6 +135,9 @@ struct EfficientSizeNode {
       return _structure[0];
     }
     if (_sizes.dim() > 0) {
+      if (_sizes.numel() == 0) {
+        return 0;
+      }
       Tensor nt_sizes = at::native::narrow(
           _sizes, 1 /* dim */, 0 /* start */, 1 /* length */);
       for (int64_t i = 1; i < _sizes.size(1); i++) {
