pytorch
diff --git a/‎nestedtensor/csrc/cuda/padding.cu‎
Lines changed: 57 additions & 0 deletions b/‎nestedtensor/csrc/cuda/padding.cu‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎nestedtensor/csrc/cuda/padding.h‎
Lines changed: 19 additions & 0 deletions b/‎nestedtensor/csrc/cuda/padding.h‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎nestedtensor/csrc/functions.cpp‎
Lines changed: 27 additions & 0 deletions b/‎nestedtensor/csrc/functions.cpp‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎nestedtensor/csrc/masking.cpp‎
Lines changed: 74 additions & 11 deletions b/‎nestedtensor/csrc/masking.cpp‎
Lines changed: 74 additions & 11 deletions
diff --git a/‎nestedtensor/csrc/nested_tensor_impl.h‎
Lines changed: 27 additions & 4 deletions b/‎nestedtensor/csrc/nested_tensor_impl.h‎
Lines changed: 27 additions & 4 deletions
diff --git a/‎nestedtensor/csrc/py_init.cpp‎
Lines changed: 14 additions & 3 deletions b/‎nestedtensor/csrc/py_init.cpp‎
Lines changed: 14 additions & 3 deletions
diff --git a/‎nestedtensor/csrc/storage/Packed.h‎
Lines changed: 3 additions & 1 deletion b/‎nestedtensor/csrc/storage/Packed.h‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎nestedtensor/nested/masking.py‎
Lines changed: 0 additions & 39 deletions b/‎nestedtensor/nested/masking.py‎
Lines changed: 0 additions & 39 deletions
@@ -0,0 +1,57 @@
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cmath>
+#include <nestedtensor/csrc/cuda/attention.h>
+#include <stdio.h>
+
+namespace nested_tensor {
+namespace cuda {
+
+// Copies the packed data of every batch entry into its slot of a padded
+// output buffer.
+//
+// Launch layout: one block per batch entry (blockIdx.x selects the entry).
+// The threads of a block stride over that entry's elements, so any
+// blockDim.x >= 1 produces the same result. No shared memory is used.
+//
+// input:         packed source; entry b starts at element
+//                offsets[b] * inner_size.
+// output:        destination; entry b starts at element b * output_stride.
+//                Elements past the end of an entry are never written, so the
+//                caller must pre-fill the buffer with the padding value.
+// offsets:       batch_size + 1 values; entry b holds
+//                (offsets[b + 1] - offsets[b]) * inner_size elements.
+// batch_size:    number of batch entries.
+// output_stride: distance, in elements, between consecutive output entries.
+// inner_size:    number of elements per unit counted by offsets.
+template<typename T>
+__global__
+void add_padding(
+    const T* input,
+    T* output,
+    const int* offsets,
+    const int batch_size,
+    const int output_stride,
+    const int inner_size) 
+{
+  const int batch_id  = blockIdx.x;
+  // Defensive guard in case the grid is larger than the batch.
+  if (batch_id >= batch_size) {
+    return;
+  }
+  // Read the offsets once and do all index math in 64 bits so that large
+  // buffers cannot overflow a 32-bit int.
+  const int first = offsets[batch_id];
+  const int last = offsets[batch_id + 1];
+  const long long count = static_cast<long long>(last - first) * inner_size;
+  const long long src_begin = static_cast<long long>(first) * inner_size;
+  const long long dst_begin = static_cast<long long>(batch_id) * output_stride;
+  // Block-stride loop: consecutive threads touch consecutive elements.
+  for (long long i = threadIdx.x; i < count; i += blockDim.x) {
+    output[dst_begin + i] = input[src_begin + i];
+  }
+}
+
+// Host-side launcher for add_padding<T>. Enqueues the kernel on `stream` and
+// returns without synchronizing; launch errors are not checked here.
+template<typename T>
+void add_padding_kernelLauncher(
+    T* input, // packed entries, see add_padding
+    T* output, // [batch_size x output_stride], pre-filled with the padding value
+    const int* offsets, // [batch_size + 1], device pointer
+    const int batch_size,
+    const int output_stride,
+    const int inner_size,
+    const cudaStream_t stream)
+{
+  // A grid with zero blocks is an invalid launch configuration, and there is
+  // nothing to copy anyway.
+  if (batch_size <= 0) {
+    return;
+  }
+
+  constexpr int kThreadsPerBlock = 256;
+  dim3 grid;
+  grid.x = batch_size;
+
+  // Instantiate the kernel for T rather than hard-coding float so that the
+  // launcher stays correct for any explicitly instantiated type.
+  add_padding<T><<<grid, kThreadsPerBlock, 0, stream>>>(
+      input,
+      output,
+      offsets,
+      batch_size,
+      output_stride,
+      inner_size);
+}
+
+template void add_padding_kernelLauncher<float>(
+    float* input,
+    float* output,
+    const int* offsets,
+    const int batch_size,
+    const int output_stride,
+    const int inner_size,
+    const cudaStream_t stream);
+}
+}
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <assert.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+namespace nested_tensor {
+namespace cuda {
+
+// Launches the add_padding kernel on `stream`, copying packed entries from
+// `input` into the padded `output` buffer.
+//
+// input:         packed source data (device pointer).
+// output:        padded destination (device pointer); entry b starts at
+//                element b * output_stride.
+// offsets:       device pointer to batch_size + 1 values; the definition
+//                reads offsets[b] and offsets[b + 1] for entry b.
+// batch_size:    number of batch entries.
+// output_stride: distance, in elements, between consecutive output entries.
+// inner_size:    number of elements per unit counted by offsets.
+// stream:        CUDA stream the kernel is enqueued on.
+template <typename T>
+void add_padding_kernelLauncher(
+    T* input,
+    T* output,
+    const int* offsets,
+    const int batch_size,
+    const int output_stride,
+    const int inner_size,
+    const cudaStream_t stream);
+}
+} // namespace nested_tensor
@@ -26,6 +26,33 @@ Tensor NestedTensor_embedding(
         weight,
         indices);
   }
+  if (is_nested_tensor_impl(indices) && get_is_contiguous(indices) &&
+      !is_nested_tensor_impl(weight) && get_dim(indices) == 2 &&
+      get_nested_dim(indices) == 1) {
+    Tensor indices_buffer = get_buffer(indices);
+    Tensor result_buffer = at::embedding(
+        weight, indices_buffer, padding_idx, scale_grad_by_freq, sparse);
+    EfficientSizeNode new_nested_size = get_efficient_nested_size(indices);
+    EfficientSizeNode new_nested_stride = get_efficient_nested_stride(indices);
+    auto new_nested_size_sizes = new_nested_size.sizes();
+    auto new_nested_stride_sizes = new_nested_stride.sizes();
+    auto tmp = torch::empty(
+        {new_nested_size_sizes.size(0)}, new_nested_size_sizes.options());
+    tmp.fill_(weight.size(1));
+    tmp = tmp.reshape({new_nested_size_sizes.size(0), 1});
+    new_nested_size_sizes = at::cat({new_nested_size_sizes, tmp}, 1);
+    new_nested_stride_sizes = at::cat({tmp, new_nested_stride_sizes}, 1);
+    return wrap_buffer(
+        std::move(result_buffer),
+        EfficientSizeNode(
+            new_nested_size.height(),
+            new_nested_size.structure(),
+            new_nested_size_sizes),
+        EfficientSizeNode(
+            new_nested_stride.height(),
+            new_nested_stride.structure(),
+            new_nested_stride_sizes));
+  }
   return map_nested_tensor(
       [&](at::Tensor i) {
         return at::embedding(
 
@@ -1,5 +1,9 @@
 #include <nestedtensor/csrc/masking.h>
 #include <chrono>
+#ifdef WITH_CUDA
+#include <c10/cuda/CUDAStream.h>
+#include <nestedtensor/csrc/cuda/padding.h>
+#endif
 
 using namespace torch::nested_tensor;
 using namespace at;
@@ -40,7 +44,7 @@ std::tuple<Tensor, Tensor> merge_tensor_mask(
 Tensor pad_tensor_to_shape(Tensor t, std::vector<int64_t> goal_shape) {
   std::vector<int64_t> padd;
   auto tup = t.sizes();
-  if (get_dim(t) != goal_shape.size()) {
+  if (get_dim(t) != (int64_t)(goal_shape.size())) {
     throw std::runtime_error("dimension doesn't match length of goal shape.");
   }
   for (int64_t i = tup.size() - 1; i >= 0; i--) {
@@ -182,7 +186,7 @@ c10::optional<Tensor> nt_from_tensor_mask(
     }
   }
   std::vector<TensorNode> inner_tensor_nodes;
-  for (int64_t i = 0; i < inner_tensors.size(); i++) {
+  for (size_t i = 0; i < inner_tensors.size(); i++) {
     if (inner_tensors[i]) {
       TensorNode node = get_nested_tensor_structure(*inner_tensors[i]);
       inner_tensor_nodes.push_back(node);
@@ -194,15 +198,68 @@ c10::optional<Tensor> nt_from_tensor_mask(
 std::tuple<Tensor, Tensor> to_tensor_mask(
     Tensor nt,
     c10::optional<int64_t> mask_dim) {
-  // TODO: Cover if not isinstance(nt, list) and nt.size() == (1,):
-  // TODO: Move to_tensor_mask entirely into C++
-
-  std::vector<int64_t> max_size = get_max_size(nt);
-  Tensor tensor;
-  Tensor mask;
-  std::tie(tensor, mask) = pad_nt(nt, max_size);
-  std::tie(tensor, mask) = merge_tensor_mask(tensor, mask, mask_dim);
-  return std::make_tuple(tensor, mask);
+  // Returns a (tensor, mask) pair for the NestedTensor `nt`. When `mask_dim`
+  // is given it must not exceed the dimension of `nt`; it is forwarded to
+  // merge_tensor_mask for the general case.
+  TORCH_CHECK(
+      !mask_dim || *mask_dim <= get_dim(nt),
+      "Requested mask dimension ",
+      *mask_dim,
+      " is bigger than dimension ",
+      get_dim(nt),
+      " of given NestedTensor.");
+
+  // Special case: a one-dimensional NestedTensor holding a single entry.
+  // Its flattened buffer is returned with an all-true mask that is
+  // zero-dimensional when mask_dim is absent or 0, and one-element otherwise.
+  const auto sizes_per_dim = get_opt_sizes(nt);
+  const bool single_entry =
+      sizes_per_dim.size() == 1 && *sizes_per_dim[0] == 1;
+  if (single_entry) {
+    Tensor contiguous_nt = NestedTensor_contiguous(nt);
+    Tensor flat_buffer = get_buffer(contiguous_nt).reshape({-1});
+    if (!mask_dim || *mask_dim == 0) {
+      return std::make_tuple(flat_buffer, torch::tensor(true));
+    }
+    return std::make_tuple(flat_buffer, torch::tensor({true}));
+  }
+
+  // General case: pad every entry up to the per-dimension maximum, then let
+  // merge_tensor_mask reduce the mask to the requested dimension.
+  auto padded_shape = get_max_size(nt);
+  auto padded = pad_nt(nt, padded_shape);
+  return merge_tensor_mask(
+      std::get<0>(padded), std::get<1>(padded), mask_dim);
+}
+
+// Converts the NestedTensor `nt` into a regular tensor in which every
+// position not covered by an entry of `nt` holds the value `padding`.
+//
+// In WITH_CUDA builds a dedicated kernel is used when all of the following
+// hold: `nt` has dimension 3, is contiguous, has a regular last dimension,
+// and its buffer is a float32 tensor on a CUDA device. Every other input
+// goes through to_tensor_mask followed by masked_fill_.
+Tensor to_padded_tensor(Tensor nt, double padding) {
+#ifdef WITH_CUDA
+  // get_buffer is only used on contiguous NestedTensors elsewhere in this
+  // code base, so the same precondition is required here.
+  if (get_dim(nt) == 3 && get_is_contiguous(nt)) {
+    auto nt_opt_size = get_opt_sizes(nt);
+    Tensor nt_buffer = get_buffer(nt);
+    // The kernel launcher is only instantiated for float and dereferences
+    // device pointers; anything else must take the generic path below.
+    if (nt_opt_size[2] && nt_buffer.is_cuda() &&
+        nt_buffer.scalar_type() == torch::kFloat) {
+      Tensor nt_sizes_ =
+          get_efficient_nested_size(nt).sizes().to(torch::kInt32);
+      TORCH_CHECK(
+          nt_sizes_.dim() == 2, "NestedTensor must be of nested_dim 2.");
+      // Column 0 holds the length of each entry along its first dimension.
+      Tensor nt_sizes = at::native::narrow(nt_sizes_, 1, 0, 1);
+      int max_size_1 = nt_sizes.max().item<int>();
+      // Turn the lengths into batch_size + 1 offsets: [0, cumsum(lengths)].
+      nt_sizes =
+          at::native::cumsum(nt_sizes, 0).to(torch::kInt32).reshape({-1});
+      nt_sizes = at::cat({torch::tensor({0}, torch::kInt32), nt_sizes});
+      Tensor output = torch::empty(
+          {*nt_opt_size[0], max_size_1, *nt_opt_size[2]}, nt_buffer.options());
+      // The kernel only writes the valid region, so pre-fill the padding.
+      output.fill_(padding);
+      // Place the offsets on the same device as the data.
+      nt_sizes = nt_sizes.to(nt_buffer.device());
+      // Use the current stream so the kernel is ordered after the fill_ and
+      // the offsets copy above, which are enqueued on that same stream.
+      at::cuda::CUDAStream currentStream = at::cuda::getCurrentCUDAStream();
+      nested_tensor::cuda::add_padding_kernelLauncher(
+          nt_buffer.data_ptr<float>(),
+          output.data_ptr<float>(),
+          nt_sizes.data_ptr<int>(),
+          static_cast<int>(*nt_opt_size[0]),
+          static_cast<int>(output.stride(0)),
+          static_cast<int>(*nt_opt_size[2]),
+          currentStream);
+      return output;
+    }
+  }
+#endif
+  // Generic path: pad via to_tensor_mask with a full-dimensional mask, then
+  // overwrite every masked-out position with the padding value.
+  at::Tensor tensor;
+  at::Tensor mask;
+  std::tie(tensor, mask) = to_tensor_mask(nt, get_dim(nt));
+  mask = mask.to(torch::kBool);
+  tensor.masked_fill_(at::logical_not(mask), padding);
+  return tensor;
 }
 
 TORCH_LIBRARY_FRAGMENT(nestedtensor, m) {
@@ -219,4 +276,10 @@ TORCH_LIBRARY_FRAGMENT(nestedtensor, m) {
 
   m.def("get_max_size(Tensor nt) -> int[]");
   m.impl("get_max_size", NestedTensorKey, TORCH_FN(get_max_size));
+
+  m.def("to_tensor_mask(Tensor nt, int? mask_dim) -> (Tensor, Tensor)");
+  m.impl("to_tensor_mask", NestedTensorKey, to_tensor_mask);
+
+  m.def("to_padded_tensor(Tensor nt, float padding) -> Tensor");
+  m.impl("to_padded_tensor", NestedTensorKey, to_padded_tensor);
 }
@@ -173,15 +173,13 @@ inline const std::vector<c10::optional<int64_t>> get_opt_sizes(
   return get_nested_tensor_impl(tensor)->opt_sizes();
 }
 
-inline const EfficientSizeNode get_efficient_nested_size(
-    at::Tensor tensor) {
+inline const EfficientSizeNode get_efficient_nested_size(at::Tensor tensor) {
   TORCH_CHECK(
       is_nested_tensor_impl(tensor), "Given tensor must be NestedTensor.");
   return get_nested_tensor_impl(tensor)->get_storage()->nested_size();
 }
 
-inline const EfficientSizeNode get_efficient_nested_stride(
-    at::Tensor tensor) {
+inline const EfficientSizeNode get_efficient_nested_stride(at::Tensor tensor) {
   TORCH_CHECK(
       is_nested_tensor_impl(tensor), "Given tensor must be NestedTensor.");
   return get_nested_tensor_impl(tensor)->get_storage()->nested_stride();
@@ -285,6 +283,31 @@ inline bool is_tensor_shape(const at::Tensor tensor) {
 
 Tensor NestedTensor_to_tensor(Tensor tensor, c10::optional<int64_t> dim_);
 
+// Converts a NestedTensor of dimension 2 into a sparse CSR tensor.
+//
+// Entry i of the NestedTensor becomes row i of the result: crow_indices is
+// [0, cumsum(lengths)], and the column indices of each row are
+// 0 .. length_i - 1. The values are the flattened data of the NestedTensor.
+//
+// Raises (via TORCH_CHECK) if the given tensor is not of dimension 2.
+// NOTE(review): the index tensors are created with default options while
+// `values` keeps the device of the input; confirm whether callers need the
+// indices moved to that device.
+inline Tensor NestedTensor_to_sparse_csr(Tensor tensor) {
+  TORCH_CHECK(
+      get_dim(tensor) == 2,
+      "Given tensor must be of dimension 2, got dimension ",
+      get_dim(tensor));
+  Tensor values;
+  if (get_is_contiguous(tensor)) {
+    values = get_buffer(tensor).reshape({-1});
+  } else {
+    values = at::cat(flatten(get_nested_tensor_structure(tensor)));
+  }
+  // Flatten the per-entry sizes and force a dense layout, because the raw
+  // pointer below is indexed with unit stride.
+  auto tensor_sizes =
+      get_efficient_nested_size(tensor).sizes().reshape({-1}).contiguous();
+  const int64_t num_rows = tensor_sizes.size(0);
+  const int64_t* tensor_sizes_ptr = tensor_sizes.data_ptr<int64_t>();
+  at::Tensor crow_indices =
+      at::cat({torch::tensor({0}), at::cumsum(tensor_sizes, 0)});
+  std::vector<at::Tensor> col_indices_;
+  col_indices_.reserve(num_rows);
+  for (int64_t i = 0; i < num_rows; i++) {
+    col_indices_.push_back(torch::arange({tensor_sizes_ptr[i]}));
+  }
+  // at::cat rejects an empty list, so build the empty index tensor directly
+  // when there are no rows.
+  at::Tensor col_indices = col_indices_.empty()
+      ? torch::empty({0}, torch::kLong)
+      : at::cat(col_indices_);
+  return at::native::sparse_csr_tensor(crow_indices, col_indices, values,
+      c10::nullopt, torch::kSparseCsr);
+}
+
 inline std::ostream& operator<<(
     std::ostream& out,
     const NestedTensorImpl& batch_tensor) {
 
@@ -179,18 +179,29 @@ TORCH_LIBRARY(nestedtensor, m) {
   m.impl("get_dim", NestedTensorKey, [](Tensor self) { return get_dim(self); });
 
   m.def("get_numel(Tensor self) -> int");
-  m.impl("get_numel", NestedTensorKey, [](Tensor self) { return get_numel(self); });
+  m.impl("get_numel", NestedTensorKey, [](Tensor self) {
+    return get_numel(self);
+  });
 
   m.def("get_is_contiguous(Tensor self) -> int");
-  m.impl("get_is_contiguous", NestedTensorKey, [](Tensor self) { return get_is_contiguous(self); });
+  m.impl("get_is_contiguous", NestedTensorKey, [](Tensor self) {
+    return get_is_contiguous(self);
+  });
 
   m.def("make_contiguous(Tensor self) -> Tensor");
-  m.impl("make_contiguous", NestedTensorKey, [](Tensor self) { return NestedTensor_contiguous(self); });
+  m.impl("make_contiguous", NestedTensorKey, [](Tensor self) {
+    return NestedTensor_contiguous(self);
+  });
 
   m.def("to_tensor_list(Tensor tensor) -> Tensor[]");
   m.impl("to_tensor_list", NestedTensorKey, [](Tensor tensor) {
     return flatten_nested_tensor(tensor);
   });
+
+  m.def("to_sparse_csr(Tensor tensor) -> Tensor");
+  m.impl("to_sparse_csr", NestedTensorKey, [](Tensor tensor) {
+    return NestedTensor_to_sparse_csr(tensor);
+  });
 }
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 
@@ -9,6 +9,8 @@ inline std::tuple<TensorNode, at::Tensor> build_structure(
     const at::Tensor& buffer,
     const SizeNode& nested_size,
     const SizeNode& nested_stride) {
+  TORCH_CHECK(
+      buffer.dim() == 1, "Given buffer must be vector, i.e. dim 1 Tensor.");
   std::vector<int64_t> split_sizes = flatten(
       map([](std::vector<int64_t> a,
              std::vector<int64_t> b) { return num_memory(a, b); },
@@ -121,7 +123,7 @@ struct PackedStorage : public NestedTensorStorage {
   }
   TensorNode get_structure() const override {
     return std::get<0>(impl::build_structure(
-        _buffer, _nested_size.to_size_node(), _nested_stride.to_size_node()));
+        _buffer.reshape({-1}), _nested_size.to_size_node(), _nested_stride.to_size_node()));
   }
   at::Tensor& get_buffer() {
     return _buffer;
 
@@ -48,42 +48,3 @@ def nt_from_tensor_mask(tensor, mask, nested_dim):
         tensor, mask, nested_dim)
     assert result is not None
     return nestedtensor.NestedTensor(result).contiguous()
-
-
-def get_tensor_mask(nt, shape):
-    return torch.ops.nestedtensor.pad_nt(nt, shape)
-
-
-# Return a tuple of a tensor and a mask that represent the given tensor list
-# Returned tensor is always the same no matter what mask_dim was passed.
-# If mask_dim was not passed, a mask with the smallest dimensionality would be returned.
-# if passed mask_dim is lower than the minimal dimensionality of the mask that can represent
-# the data tensor, an error is thrown.
-def to_tensor_mask(nt, mask_dim):
-    if mask_dim is not None and mask_dim > nt.dim():
-        raise RuntimeError(
-            "Mask dimension is bigger than nested dimension of a nested tensor.")
-
-    # Check if scalar was passed
-    if not isinstance(nt, list) and nt.size() == (1,):
-        res_scalar = torch.tensor(
-            [nt[0].item()], dtype=nt.dtype, device=nt.device, requires_grad=nt.requires_grad)
-        mask = torch.tensor(
-            True) if mask_dim == 0 or mask_dim is None else torch.tensor([True])
-        return res_scalar, mask
-
-    max_size = torch.ops.nestedtensor.get_max_size(nt)
-    res_tensor, res_mask = get_tensor_mask(nt, max_size)
-    tensor_mask_tuple = merge_tensor_mask(
-        TensorMask(res_tensor, res_mask), mask_dim)
-
-    return tensor_mask_tuple.tensor, tensor_mask_tuple.mask
-
-
-# Merge mask to a given dimension if possible.
-def merge_tensor_mask(tensor_mask, mask_dim):
-    tensor = tensor_mask.tensor
-    mask = tensor_mask.mask
-    tensor, mask = torch.ops.nestedtensor.merge_tensor_mask(
-        tensor, mask, mask_dim)
-    return TensorMask(tensor=tensor, mask=mask)