 #include <nestedtensor/csrc/BinaryOps.h>
+#ifdef WITH_CUDA
+#include <c10/cuda/CUDAStream.h>
+#include <nestedtensor/csrc/cuda/add.h>
+#include <c10/util/Half.h>
+#endif

 namespace at {

@@ -31,11 +36,56 @@ Tensor NestedTensor_add_Tensor(
     }
   }
   if (is_nested_tensor_impl(self) && !is_nested_tensor_impl(other)) {
-    if (!get_is_contiguous(self)) {
-      self = NestedTensor_contiguous(self);
-    }
+    self = NestedTensor_contiguous(self);
     int64_t self_dim = get_dim(self);
     auto self_opt_sizes = get_opt_sizes(self);
+#ifdef WITH_CUDA
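+    // Fused CUDA fp16 fast path: "other" is a dense (1, C, 1, 1) half tensor, so
+    // the broadcasted add reduces to adding one scalar per channel to the
+    // contiguous nested buffer, instead of mapping at::add over every constituent.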
+    if (self_dim == 4 && other.dim() == 4 &&
+        self_opt_sizes[0] &&
+        self_opt_sizes[1] &&
+        (*self_opt_sizes[1]) == other.size(1) &&
+        other.size(0) == 1 &&
+        other.size(2) == 1 &&
+        other.size(3) == 1 &&
+        self.dtype() == c10::ScalarType::Half &&
+        other.dtype() == c10::ScalarType::Half) {
+      other = other.contiguous();
+      at::Tensor self_buffer = get_buffer(self);
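+      // Build cumulative element counts: each example contributes C copies of
+      // H_i * W_i, and the prefix sum (with a leading 0) gives the
+      // per-(example, channel) segment boundaries consumed by the kernel.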
+      Tensor nt_sizes_ =
+          get_efficient_nested_size(self).sizes().to(torch::kInt32);
+      Tensor nt_sizes_1 = at::native::narrow(nt_sizes_, 1, 1, 1);
+      Tensor nt_sizes_2 = at::native::narrow(nt_sizes_, 1, 2, 1);
+      Tensor nt_sizes_all = nt_sizes_1 * nt_sizes_2;
+      std::vector<int> numbers;
+      for (int64_t i = 0; i < nt_sizes_all.size(0); i++) {
+        for (int64_t j = 0; j < *self_opt_sizes[1]; j++) {
+          numbers.push_back(nt_sizes_all[i].item<int>());
+        }
+      }
+      at::Tensor numbers_t = torch::tensor(numbers).to(torch::kInt32);
+      Tensor nt_sizes_cumsum =
+          at::native::cumsum(numbers_t, 0).to(torch::kInt32).reshape({-1});
+      TORCH_CHECK(nt_sizes_.dim() == 2, "NestedTensor metadata of unexpected dimension.");
+      Tensor nt_sizes = at::cat({torch::tensor({0}, torch::kInt32), nt_sizes_cumsum});
+      nt_sizes = nt_sizes.to(torch::kCUDA);
+      at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream();
+      at::Tensor result_buffer = self_buffer.clone();
+
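+      // Launch the fused kernel on the default stream; it is handed N * C
+      // segments, the batch size N, and the offset table (see
+      // nestedtensor/csrc/cuda/add.h for the launcher's parameter meanings).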
+      c10::Half* self_ptr = self_buffer.data_ptr<c10::Half>();
+      c10::Half* other_ptr = other.data_ptr<c10::Half>();
+      c10::Half* result_ptr = result_buffer.data_ptr<c10::Half>();
+      nested_tensor::cuda::add_scalar_kernelLauncher(
+          self_ptr,
+          other_ptr,
+          result_ptr,
+          (int)(*self_opt_sizes[0] * *self_opt_sizes[1]),
+          (int)(*self_opt_sizes[0]),
+          nt_sizes.data_ptr<int>(),
+          defaultStream);
+      return wrap_buffer(std::move(result_buffer), get_efficient_nested_size(self),
+          get_efficient_nested_stride(self));
+    }
+#endif
     if (self_opt_sizes[self_dim - 1] && other.dim() == 1 &&
         (*(self_opt_sizes[self_dim - 1])) == other.size(0)) {
       Tensor self_buffer = get_buffer(self);
@@ -50,7 +100,8 @@ Tensor NestedTensor_add_Tensor(
   }
   std::tie(self, other) = _expand_other_as(self_, other_);
   return map_nested_tensor(
-      [&alpha](Tensor s, Tensor o) { return at::add(s, o, alpha); },
+      [&alpha](Tensor s, Tensor o) {
+        return at::add(s, o, alpha); },
       self,
       other);
 }
@@ -180,11 +231,64 @@ Tensor& NestedTensor_floor_divide_out(
 }

 Tensor NestedTensor_mul_Tensor(const Tensor& self_, const Tensor& other_) {
-  Tensor self;
-  Tensor other;
+  Tensor self = self_;
+  Tensor other = other_;
+  if (is_nested_tensor_impl(self) && !is_nested_tensor_impl(other)) {
+    self = NestedTensor_contiguous(self);
+    int64_t self_dim = get_dim(self);
+    auto self_opt_sizes = get_opt_sizes(self);
+#ifdef WITH_CUDA
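+    // Same fused CUDA fp16 fast path as in NestedTensor_add_Tensor above, but
+    // dispatching to the mul kernel: each channel of the contiguous nested
+    // buffer is scaled by the matching scalar of the dense (1, C, 1, 1) "other".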
+    if (self_dim == 4 && other.dim() == 4 &&
+        self_opt_sizes[0] &&
+        self_opt_sizes[1] &&
+        (*self_opt_sizes[1]) == other.size(1) &&
+        other.size(0) == 1 &&
+        other.size(2) == 1 &&
+        other.size(3) == 1 &&
+        self.dtype() == c10::ScalarType::Half &&
+        other.dtype() == c10::ScalarType::Half) {
+      other = other.contiguous();
+      at::Tensor self_buffer = get_buffer(self);
+      Tensor nt_sizes_ =
+          get_efficient_nested_size(self).sizes().to(torch::kInt32);
+      Tensor nt_sizes_1 = at::native::narrow(nt_sizes_, 1, 1, 1);
+      Tensor nt_sizes_2 = at::native::narrow(nt_sizes_, 1, 2, 1);
+      Tensor nt_sizes_all = nt_sizes_1 * nt_sizes_2;
+      std::vector<int> numbers;
+      for (int64_t i = 0; i < nt_sizes_all.size(0); i++) {
+        for (int64_t j = 0; j < *self_opt_sizes[1]; j++) {
+          numbers.push_back(nt_sizes_all[i].item<int>());
+        }
+      }
+      at::Tensor numbers_t = torch::tensor(numbers).to(torch::kInt32);
+      Tensor nt_sizes_cumsum =
+          at::native::cumsum(numbers_t, 0).to(torch::kInt32).reshape({-1});
+      TORCH_CHECK(nt_sizes_.dim() == 2, "NestedTensor metadata of unexpected dimension.");
+      Tensor nt_sizes = at::cat({torch::tensor({0}, torch::kInt32), nt_sizes_cumsum});
+      nt_sizes = nt_sizes.to(torch::kCUDA);
+      at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream();
+      at::Tensor result_buffer = self_buffer.clone();
+
+      c10::Half* self_ptr = self_buffer.data_ptr<c10::Half>();
+      c10::Half* other_ptr = other.data_ptr<c10::Half>();
+      c10::Half* result_ptr = result_buffer.data_ptr<c10::Half>();
+      nested_tensor::cuda::mul_scalar_kernelLauncher(
+          self_ptr,
+          other_ptr,
+          result_ptr,
+          (int)(*self_opt_sizes[0] * *self_opt_sizes[1]),
+          (int)(*self_opt_sizes[0]),
+          nt_sizes.data_ptr<int>(),
+          defaultStream);
+      return wrap_buffer(std::move(result_buffer), get_efficient_nested_size(self),
+          get_efficient_nested_stride(self));
+    }
+#endif
+  }
   std::tie(self, other) = _expand_other_as(self_, other_);
   return map_nested_tensor(
-      [](Tensor s, Tensor o) { return at::mul(s, o); }, self, other);
+      [](Tensor s, Tensor o) {
+        return at::mul(s, o); }, self, other);
 }

 Tensor& NestedTensor_mul__Tensor(Tensor& self_, const Tensor& other_) {
@@ -246,11 +350,64 @@ Tensor NestedTensor_sub_Tensor(
     const Tensor& self_,
     const Tensor& other_,
     const Scalar& alpha) {
-  Tensor self;
-  Tensor other;
+  Tensor self = self_;
+  Tensor other = other_;
+  if (is_nested_tensor_impl(self) && !is_nested_tensor_impl(other)) {
+    self = NestedTensor_contiguous(self);
+    int64_t self_dim = get_dim(self);
+    auto self_opt_sizes = get_opt_sizes(self);
+#ifdef WITH_CUDA
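+    // Same fused CUDA fp16 fast path as in NestedTensor_add_Tensor above, but
+    // dispatching to the sub kernel: one (1, C, 1, 1) scalar per channel is
+    // subtracted from the contiguous nested buffer (alpha is not consulted on
+    // this path, mirroring the add fast path).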
+    if (self_dim == 4 && other.dim() == 4 &&
+        self_opt_sizes[0] &&
+        self_opt_sizes[1] &&
+        (*self_opt_sizes[1]) == other.size(1) &&
+        other.size(0) == 1 &&
+        other.size(2) == 1 &&
+        other.size(3) == 1 &&
+        self.dtype() == c10::ScalarType::Half &&
+        other.dtype() == c10::ScalarType::Half) {
+      other = other.contiguous();
+      at::Tensor self_buffer = get_buffer(self);
+      Tensor nt_sizes_ =
+          get_efficient_nested_size(self).sizes().to(torch::kInt32);
+      Tensor nt_sizes_1 = at::native::narrow(nt_sizes_, 1, 1, 1);
+      Tensor nt_sizes_2 = at::native::narrow(nt_sizes_, 1, 2, 1);
+      Tensor nt_sizes_all = nt_sizes_1 * nt_sizes_2;
+      std::vector<int> numbers;
+      for (int64_t i = 0; i < nt_sizes_all.size(0); i++) {
+        for (int64_t j = 0; j < *self_opt_sizes[1]; j++) {
+          numbers.push_back(nt_sizes_all[i].item<int>());
+        }
+      }
+      at::Tensor numbers_t = torch::tensor(numbers).to(torch::kInt32);
+      Tensor nt_sizes_cumsum =
+          at::native::cumsum(numbers_t, 0).to(torch::kInt32).reshape({-1});
+      TORCH_CHECK(nt_sizes_.dim() == 2, "NestedTensor metadata of unexpected dimension.");
+      Tensor nt_sizes = at::cat({torch::tensor({0}, torch::kInt32), nt_sizes_cumsum});
+      nt_sizes = nt_sizes.to(torch::kCUDA);
+      at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream();
+      at::Tensor result_buffer = self_buffer.clone();
+
+      c10::Half* self_ptr = self_buffer.data_ptr<c10::Half>();
+      c10::Half* other_ptr = other.data_ptr<c10::Half>();
+      c10::Half* result_ptr = result_buffer.data_ptr<c10::Half>();
+      nested_tensor::cuda::sub_scalar_kernelLauncher(
+          self_ptr,
+          other_ptr,
+          result_ptr,
+          (int)(*self_opt_sizes[0] * *self_opt_sizes[1]),
+          (int)(*self_opt_sizes[0]),
+          nt_sizes.data_ptr<int>(),
+          defaultStream);
+      return wrap_buffer(std::move(result_buffer), get_efficient_nested_size(self),
+          get_efficient_nested_stride(self));
+    }
+#endif
+  }
   std::tie(self, other) = _expand_other_as(self_, other_);
   return map_nested_tensor(
-      [&alpha](Tensor s, Tensor o) { return at::sub(s, o, alpha); },
+      [&alpha](Tensor s, Tensor o) {
+        return at::sub(s, o, alpha); },
       self,
       other);
 }