@@ -105,7 +105,6 @@ Tensor NestedTensor_batch_norm(
     check_dims_match_num_input_features("bias", n_input, get_numel(*bias));
   }
 
-  auto scalar_shape = make_scalar_shape(get_dim(input), n_input);
   at::Tensor mean = *running_mean;
   at::Tensor var = *running_var;
 #ifdef WITH_CUDA
@@ -120,46 +119,64 @@ Tensor NestedTensor_batch_norm(
       (mean.dtype() == torch::kHalf) &&
       (var.dtype() == torch::kHalf) &&
       (bias->dtype() == torch::kHalf) &&
-      (weight->dtype() == torch::kHalf)
+      (weight->dtype() == torch::kHalf) &&
+      get_is_cuda(input)
      )
   {
-
     // Custom CUDA Half implementation.
     mean = mean.contiguous();
     Tensor bias_cont = (*bias).contiguous();
     Tensor weight_cont = (*weight).contiguous();
     Tensor running_var_cont = (*running_var).contiguous();
+
+    c10::Half* mean_ptr = mean.data_ptr<c10::Half>();
+    c10::Half* bias_ptr = bias_cont.data_ptr<c10::Half>();
+    c10::Half* weight_ptr = weight_cont.data_ptr<c10::Half>();
+    c10::Half* running_var_ptr = running_var_cont.data_ptr<c10::Half>();
+
+    if (get_is_contiguous(input, c10::MemoryFormat::ChannelsLast)) {
+      Tensor input_buffer = get_buffer(input);
+      int64_t num_channel = weight_cont.size(0);
+      at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream();
+      nested_tensor::cuda::batchnorm_inference_channels_last_kernelLauncher(
+          input_buffer.data_ptr<c10::Half>(),
+          mean_ptr,
+          running_var_ptr,
+          c10::Half((float)(eps)),
+          weight_ptr,
+          bias_ptr,
+          input_buffer.data_ptr<c10::Half>(),
+          num_channel,
+          input_buffer.numel(),
+          defaultStream);
+      input_buffer = input_buffer.view(-1);
+      return wrap_buffer(std::move(input_buffer), get_efficient_nested_size(input), get_efficient_nested_stride(input));
+    }
 
     Tensor output = input;
     output = NestedTensor_contiguous(output);
     Tensor input_buffer = get_buffer(output);
-    Tensor output_buffer = input_buffer.clone();
+    // Tensor output_buffer = input_buffer.clone();
 
     auto self_opt_sizes = get_opt_sizes(input);
 
     Tensor nt_sizes_ =
-        get_efficient_nested_size(input).sizes().to(torch::kInt32);
+        get_efficient_nested_size(input).sizes(); // .to(torch::kInt32);
     Tensor nt_sizes_1 = at::native::narrow(nt_sizes_, 1, 1, 1);
     Tensor nt_sizes_2 = at::native::narrow(nt_sizes_, 1, 2, 1);
     Tensor nt_sizes_all = nt_sizes_1 * nt_sizes_2;
-    int* nt_sizes_all_ptr = nt_sizes_all.data_ptr<int>();
-    std::vector<int> numbers;
-    numbers.reserve(1 + (nt_sizes_all.size(0) * *self_opt_sizes[1]));
-    numbers.push_back(0);
+    int64_t* nt_sizes_all_ptr = nt_sizes_all.data_ptr<int64_t>();
+    at::Tensor numbers_t = at::empty({1 + (nt_sizes_all.size(0) * *self_opt_sizes[1])}, torch::kInt64);
+    int64_t* numbers_t_ptr = numbers_t.data_ptr<int64_t>();
+    numbers_t_ptr[0] = 0;
     int64_t index = 1;
     for (int64_t i = 0; i < nt_sizes_all.size(0); i++) {
       for (int64_t j = 0; j < *self_opt_sizes[1]; j++) {
-        numbers.push_back(numbers[index - 1] + nt_sizes_all_ptr[i]);
+        numbers_t_ptr[index] = (numbers_t_ptr[index - 1] + nt_sizes_all_ptr[i]);
         index++;
       }
     }
-    at::Tensor numbers_t = torch::tensor(numbers).to(torch::kInt32);
-    Tensor nt_sizes = numbers_t.to(torch::kCUDA);
-
-    c10::Half* mean_ptr = mean.data_ptr<c10::Half>();
-    c10::Half* running_var_ptr = running_var_cont.data_ptr<c10::Half>();
-    c10::Half* bias_ptr = bias_cont.data_ptr<c10::Half>();
-    c10::Half* weight_ptr = weight_cont.data_ptr<c10::Half>();
+    Tensor nt_sizes = numbers_t.to(at::Device(kCUDA), torch::kInt32, true, true);
 
     at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream();
     nested_tensor::cuda::batchnorm_inference_kernelLauncher(
@@ -169,15 +186,21 @@ Tensor NestedTensor_batch_norm(
         c10::Half((float)(eps)),
         weight_ptr,
         bias_ptr,
-        output_buffer.data_ptr<c10::Half>(),
-        (int)(*self_opt_sizes[0] * *self_opt_sizes[1]),
+        input_buffer.data_ptr<c10::Half>(),
+        // output_buffer.data_ptr<c10::Half>(),
         (int)(*self_opt_sizes[0]),
+        (int)(weight_cont.size(0)),
+        (int)(*self_opt_sizes[0] *
+              *self_opt_sizes[1] *
+              *self_opt_sizes[2] *
+              *self_opt_sizes[3]),
         nt_sizes.data_ptr<int>(),
         defaultStream
     );
-    return wrap_buffer(std::move(output_buffer), get_efficient_nested_size(output), get_efficient_nested_stride(output));
+    return wrap_buffer(std::move(input_buffer), get_efficient_nested_size(output), get_efficient_nested_stride(output));
   }
 #endif
+  auto scalar_shape = make_scalar_shape(get_dim(input), n_input);
 
   at::Tensor invstd = 1 / at::sqrt(*running_var + eps);
 
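
Aside (not part of the commit): the replacement of the std::vector<int> path above follows a common ATen pattern: preallocate the offset table as an int64 CPU tensor, fill it with a prefix sum of per-sample element counts, then move it to the GPU as int32 in one to() call. A minimal self-contained sketch under that assumption; the names build_offsets and per_sample_numel are illustrative and do not exist in the nestedtensor codebase, and running it requires a CUDA-enabled libtorch build.

// Illustrative sketch only, not code from this commit.
#include <ATen/ATen.h>
#include <vector>

at::Tensor build_offsets(const std::vector<int64_t>& per_sample_numel) {
  const int64_t n = static_cast<int64_t>(per_sample_numel.size());
  // Preallocate on CPU instead of growing a std::vector and calling torch::tensor().
  at::Tensor offsets = at::empty({n + 1}, at::kLong);
  int64_t* p = offsets.data_ptr<int64_t>();
  p[0] = 0;  // leading zero, like numbers_t_ptr[0] = 0 in the diff
  for (int64_t i = 0; i < n; i++) {
    p[i + 1] = p[i] + per_sample_numel[i];  // running prefix sum of element counts
  }
  // One call handles both the device move and the int64 -> int32 cast, mirroring
  // numbers_t.to(at::Device(kCUDA), torch::kInt32, true, true) above.
  return offsets.to(at::Device(at::kCUDA), at::kInt,
                    /*non_blocking=*/true, /*copy=*/true);
}

Note that non_blocking=true is best-effort for a host-to-device copy from ordinary (unpinned) CPU memory; the overload used is the same Tensor::to(Device, ScalarType, bool non_blocking, bool copy) the diff calls.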