This repository was archived by the owner on Nov 15, 2022. It is now read-only.

Commit fcf06d2

cpuhrsch authored and facebook-github-bot committed

20210520 nestedtensor import

Reviewed By: NicolasHug
Differential Revision: D28575245
fbshipit-source-id: d4853e8914772a42037490e4f87e575b959ba6d4

1 parent 7a07b06 commit fcf06d2

File tree

6 files changed: +87 -44

benchmarks/linear.py
nestedtensor/csrc/BinaryOps.cpp
nestedtensor/csrc/storage/EfficientSizeNode.h
nestedtensor/csrc/storage/List.h
nestedtensor/csrc/storage/Packed.h
nestedtensor/csrc/storage/StorageBase.h

benchmarks/linear.py

Lines changed: 56 additions & 38 deletions
@@ -1,43 +1,61 @@
 import torch
+import time
 import nestedtensor
-import utils
-
-import random
-random.seed(1010)
-
-BDIM=10
-
-# Performance tanks hard for lots of small Tensors as expected
-RAND_INTS = [random.randint(100, 300) for _ in range(BDIM)]
-
-OUTDIM=256
-GOALDIM=512
-
-TENSORS0 = [torch.rand(i, OUTDIM).cuda() for i in RAND_INTS]
-
-def gen_t_linear():
-    nt0 = nestedtensor.nested_tensor(TENSORS0, device=torch.device('cuda'), dtype=torch.float)
-    data, _ = nt0.to_tensor_mask()
-    lin = torch.nn.Linear(OUTDIM, GOALDIM).cuda()
-
-    def t():
-        lin(data)
-    return t
 
 
 @torch.inference_mode()
-def gen_nt_linear():
-    nt0 = nestedtensor.nested_tensor(TENSORS0, device=torch.device('cuda'), dtype=torch.float)
-    lin = torch.nn.Linear(OUTDIM, GOALDIM).cuda()
-
-    def nt():
-        lin(nt0)
-        # print("nt0.size()")
-        # print(nt0.size())
-        # import sys; sys.exit(1)
-    return nt
-
-
-if __name__ == "__main__":
-    print(utils.benchmark_fn(gen_t_linear()))
-    print(utils.benchmark_fn(gen_nt_linear()))
+def benchmark_torch_function(iters, f, *args):
+    f(*args)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record()
+    else:
+        t0 = time.time()
+    for _ in range(iters):
+        f(*args)
+    if torch.cuda.is_available():
+        end_event.record()
+        torch.cuda.synchronize()
+        return start_event.elapsed_time(end_event)
+    else:
+        return (time.time() - t0) * 1e3
+
+
+def run(bdim, embedding_dim, out_dim, min_t, max_t, iters, device):
+    import random
+    random.seed(1010)
+
+    # The following is meant to emulate the lengths of randomly sampled tokenized sentences
+    lengths = [random.randint(min_t, max_t) for _ in range(bdim)]
+    lengths_mean = torch.tensor(lengths, dtype=torch.float).mean().item()
+    lengths_std = torch.tensor(lengths, dtype=torch.float).std().item()
+
+    # List of sentence embeddings
+    tensors = [torch.rand(i, embedding_dim) for i in lengths]
+    # Create packed NestedTensor
+    nt = nestedtensor.nested_tensor(tensors, device=device, dtype=torch.float)
+    # Create regular padded Tensor
+    data = nt.to_padded_tensor(padding=0)
+    # Amount of storage used for padding only
+    percentage_padded = 100 * (data.numel() - nt.numel()) / data.numel()
+
+    # Project embeddings into another space
+    lin = torch.nn.Linear(embedding_dim, out_dim).to(device)
+    nt_time = benchmark_torch_function(iters, lin, nt)
+    t_time = benchmark_torch_function(iters, lin, data)
+
+    print(f"batch size: {bdim:4.0f}, embedding dim: {embedding_dim}, out_dim: {out_dim}, T mean:{lengths_mean:5.0f}, T std: {lengths_std:4.0f}", end='')
+    print(f", padding: {percentage_padded:3.0f}%, NT: {nt_time/iters:4.0f}ms, T: {t_time/iters:4.0f}ms, Speedup: {t_time/nt_time:3.2f}x")
+
+
+if torch.cuda.is_available():
+    print("CUDA device: ", torch.cuda.get_device_name(0))
+    iters = 10
+    for out_dim in [4096, 2048, 1024, 512, 256]:
+        print("")
+        for embed_dim in [4096, 2048, 1024, 512, 256]:
+            print("")
+            for min_t, max_t in [(16, 128), (32, 128), (64, 128), (128, 128)]:
+                run(256, embed_dim, out_dim, min_t, max_t, iters, torch.device('cuda'))
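The reported speedup comes from the packed layout doing no work on padding. A minimal plain-PyTorch sketch of the percentage_padded metric computed in run() above (toy lengths, no nestedtensor dependency; the names here are illustrative only):

import torch

lengths = [17, 42, 128]                  # toy sentence lengths
embedding_dim = 8
tensors = [torch.rand(t, embedding_dim) for t in lengths]

# Pad every sequence to the longest length, as to_padded_tensor(padding=0) does.
padded = torch.zeros(len(lengths), max(lengths), embedding_dim)
for i, t in enumerate(tensors):
    padded[i, : t.size(0)] = t

packed_numel = sum(t.numel() for t in tensors)
percentage_padded = 100 * (padded.numel() - packed_numel) / padded.numel()
print(f"padding overhead: {percentage_padded:.0f}%")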

nestedtensor/csrc/BinaryOps.cpp

Lines changed: 3 additions & 3 deletions
@@ -8,9 +8,8 @@ Tensor NestedTensor_add_Tensor(
     const Tensor& self_,
     const Tensor& other_,
     const Scalar& alpha) {
-  Tensor self;
-  Tensor other;
-  std::tie(self, other) = _expand_other_as(self_, other_);
+  Tensor self = self_;
+  Tensor other = other_;
   if (is_nested_tensor_impl(self) && is_nested_tensor_impl(other)) {
     EfficientSizeNode self_efficient_nested_size =
         get_efficient_nested_size(self);
@@ -49,6 +48,7 @@ Tensor NestedTensor_add_Tensor(
             get_efficient_nested_stride(self));
     }
   }
+  std::tie(self, other) = _expand_other_as(self_, other_);
   return map_nested_tensor(
       [&alpha](Tensor s, Tensor o) { return at::add(s, o, alpha); },
       self,
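For context, NestedTensor_add_Tensor backs elementwise addition on nested tensors; the change above defers _expand_other_as to the fallback path, so the nested-plus-nested fast path no longer pays for it. A hedged usage sketch of that fast path (assumes the prototype nestedtensor package is installed; shapes are illustrative):

import torch
import nestedtensor

# Two nested tensors with matching nested sizes hit the branch guarded by the
# is_nested_tensor_impl checks; _expand_other_as is only reached when no fast
# path applies.
a = nestedtensor.nested_tensor([torch.rand(3, 4), torch.rand(5, 4)])
b = nestedtensor.nested_tensor([torch.rand(3, 4), torch.rand(5, 4)])
c = a + b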

nestedtensor/csrc/storage/EfficientSizeNode.h

Lines changed: 19 additions & 3 deletions
@@ -40,7 +40,7 @@ inline std::vector<c10::optional<int64_t>> construct_efficient_size(
 }
 
 inline void _efficient_serialize(
-    SizeNode nested_node,
+    const SizeNode& nested_node,
     std::vector<int64_t>& out) {
   if (!nested_node.is_leaf()) {
     out.push_back(nested_node.degree());
@@ -50,7 +50,7 @@ inline void _efficient_serialize(
   }
 }
 
-inline std::vector<int64_t> efficient_serialize(SizeNode nested_node) {
+inline std::vector<int64_t> efficient_serialize(const SizeNode& nested_node) {
   std::vector<int64_t> out;
   _efficient_serialize(nested_node, out);
   return out;
@@ -85,7 +85,7 @@ inline SizeNode efficient_deserialize(
 } // namespace impl
 
 struct EfficientSizeNode {
-  explicit EfficientSizeNode(SizeNode size_node)
+  explicit EfficientSizeNode(const SizeNode& size_node)
       : _height(size_node.height()),
         _structure(impl::efficient_serialize(size_node)),
         _sizes(impl::stack_sizes(size_node)) {}
@@ -130,6 +130,22 @@ struct EfficientSizeNode {
   EfficientSizeNode clone() const {
     return EfficientSizeNode(_height, _structure, _sizes.clone());
   }
+  int64_t numel() const {
+    if (_sizes.dim() == 0 && _structure.size() > 0) {
+      return _structure[0];
+    }
+    if (_sizes.dim() > 0) {
+      Tensor nt_sizes = at::native::narrow(
+          _sizes, 1 /* dim */, 0 /* start */, 1 /* length */);
+      for (int64_t i = 1; i < _sizes.size(1); i++) {
+        Tensor tmp = at::native::narrow(
+            _sizes, 1 /* dim */, i /* start */, 1 /* length */);
+        nt_sizes = nt_sizes * tmp;
+      }
+      return nt_sizes.sum().item<int64_t>();
+    }
+    return 0;
+  }
 
  private:
   int64_t _height;
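The new EfficientSizeNode::numel() reads the stacked per-leaf size matrix (_sizes), multiplies the size columns row-wise via repeated narrow calls, and sums the products. A rough Python equivalent of that arithmetic (assuming _sizes is an N x ndim tensor of leaf sizes, which is what the loop over _sizes.size(1) implies):

import torch

# Each row is the size of one leaf tensor, e.g. sequences of lengths
# 3, 5 and 2 with embedding dimension 4.
sizes = torch.tensor([[3, 4], [5, 4], [2, 4]])

# Product across each row, then sum over rows: 12 + 20 + 8 = 40 elements.
numel = sizes.prod(dim=1).sum().item()
print(numel)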

nestedtensor/csrc/storage/List.h

Lines changed: 3 additions & 0 deletions
@@ -61,6 +61,9 @@ struct ListStorage : public NestedTensorStorage {
     return get_first_leaf(_structure) ? get_first_leaf(_structure)->is_cuda()
                                       : false;
   }
+  int64_t numel() const override {
+    return _nested_size.numel();
+  }
 
  private:
   TensorNode _structure;

nestedtensor/csrc/storage/Packed.h

Lines changed: 3 additions & 0 deletions
@@ -173,6 +173,9 @@ struct PackedStorage : public NestedTensorStorage {
   bool is_cuda() const override {
     return _buffer.is_cuda();
   }
+  int64_t numel() const override {
+    return _nested_size.numel();
+  }
 
  private:
   at::Tensor _buffer;

nestedtensor/csrc/storage/StorageBase.h

Lines changed: 3 additions & 0 deletions
@@ -41,6 +41,9 @@ struct NestedTensorStorage {
   virtual bool is_cuda() const {
     TORCH_CHECK(false, "Not Implemented.");
   }
+  virtual int64_t numel() const {
+    TORCH_CHECK(false, "Not Implemented.");
+  }
 };
 } // namespace nested_tensor
 } // namespace torch
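Together with the List.h and Packed.h overrides, this virtual exposes the element count of either storage kind to Python, which is what the benchmark's percentage_padded line relies on. A hedged sketch of the user-visible effect (assumes the prototype nestedtensor package; values follow the 3x4 and 5x4 example tensors):

import torch
import nestedtensor

nt = nestedtensor.nested_tensor([torch.rand(3, 4), torch.rand(5, 4)])
padded = nt.to_padded_tensor(padding=0)

print(nt.numel())      # 3*4 + 5*4 = 32, served by the new storage numel() overrides
print(padded.numel())  # 2*5*4 = 40, the padded buffer is larger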
