This repository was archived by the owner on Nov 15, 2022. It is now read-only.

Commit 744434a

cpuhrsch authored and facebook-github-bot committed

20210522 nestedtensor import

Reviewed By: astaff
Differential Revision: D28624864
fbshipit-source-id: 441ad5b8af01918903d68818cfb90824b9467e37

1 parent: fcf06d2

18 files changed: +264 −174 lines

benchmarks/embedding.py
Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
+import torch
+import time
+import nestedtensor
+
+
+@torch.inference_mode()
+def benchmark_torch_function(iters, f, *args):
+    f(*args)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record()
+    else:
+        t0 = time.time()
+    for _ in range(iters):
+        f(*args)
+    if torch.cuda.is_available():
+        end_event.record()
+        torch.cuda.synchronize()
+        return start_event.elapsed_time(end_event) * 1e3
+    else:
+        return (time.time() - t0) * 1e6
+
+
+def run(bdim, embedding_dim, vocab_size, min_t, max_t, iters, device):
+    import random
+    random.seed(1010)
+
+    # The following is meant to emulate the lengths of randomly sampled tokenized sentences
+    lengths = [random.randint(min_t, max_t) for _ in range(bdim)]
+    lengths_mean = torch.tensor(lengths, dtype=torch.float).mean().item()
+    lengths_std = torch.tensor(lengths, dtype=torch.float).std().item()
+
+    # List of sentence embeddings
+    tensors = [torch.tensor(random.randint(1, vocab_size)) for i in lengths]
+    # Create packed NestedTensor
+    nt = nestedtensor.nested_tensor(tensors, device=device, dtype=torch.int64)
+    # Created regular padded Tensor
+    data, _ = nt.to_tensor_mask()
+    data = data.to(torch.int64)
+    # Amount of storage used for padding only
+    percentage_padded = 100 * (data.numel() - nt.numel()) / data.numel()
+
+    # Projects embeddings into another space
+    lin = torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=0).to(device)
+    nt_time = benchmark_torch_function(iters, lin, nt)
+    t_time = benchmark_torch_function(iters, lin, data)
+
+    print(f"batch size: {bdim:4.0f}, embedding dim: {embedding_dim}, vocab_size: {vocab_size}, T mean:{lengths_mean:5.0f}, T std: {lengths_std:4.0f}", end='')
+    print(f", padding: {percentage_padded:3.0f}%, NT: {nt_time/iters:4.0f}us, T: {t_time/iters:4.0f}us, Speedup: {t_time/nt_time:3.2f}x")
+
+
+device = torch.device('cpu')
+if torch.cuda.is_available():
+    print("CUDA device: ", torch.cuda.get_device_name(0))
+    device = torch.device('cuda')
+iters = 100
+for vocab_size in [65536, 32768, 16384, 8192, 4096]:
+    print("")
+    for embed_dim in [4096, 2048, 1024, 512, 256]:
+        print("")
+        for min_t, max_t in [(16, 128), (32, 128), (64, 128), (128, 128)]:
+            run(256, embed_dim, vocab_size, min_t, max_t, iters, device)
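To make the padded baseline used above concrete, here is a minimal sketch (not part of the commit) of what nestedtensor.nested_tensor followed by to_tensor_mask is expected to produce for a toy batch; the sequence contents and lengths are made up for illustration:

import torch
import nestedtensor

# Two "sentences" of different lengths, kept as a list of tensors.
seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
nt = nestedtensor.nested_tensor(seqs, dtype=torch.int64)   # packed, no padding stored
data, mask = nt.to_tensor_mask()                           # padded [2, 3] tensor plus a mask
# data is expected to be [[1, 2, 3], [4, 5, 0]]; mask marks which entries are real.
padding_waste = 100 * (data.numel() - nt.numel()) / data.numel()  # same metric the benchmark prints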

nestedtensor/csrc/autograd_functions.cpp
Lines changed: 4 additions & 4 deletions

@@ -92,21 +92,21 @@ Tensor NestedTensor_batch_norm(
   int64_t n_input = *opt_sizes[1];
   if (running_mean) {
     check_dims_match_num_input_features(
-        "running_mean", n_input, running_mean->numel());
+        "running_mean", n_input, get_numel(*running_mean));
   } else if (!training) {
     AT_ERROR("running_mean must be defined in evaluation mode");
   }
   if (running_var) {
     check_dims_match_num_input_features(
-        "running_var", n_input, running_var->numel());
+        "running_var", n_input, get_numel(*running_var));
   } else if (!training) {
     AT_ERROR("running_var must be defined in evaluation mode");
   }
   if (weight) {
-    check_dims_match_num_input_features("weight", n_input, weight->numel());
+    check_dims_match_num_input_features("weight", n_input, get_numel(*weight));
   }
   if (bias) {
-    check_dims_match_num_input_features("bias", n_input, bias->numel());
+    check_dims_match_num_input_features("bias", n_input, get_numel(*bias));
   }

   auto scalar_shape = make_scalar_shape(get_dim(input), n_input);

nestedtensor/csrc/cuda/padding.cu
Lines changed: 59 additions & 1 deletion

@@ -26,7 +26,7 @@ void add_padding(
 template<typename T>
 void add_padding_kernelLauncher(
     T* input, // [batch_size x None]
-    T* output, // [batch_size x max(input.nested_size(1))]
+    T* output, // [batch_size x max(input.nested_size(1)) x inner_size]
     const int* offsets, // [batch_size]
     const int batch_size,
     const int output_stride,
@@ -53,5 +53,63 @@ template void add_padding_kernelLauncher<float>(
     const int output_stride,
     const int inner_size,
     const cudaStream_t stream);
+
+template<typename T>
+__global__
+void add_padding_mask(
+    const T* input,
+    T* output,
+    int* output_mask,
+    const int* offsets,
+    const int batch_size,
+    const int mask_stride,
+    const int output_stride,
+    const int inner_size)
+{
+  const int batch_id = blockIdx.x;
+  for (int i = 0; i < (offsets[batch_id + 1] - offsets[batch_id]); i++) {
+    output_mask[batch_id * mask_stride + i] = 1;
+  }
+  for (int i = 0; i < (offsets[batch_id + 1] - offsets[batch_id]) * inner_size; i++) {
+    output[batch_id * output_stride + i] = input[offsets[batch_id] * inner_size + i];
+  }
+}
+
+template<typename T>
+void add_padding_mask_kernelLauncher(
+    T* input, // [batch_size x None]
+    T* output, // [batch_size x max(input.nested_size(1)) x inner_size]
+    int* output_mask, // [batch_size x max(input.nested_size(1))]
+    const int* offsets, // [batch_size]
+    const int batch_size,
+    const int mask_stride,
+    const int output_stride,
+    const int inner_size,
+    const cudaStream_t stream)
+{
+  dim3 grid;
+  grid.x = batch_size;
+
+  add_padding_mask<float><<<grid, 1, 0, stream>>>(
+      input,
+      output,
+      output_mask,
+      offsets,
+      batch_size,
+      mask_stride,
+      output_stride,
+      inner_size);
+}
+
+template void add_padding_mask_kernelLauncher<float>(
+    float* input,
+    float* output,
+    int* output_mask,
+    const int* offsets,
+    const int batch_size,
+    const int mask_stride,
+    const int output_stride,
+    const int inner_size,
+    const cudaStream_t stream);
 }
 }
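The new add_padding_mask kernel copies each packed sequence into a zero-initialized padded buffer and marks its valid positions in an integer mask. As a rough host-side reference (not part of the commit; the function name and plain-Python types are only for illustration), one block's work for batch entry b corresponds to:

def add_padding_mask_reference(input_flat, offsets, batch_size,
                               mask_stride, output_stride, inner_size):
    # offsets is an exclusive prefix sum of per-sequence lengths, so
    # offsets[b + 1] - offsets[b] is the length of sequence b.
    output = [0.0] * (batch_size * output_stride)
    output_mask = [0] * (batch_size * mask_stride)
    for b in range(batch_size):
        length = offsets[b + 1] - offsets[b]
        for i in range(length):
            output_mask[b * mask_stride + i] = 1  # mark real (non-padding) positions
        for i in range(length * inner_size):
            output[b * output_stride + i] = input_flat[offsets[b] * inner_size + i]
    return output, output_mask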

nestedtensor/csrc/cuda/padding.h
Lines changed: 13 additions & 0 deletions

@@ -15,5 +15,18 @@ void add_padding_kernelLauncher(
     const int output_stride,
     const int inner_size,
     const cudaStream_t stream);
+
+template <typename T>
+void add_padding_mask_kernelLauncher(
+    T* input,
+    T* output,
+    int* output_mask,
+    const int* lengths,
+    const int batch_size,
+    const int mask_stride,
+    const int output_stride,
+    const int inner_size,
+    const cudaStream_t stream);
+
 }
 } // namespace nested_tensor

nestedtensor/csrc/functions.cpp
Lines changed: 6 additions & 3 deletions

@@ -26,9 +26,12 @@ Tensor NestedTensor_embedding(
         weight,
         indices);
   }
-  if (is_nested_tensor_impl(indices) && get_is_contiguous(indices) &&
-      !is_nested_tensor_impl(weight) && get_dim(indices) == 2 &&
-      get_nested_dim(indices) == 1) {
+  if (is_nested_tensor_impl(indices) &&
+      !is_nested_tensor_impl(weight) &&
+      get_dim(indices) == 1 &&
+      get_dim(weight) == 2 &&
+      get_is_contiguous(indices) &&
+      get_is_contiguous(weight)) {
     Tensor indices_buffer = get_buffer(indices);
     Tensor result_buffer = at::embedding(
         weight, indices_buffer, padding_idx, scale_grad_by_freq, sparse);
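The rewritten condition routes contiguous NestedTensor indices with a plain 2-d weight straight to at::embedding on the packed index buffer. The sketch below (plain PyTorch, not from the commit; the names and sizes are illustrative) shows why that is equivalent: an embedding lookup is independent per index, so embedding the concatenated buffer and re-splitting matches embedding each sequence on its own.

import torch

weight = torch.randn(10, 4)                                   # vocab_size=10, embedding_dim=4
seqs = [torch.tensor([1, 2, 3]), torch.tensor([7, 8])]        # two variable-length index sequences

per_sequence = [torch.nn.functional.embedding(s, weight) for s in seqs]
packed = torch.nn.functional.embedding(torch.cat(seqs), weight)  # one call on the packed buffer
resplit = packed.split([len(s) for s in seqs])

assert all(torch.equal(a, b) for a, b in zip(per_sequence, resplit))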

nestedtensor/csrc/masking.cpp
Lines changed: 44 additions & 9 deletions

@@ -26,7 +26,7 @@ std::tuple<Tensor, Tensor> merge_tensor_mask(
   Tensor is_zero = (collapsed_mask == 0);
   int64_t is_last_size_sum = is_last_size.sum().item<int64_t>();
   int64_t is_zero_sum = is_zero.sum().item<int64_t>();
-  if ((is_last_size_sum + is_zero_sum) == collapsed_mask.numel()) {
+  if ((is_last_size_sum + is_zero_sum) == get_numel(collapsed_mask)) {
    collapsed_mask = collapsed_mask.to(torch::kBool);
    return merge_tensor_mask(tensor, collapsed_mask, mask_dim);
  }
@@ -85,7 +85,7 @@ std::vector<int64_t> get_max_size(Tensor nt) {
 
 std::tuple<Tensor, Tensor> pad_nt(Tensor nt, std::vector<int64_t> shape) {
   if (!is_nested_tensor_impl(nt)) {
-    if (nt.numel() == 0) {
+    if (get_numel(nt) == 0) {
      TORCH_CHECK(false, "Empty tensors are not yet supported.");
    }
    // Dont pad in case of a scalar
@@ -131,7 +131,7 @@ c10::optional<Tensor> nt_from_tensor_mask(
     Tensor mask,
     int64_t nested_dim) {
   if (nested_dim == 0) {
-    if ((mask.numel() == 0) || (mask.numel() == 1 && mask.item<bool>())) {
+    if ((get_numel(mask) == 0) || (get_numel(mask) == 1 && mask.item<bool>())) {
      return tensor;
    }
 
@@ -153,7 +153,7 @@ c10::optional<Tensor> nt_from_tensor_mask(
   bool all_zero = true;
   for (int64_t i = 0; i < mask.size(0); i++) {
     Tensor tmp = *nt_from_tensor_mask(tensor[i], mask[i], nested_dim);
-    if (tmp.numel() > 0) {
+    if (get_numel(tmp) > 0) {
      all_zero = false;
      tensors.push_back(tmp);
    }
@@ -172,12 +172,12 @@ c10::optional<Tensor> nt_from_tensor_mask(
     return c10::nullopt;
   }
   std::vector<c10::optional<Tensor>> inner_tensors;
-  if ((mask.numel() == 0) || (mask.numel() == 1 && mask.item<bool>())) {
+  if ((get_numel(mask) == 0) || (get_numel(mask) == 1 && mask.item<bool>())) {
    for (int64_t i = 0; i < tensor.size(0); i++) {
      inner_tensors.push_back(
          nt_from_tensor_mask(tensor[i], mask, nested_dim - 1));
    }
-  } else if (mask.numel() == 1 && !mask.item<bool>()) {
+  } else if (get_numel(mask) == 1 && !mask.item<bool>()) {
    inner_tensors.push_back(c10::nullopt);
  } else {
    for (int64_t i = 0; i < tensor.size(0); i++) {
@@ -198,6 +198,41 @@ c10::optional<Tensor> nt_from_tensor_mask(
 std::tuple<Tensor, Tensor> to_tensor_mask(
     Tensor nt,
     c10::optional<int64_t> mask_dim) {
+#ifdef WITH_CUDA
+  if (get_dim(nt) == 3 && get_is_contiguous(nt) && mask_dim && *mask_dim == 2) {
+    auto nt_opt_size = get_opt_sizes(nt);
+    Tensor nt_buffer = get_buffer(nt);
+    if (nt_opt_size[2] && nt_buffer.is_cuda()) {
+      std::cout << "Calling efficient to_tensor_mask" << std::endl;
+      Tensor nt_sizes_ =
+          get_efficient_nested_size(nt).sizes().to(torch::kInt32);
+      TORCH_CHECK(nt_sizes_.dim() == 2, "NestedTensor must be of nested_dim 2.")
+      Tensor nt_sizes = at::native::narrow(nt_sizes_, 1, 0, 1);
+      int max_size_1 = nt_sizes.max().item<int>();
+      nt_sizes =
+          at::native::cumsum(nt_sizes, 0).to(torch::kInt32).reshape({-1});
+      nt_sizes = at::cat({torch::tensor({0}, torch::kInt32), nt_sizes});
+      Tensor output = torch::zeros(
+          {*nt_opt_size[0], max_size_1, *nt_opt_size[2]}, nt_buffer.options());
+      nt_sizes = nt_sizes.to(torch::kCUDA);
+      Tensor output_mask = torch::zeros(
+          {*nt_opt_size[0], max_size_1}, nt_buffer.options());
+      output_mask = output_mask.to(torch::kInt32);
+      at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream();
+      nested_tensor::cuda::add_padding_mask_kernelLauncher(
+          nt_buffer.data_ptr<float>(),
+          output.data_ptr<float>(),
+          output_mask.data_ptr<int>(),
+          nt_sizes.data_ptr<int>(),
+          *nt_opt_size[0],
+          output_mask.stride(0),
+          output.stride(0),
+          *nt_opt_size[2],
+          defaultStream);
+      return std::make_tuple(output, output_mask.to(torch::kBool));
+    }
+  }
+#endif
   TORCH_CHECK(
       !mask_dim || *mask_dim <= get_dim(nt),
       "Requested mask dimension ",
@@ -225,10 +260,10 @@ std::tuple<Tensor, Tensor> to_tensor_mask(
 
 Tensor to_padded_tensor(Tensor nt, double padding) {
 #ifdef WITH_CUDA
-  if (get_dim(nt) == 3) {
+  if (get_dim(nt) == 3 && get_is_contiguous(nt)) {
     auto nt_opt_size = get_opt_sizes(nt);
-    if (nt_opt_size[2]) {
-      Tensor nt_buffer = get_buffer(nt);
+    Tensor nt_buffer = get_buffer(nt);
+    if (nt_opt_size[2] && nt_buffer.is_cuda()) {
      Tensor nt_sizes_ =
          get_efficient_nested_size(nt).sizes().to(torch::kInt32);
      TORCH_CHECK(nt_sizes_.dim() == 2, "NestedTensor must be of nested_dim 2.")
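Before launching add_padding_mask_kernelLauncher, the new branch derives per-sequence lengths from the efficient nested sizes, takes their maximum as the padded length, and builds a cumulative-sum offsets tensor with a leading zero. A small sketch of that host-side preparation (not from the commit; nested_sizes stands in for get_efficient_nested_size(nt).sizes() and the values are made up):

import torch

nested_sizes = torch.tensor([[3, 256], [5, 256], [2, 256]], dtype=torch.int32)  # one row per sequence
lengths = nested_sizes.narrow(1, 0, 1)                 # the varying (sequence-length) dimension
max_size_1 = int(lengths.max())                        # padded sequence length
offsets = torch.cumsum(lengths.reshape(-1), 0).to(torch.int32)
offsets = torch.cat([torch.tensor([0], dtype=torch.int32), offsets])
# offsets == [0, 3, 8, 10]; offsets[b] and offsets[b + 1] bracket sequence b in the packed buffer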

nestedtensor/csrc/nested_tensor_impl.cpp
Lines changed: 0 additions & 7 deletions

@@ -344,18 +344,11 @@ Tensor NestedTensor_unsqueeze(const Tensor& self, int64_t dim) {
   return wrap_tensor_node(TensorNode(std::move(result_nodes)));
 }
 
-Tensor NestedTensor_serialize_nested_size(const Tensor& tensor) {
-  auto nt_impl = get_nested_tensor_impl(tensor);
-  std::vector<int64_t> out;
-  return torch::tensor(torch::nested_tensor::serialize(nt_impl->nested_size()));
-}
-
 TORCH_LIBRARY_IMPL(aten, NestedTensor, m) {
   nt_impl(m, "contiguous", NestedTensor_contiguous);
   nt_impl(m, "copy_", NestedTensor_copy_);
   nt_impl(m, "is_pinned", NestedTensor_is_pinned);
   nt_impl(m, "select.int", NestedTensor_select);
-  nt_impl(m, "serialize_nested_size", NestedTensor_serialize_nested_size);
   nt_impl(m, "size.int", NestedTensor_size_int);
   nt_impl(m, "slice.Tensor", NestedTensor_slice);
   nt_impl(m, "squeeze", NestedTensor_squeeze);

nestedtensor/csrc/nested_tensor_impl.h
Lines changed: 3 additions & 6 deletions

@@ -206,10 +206,7 @@ inline int64_t get_dim(const at::Tensor& tensor) {
 
 inline int64_t get_numel(const at::Tensor& tensor) {
   if (is_nested_tensor_impl(tensor)) {
-    return reduce(
-        [](at::Tensor leaf, int64_t input) { return input + leaf.numel(); },
-        0,
-        get_nested_tensor_structure(tensor));
+    return get_nested_tensor_impl(tensor)->get_storage()->numel();
   }
   return tensor.numel();
 }
@@ -304,8 +301,8 @@ inline Tensor NestedTensor_to_sparse_csr(Tensor tensor) {
     col_indices_.push_back(torch::arange({tensor_sizes_ptr[i]}));
   }
   at::Tensor col_indices = at::cat(col_indices_);
-  return at::native::sparse_csr_tensor(crow_indices, col_indices, values,
-      c10::nullopt, torch::kSparseCsr);
+  return at::native::sparse_csr_tensor(
+      crow_indices, col_indices, values, c10::nullopt, torch::kSparseCsr);
 }
 
 inline std::ostream& operator<<(
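The simplified get_numel reads the element count straight off the packed storage instead of folding over the leaf tensors. A tiny sketch of the invariant this relies on (not from the commit; the leaf shapes are made up): a contiguous NestedTensor keeps all leaf elements back to back in one buffer, so the buffer's numel equals the sum of the leaves' numels.

import torch

leaves = [torch.randn(3, 4), torch.randn(5, 4), torch.randn(2, 4)]
packed_buffer = torch.cat([t.reshape(-1) for t in leaves])   # roughly what the contiguous storage holds

assert packed_buffer.numel() == sum(t.numel() for t in leaves)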

nestedtensor/csrc/py_init.cpp
Lines changed: 0 additions & 16 deletions

@@ -249,22 +249,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     return _nested_helper(index, get_nested_size(self));
   });
 
-  m.def("serialize_nested_size", [](Tensor self) {
-    return serialize(get_nested_tensor_impl(self)->nested_size());
-  });
-
-  m.def("deserialize_nested_size", [](std::vector<int64_t> out) {
-    SizeNode nested_size = deserialize_size_node(out);
-    return py::cast(THPPythonNode(
-        map(
-            [](std::vector<int64_t> e) {
-              return py::reinterpret_steal<py::object>(
-                  THPSize_NewFromSizes(e.size(), e.data()));
-            },
-            nested_size),
-        "NestedSize"));
-  });
-
   m.def("nested_stride", [](Tensor self, c10::optional<int64_t> index_) {
     if (!index_) {
       return py::cast(THPPythonNode(

nestedtensor/csrc/storage/EfficientSizeNode.h
Lines changed: 3 additions & 0 deletions

@@ -135,6 +135,9 @@ struct EfficientSizeNode {
       return _structure[0];
     }
     if (_sizes.dim() > 0) {
+      if (_sizes.numel() == 0) {
+        return 0;
+      }
       Tensor nt_sizes = at::native::narrow(
           _sizes, 1 /* dim */, 0 /* start */, 1 /* length */);
       for (int64_t i = 1; i < _sizes.size(1); i++) {
