@@ -96,7 +96,8 @@ void setup_input_tensors(
9696 std::vector<at::Tensor> inputs,
9797 c10::intrusive_ptr<TRTEngine> compiled_engine,
9898 bool cudagraphs_enabled,
99- bool need_cudagraphs_record) {
99+ bool need_cudagraphs_record,
100+ bool shape_changed) {
100101 // this is a buffer to store shape tensor input addresses throughout the runtime scope
101102 std::list<std::vector<int64_t >> inputShapeTensorValues;
102103 std::list<at::Tensor> formatted_inputs (compiled_engine->num_io .first );
@@ -117,7 +118,7 @@ void setup_input_tensors(
117118 auto shape = core::util::toVec (dims);
118119 LOG_DEBUG (" Input Name: " << name << " Shape: " << dims);
119120
120- if (compiled_engine->cuda_engine -> isShapeInferenceIO ( name. c_str ()) ) {
121+ if (compiled_engine->isShapeInferenceIO [ name] ) {
121122 // Shape tensor inputs are casted to int64 explicitly.
122123 // Refer to
123124 // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
@@ -145,10 +146,10 @@ void setup_input_tensors(
145146 // Create a new persistent input buffer
146147 compiled_engine->input_buffers [i] = std::move (formatted_inputs.back ().clone ());
147148 }
148-
149- TORCHTRT_CHECK (
150- compiled_engine->exec_ctx ->setInputShape (name.c_str (), dims), " Error while setting the input shape" );
151-
149+ if (shape_changed) {
150+ TORCHTRT_CHECK (
151+ compiled_engine->exec_ctx ->setInputShape (name.c_str (), dims), " Error while setting the input shape" );
152+ }
152153 if (cudagraphs_enabled) {
153154 // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
154155 compiled_engine->input_buffers [i].copy_ (formatted_inputs.back (), true );
@@ -217,7 +218,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
217218 compiled_engine->cudagraph .reset ();
218219 }
219220
220- std::vector<at::Tensor> outputs (compiled_engine-> num_io . second ) ;
221+ std::vector<at::Tensor> outputs;
221222
222223 // Initialize inputs and outputs to be available throughout the succeeding scopes
223224 { // Input Setup
@@ -226,10 +227,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
226227 input_profiler_guard =
227228 std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path );
228229 }
229-
230- setup_input_tensors (inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
230+ setup_input_tensors (inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record, shape_changed);
231231 // Check if input shapes can be inferred.
232- int32_t const io_size{compiled_engine->cuda_engine -> getNbIOTensors () };
232+ int32_t const io_size{compiled_engine->io_size };
233233 std::vector<char const *> names (io_size);
234234 int32_t const nbNames = compiled_engine->exec_ctx ->inferShapes (names.size (), names.data ());
235235 TORCHTRT_CHECK (
@@ -240,6 +240,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
240240 }
241241
242242 { // Output Setup
243+ bool new_outputs = false ;
243244 std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
244245 if (compiled_engine->profile_execution ) {
245246 output_profiler_guard =
@@ -248,64 +249,60 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
248249 if (can_use_pre_allocated_outputs) {
249250 outputs = compiled_engine->pre_allocated_outputs ;
250251 } else {
251- outputs = create_output_tensors (compiled_engine);
252+ if (compiled_engine->allocated_outputs .size () == 0 or compiled_engine->requires_new_output_tensor or
253+ shape_changed) {
254+ compiled_engine->allocated_outputs = create_output_tensors (compiled_engine);
255+ new_outputs = true ;
256+ }
257+ outputs = compiled_engine->allocated_outputs ;
252258 }
253259
254- for (auto output_indices : compiled_engine->out_binding_map ) {
255- auto pyt_idx = output_indices.second ;
256- std::string name = compiled_engine->out_binding_names [pyt_idx];
257- if (need_cudagraphs_record) {
258- // If we are recording the cuda graph then we need to update the persistent output buffer
259- compiled_engine->output_buffers [pyt_idx] = std::move (outputs[pyt_idx].clone ());
260- }
260+ if (new_outputs) {
261+ for (auto output_indices : compiled_engine->out_binding_map ) {
262+ auto pyt_idx = output_indices.second ;
263+ std::string name = compiled_engine->out_binding_names [pyt_idx];
264+ if (need_cudagraphs_record) {
265+ // If we are recording the cuda graph then we need to update the persistent output buffer
266+ compiled_engine->output_buffers [pyt_idx] = std::move (outputs[pyt_idx].clone ());
267+ }
261268
262- if (cudagraphs_enabled) {
263- TORCHTRT_CHECK (
264- compiled_engine->exec_ctx ->setTensorAddress (
265- name.c_str (), compiled_engine->output_buffers [pyt_idx].data_ptr ()),
266- " Error while setting the output tensor address" );
267- } else {
268- TORCHTRT_CHECK (
269- compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), outputs[pyt_idx].data_ptr ()),
270- " Error while setting the output tensor address" );
269+ if (cudagraphs_enabled) {
270+ TORCHTRT_CHECK (
271+ compiled_engine->exec_ctx ->setTensorAddress (
272+ name.c_str (), compiled_engine->output_buffers [pyt_idx].data_ptr ()),
273+ " Error while setting the output tensor address" );
274+ } else {
275+ TORCHTRT_CHECK (
276+ compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), outputs[pyt_idx].data_ptr ()),
277+ " Error while setting the output tensor address" );
278+ }
271279 }
272280 }
273281 }
274282
275283 auto current_device_id = -1 ;
276284 if (inputs.size () > 0 ) {
277285 current_device_id = inputs[0 ].device ().index (); // Done this way to avoid a call to cudart
278- } else if (outputs.size () > 0 ) {
279- current_device_id = outputs[0 ].device ().index (); // Done this way to avoid a call to cudart
280- }
281-
282- compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream (current_device_id);
283- if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream (current_device_id)) {
284- // Create a new stream if the engine stream is the default stream
285- compiled_engine->engine_stream = c10::cuda::getStreamFromPool (false , current_device_id);
286+ if (current_device_id != compiled_engine->current_device_id ) {
287+ compiled_engine->stream = c10::cuda::getCurrentCUDAStream (current_device_id);
288+ }
286289 }
287290
288291 { // Engine Execution (execute on engine stream)
289- c10::cuda::CUDAStreamGuard stream_guard (compiled_engine->engine_stream );
290292
291293 std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
292294 if (compiled_engine->profile_execution ) {
293295 enqueue_profiler_guard =
294296 std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path );
295297 }
296298
297- // Block engine stream until results are available on caller stream
298- at::cuda::CUDAEvent caller_exec_complete;
299- caller_exec_complete.record (compiled_engine->caller_stream );
300- caller_exec_complete.block (compiled_engine->engine_stream );
301-
302299 if (!cudagraphs_enabled) {
303300 // Direct execution uses the caller buffers directly
304- compiled_engine->exec_ctx ->enqueueV3 (compiled_engine->engine_stream );
301+ compiled_engine->exec_ctx ->enqueueV3 (compiled_engine->stream );
305302 } else {
306303 if (need_cudagraphs_record) {
307304 // If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph
308- c10::cuda::CUDAStream recording_stream = compiled_engine->engine_stream ;
305+ c10::cuda::CUDAStream recording_stream = compiled_engine->stream ;
309306 compiled_engine->cudagraph .capture_begin ();
310307 compiled_engine->exec_ctx ->enqueueV3 (recording_stream);
311308 compiled_engine->cudagraph .capture_end ();
@@ -325,11 +322,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
325322 compiled_engine->pre_allocated_outputs = create_output_tensors (compiled_engine);
326323 }
327324
328- // Block caller stream until engine execution is complete
329- at::cuda::CUDAEvent trt_exec_complete;
330- trt_exec_complete.record (compiled_engine->engine_stream );
331- trt_exec_complete.block (compiled_engine->caller_stream );
332-
333325 if (cudagraphs_enabled) {
334326 // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
335327 for (size_t o = 0 ; o < compiled_engine->output_buffers .size (); o++) {
@@ -354,7 +346,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
354346 std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path );
355347 }
356348
357- setup_input_tensors (inputs, compiled_engine, false , false );
349+ setup_input_tensors (inputs, compiled_engine, false , false , true );
358350 // Check if input shapes can be inferred.
359351 int32_t const io_size{compiled_engine->cuda_engine ->getNbIOTensors ()};
360352 std::vector<char const *> names (io_size);
@@ -378,40 +370,24 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
378370 auto current_device_id = -1 ;
379371 if (inputs.size () > 0 ) {
380372 current_device_id = inputs[0 ].device ().index (); // Done this way to avoid a call to cudart
381- } else {
382- current_device_id = at::cuda::current_device ();
383- }
384-
385- compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream (current_device_id);
386- if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream (current_device_id)) {
387- // Create a new stream if the engine stream is the default stream
388- compiled_engine->engine_stream = c10::cuda::getStreamFromPool (false , current_device_id);
373+ if (current_device_id != compiled_engine->current_device_id ) {
374+ compiled_engine->stream = c10::cuda::getCurrentCUDAStream (current_device_id);
375+ }
389376 }
390377
391378 { // Engine Execution (execute on engine stream)
392- c10::cuda::CUDAStreamGuard stream_guard (compiled_engine->engine_stream );
393379
394380 std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
395381 if (compiled_engine->profile_execution ) {
396382 enqueue_profiler_guard =
397383 std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path );
398384 }
399385
400- // Block engine stream until results are available on caller stream
401- at::cuda::CUDAEvent caller_exec_complete;
402- caller_exec_complete.record (compiled_engine->caller_stream );
403- caller_exec_complete.block (compiled_engine->engine_stream );
404-
405386 // Direct execution uses the caller buffers directly
406- compiled_engine->exec_ctx ->enqueueV3 (compiled_engine->engine_stream );
387+ compiled_engine->exec_ctx ->enqueueV3 (compiled_engine->stream );
407388
408389 } // End engine execution (resets to caller stream)
409390
410- // Block caller stream until engine execution is complete
411- at::cuda::CUDAEvent trt_exec_complete;
412- trt_exec_complete.record (compiled_engine->engine_stream );
413- trt_exec_complete.block (compiled_engine->caller_stream );
414-
415391 std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
416392 if (compiled_engine->profile_execution ) {
417393 output_profiler_guard =
0 commit comments