@@ -96,7 +96,8 @@ void setup_input_tensors(
9696 std::vector<at::Tensor> inputs,
9797 c10::intrusive_ptr<TRTEngine> compiled_engine,
9898 bool cudagraphs_enabled,
99- bool need_cudagraphs_record) {
99+ bool need_cudagraphs_record,
100+ bool shape_changed) {
100101 // this is a buffer to store shape tensor input addresses throughout the runtime scope
101102 std::list<std::vector<int64_t >> inputShapeTensorValues;
102103 std::list<at::Tensor> formatted_inputs (compiled_engine->num_io .first );
@@ -117,7 +118,7 @@ void setup_input_tensors(
117118 auto shape = core::util::toVec (dims);
118119 LOG_DEBUG (" Input Name: " << name << " Shape: " << dims);
119120
120- if (compiled_engine->cuda_engine -> isShapeInferenceIO ( name. c_str ()) ) {
121+ if (compiled_engine->isShapeInferenceIO [ name] ) {
121122 // Shape tensor inputs are cast to int64 explicitly.
122123 // Refer to
123124 // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
@@ -145,10 +146,10 @@ void setup_input_tensors(
145146 // Create a new persistent input buffer
146147 compiled_engine->input_buffers [i] = std::move (formatted_inputs.back ().clone ());
147148 }
148-
149- TORCHTRT_CHECK (
150- compiled_engine->exec_ctx ->setInputShape (name.c_str (), dims), " Error while setting the input shape" );
151-
149+ if (shape_changed) {
150+ TORCHTRT_CHECK (
151+ compiled_engine->exec_ctx ->setInputShape (name.c_str (), dims), " Error while setting the input shape" );
152+ }
152153 if (cudagraphs_enabled) {
153154 // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
154155 compiled_engine->input_buffers [i].copy_ (formatted_inputs.back (), true );
@@ -217,7 +218,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
217218 compiled_engine->cudagraph .reset ();
218219 }
219220
220- std::vector<at::Tensor> outputs (compiled_engine-> num_io . second ) ;
221+ std::vector<at::Tensor> outputs;
221222
222223 // Initialize inputs and outputs to be available throughout the succeeding scopes
223224 { // Input Setup
@@ -226,10 +227,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
226227 input_profiler_guard =
227228 std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path );
228229 }
229-
230- setup_input_tensors (inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
230+ setup_input_tensors (inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record, shape_changed);
231231 // Check if input shapes can be inferred.
232- int32_t const io_size{compiled_engine->cuda_engine -> getNbIOTensors () };
232+ int32_t const io_size{compiled_engine->io_size };
233233 std::vector<char const *> names (io_size);
234234 int32_t const nbNames = compiled_engine->exec_ctx ->inferShapes (names.size (), names.data ());
235235 TORCHTRT_CHECK (
@@ -240,6 +240,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
240240 }
241241
242242 { // Output Setup
243+ bool new_outputs = false ;
243244 std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
244245 if (compiled_engine->profile_execution ) {
245246 output_profiler_guard =
@@ -248,64 +249,59 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
248249 if (can_use_pre_allocated_outputs) {
249250 outputs = compiled_engine->pre_allocated_outputs ;
250251 } else {
251- outputs = create_output_tensors (compiled_engine);
252+ if (compiled_engine->allocated_outputs .size () == 0 or compiled_engine->unowned_output_tensor or shape_changed) {
253+ compiled_engine->allocated_outputs = create_output_tensors (compiled_engine);
254+ new_outputs = true ;
255+ }
256+ outputs = compiled_engine->allocated_outputs ;
252257 }
253258
254- for (auto output_indices : compiled_engine->out_binding_map ) {
255- auto pyt_idx = output_indices.second ;
256- std::string name = compiled_engine->out_binding_names [pyt_idx];
257- if (need_cudagraphs_record) {
258- // If we are recording the cuda graph then we need to update the persistent output buffer
259- compiled_engine->output_buffers [pyt_idx] = std::move (outputs[pyt_idx].clone ());
260- }
259+ if (new_outputs) {
260+ for (auto output_indices : compiled_engine->out_binding_map ) {
261+ auto pyt_idx = output_indices.second ;
262+ std::string name = compiled_engine->out_binding_names [pyt_idx];
263+ if (need_cudagraphs_record) {
264+ // If we are recording the cuda graph then we need to update the persistent output buffer
265+ compiled_engine->output_buffers [pyt_idx] = std::move (outputs[pyt_idx].clone ());
266+ }
261267
262- if (cudagraphs_enabled) {
263- TORCHTRT_CHECK (
264- compiled_engine->exec_ctx ->setTensorAddress (
265- name.c_str (), compiled_engine->output_buffers [pyt_idx].data_ptr ()),
266- " Error while setting the output tensor address" );
267- } else {
268- TORCHTRT_CHECK (
269- compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), outputs[pyt_idx].data_ptr ()),
270- " Error while setting the output tensor address" );
268+ if (cudagraphs_enabled) {
269+ TORCHTRT_CHECK (
270+ compiled_engine->exec_ctx ->setTensorAddress (
271+ name.c_str (), compiled_engine->output_buffers [pyt_idx].data_ptr ()),
272+ " Error while setting the output tensor address" );
273+ } else {
274+ TORCHTRT_CHECK (
275+ compiled_engine->exec_ctx ->setTensorAddress (name.c_str (), outputs[pyt_idx].data_ptr ()),
276+ " Error while setting the output tensor address" );
277+ }
271278 }
272279 }
273280 }
274281
275282 auto current_device_id = -1 ;
276283 if (inputs.size () > 0 ) {
277284 current_device_id = inputs[0 ].device ().index (); // Done this way to avoid a call to cudart
278- } else if (outputs.size () > 0 ) {
279- current_device_id = outputs[0 ].device ().index (); // Done this way to avoid a call to cudart
280- }
281-
282- compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream (current_device_id);
283- if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream (current_device_id)) {
284- // Create a new stream if the engine stream is the default stream
285- compiled_engine->engine_stream = c10::cuda::getStreamFromPool (false , current_device_id);
285+ if (current_device_id != compiled_engine->current_device_id ) {
286+ compiled_engine->stream = c10::cuda::getCurrentCUDAStream (current_device_id);
287+ }
286288 }
287289
288290 { // Engine Execution (execute on engine stream)
289- c10::cuda::CUDAStreamGuard stream_guard (compiled_engine->engine_stream );
290291
291292 std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
292293 if (compiled_engine->profile_execution ) {
293294 enqueue_profiler_guard =
294295 std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path );
295296 }
296297
297- // Block engine stream until results are available on caller stream
298- at::cuda::CUDAEvent caller_exec_complete;
299- caller_exec_complete.record (compiled_engine->caller_stream );
300- caller_exec_complete.block (compiled_engine->engine_stream );
301-
302298 if (!cudagraphs_enabled) {
303299 // Direct execution uses the caller buffers directly
304- compiled_engine->exec_ctx ->enqueueV3 (compiled_engine->engine_stream );
300+ compiled_engine->exec_ctx ->enqueueV3 (compiled_engine->stream );
305301 } else {
306302 if (need_cudagraphs_record) {
307303 // If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph
308- c10::cuda::CUDAStream recording_stream = compiled_engine->engine_stream ;
304+ c10::cuda::CUDAStream recording_stream = compiled_engine->stream ;
309305 compiled_engine->cudagraph .capture_begin ();
310306 compiled_engine->exec_ctx ->enqueueV3 (recording_stream);
311307 compiled_engine->cudagraph .capture_end ();
@@ -325,11 +321,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
325321 compiled_engine->pre_allocated_outputs = create_output_tensors (compiled_engine);
326322 }
327323
328- // Block caller stream until engine execution is complete
329- at::cuda::CUDAEvent trt_exec_complete;
330- trt_exec_complete.record (compiled_engine->engine_stream );
331- trt_exec_complete.block (compiled_engine->caller_stream );
332-
333324 if (cudagraphs_enabled) {
334325 // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
335326 for (size_t o = 0 ; o < compiled_engine->output_buffers .size (); o++) {
@@ -354,7 +345,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
354345 std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path );
355346 }
356347
357- setup_input_tensors (inputs, compiled_engine, false , false );
348+ setup_input_tensors (inputs, compiled_engine, false , false , true );
358349 // Check if input shapes can be inferred.
359350 int32_t const io_size{compiled_engine->cuda_engine ->getNbIOTensors ()};
360351 std::vector<char const *> names (io_size);
@@ -378,40 +369,24 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
378369 auto current_device_id = -1 ;
379370 if (inputs.size () > 0 ) {
380371 current_device_id = inputs[0 ].device ().index (); // Done this way to avoid a call to cudart
381- } else {
382- current_device_id = at::cuda::current_device ();
383- }
384-
385- compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream (current_device_id);
386- if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream (current_device_id)) {
387- // Create a new stream if the engine stream is the default stream
388- compiled_engine->engine_stream = c10::cuda::getStreamFromPool (false , current_device_id);
372+ if (current_device_id != compiled_engine->current_device_id ) {
373+ compiled_engine->stream = c10::cuda::getCurrentCUDAStream (current_device_id);
374+ }
389375 }
390376
391377 { // Engine Execution (execute on engine stream)
392- c10::cuda::CUDAStreamGuard stream_guard (compiled_engine->engine_stream );
393378
394379 std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
395380 if (compiled_engine->profile_execution ) {
396381 enqueue_profiler_guard =
397382 std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path );
398383 }
399384
400- // Block engine stream until results are available on caller stream
401- at::cuda::CUDAEvent caller_exec_complete;
402- caller_exec_complete.record (compiled_engine->caller_stream );
403- caller_exec_complete.block (compiled_engine->engine_stream );
404-
405385 // Direct execution uses the caller buffers directly
406- compiled_engine->exec_ctx ->enqueueV3 (compiled_engine->engine_stream );
386+ compiled_engine->exec_ctx ->enqueueV3 (compiled_engine->stream );
407387
408388 } // End engine execution (resets to caller stream)
409389
410- // Block caller stream until engine execution is complete
411- at::cuda::CUDAEvent trt_exec_complete;
412- trt_exec_complete.record (compiled_engine->engine_stream );
413- trt_exec_complete.block (compiled_engine->caller_stream );
414-
415390 std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
416391 if (compiled_engine->profile_execution ) {
417392 output_profiler_guard =
0 commit comments