@@ -104,8 +104,8 @@ void setup_input_tensors(
   for (size_t i = 0; i < inputs.size(); i++) {
     std::string name = compiled_engine->in_binding_names[i];

-    TORCHTRT_CHECK(
-        inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
+    // TORCHTRT_CHECK(
+    //     inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());

     auto expected_type =
         util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
@@ -202,30 +202,30 @@ void create_output_allocator(c10::intrusive_ptr<TRTEngine> compiled_engine) {

 std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
   auto run_standard_execution = [&]() {
-    bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
-    bool shape_changed = _validate_shapes(inputs, compiled_engine);
+    bool cudagraphs_enabled = false; // (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
+    bool shape_changed = false; // _validate_shapes(inputs, compiled_engine);

     // Whether cudagraphs needs to record the graph on this pass
     auto result = compiled_engine->runtime_states.set_runtime_states(
         cudagraphs_enabled, compiled_engine->use_pre_allocated_outputs, shape_changed);

-    bool need_cudagraphs_record = std::get<0>(result);
+    bool need_cudagraphs_record = false; // std::get<0>(result);
     bool can_use_pre_allocated_outputs = std::get<1>(result);
     bool need_cudagraphs_reset = std::get<2>(result);

-    if (need_cudagraphs_reset) {
-      compiled_engine->cudagraph.reset();
-    }
+    // if (need_cudagraphs_reset) {
+    //   compiled_engine->cudagraph.reset();
+    // }

-    std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
+    std::vector<at::Tensor> outputs;

     // Intialize inputs and outputs to be available throughout the succeeding scopes
     { // Input Setup
-      std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
-      if (compiled_engine->profile_execution) {
-        input_profiler_guard =
-            std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
-      }
+      // std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
+      // if (compiled_engine->profile_execution) {
+      //   input_profiler_guard =
+      //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
+      // }

       setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
       // Check if input shapes can be inferred.
@@ -240,72 +240,71 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     }

     { // Output Setup
-      std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
-      if (compiled_engine->profile_execution) {
-        output_profiler_guard =
-            std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
-      }
+      bool new_outputs = false;
+      // std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
+      // if (compiled_engine->profile_execution) {
+      //   output_profiler_guard =
+      //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
+      // }
       if (can_use_pre_allocated_outputs) {
         outputs = compiled_engine->pre_allocated_outputs;
       } else {
-        outputs = create_output_tensors(compiled_engine);
+        if (compiled_engine->allocated_outputs.size() == 0) {
+          compiled_engine->allocated_outputs = create_output_tensors(compiled_engine);
+          std::cout << "new_outputs" << std::endl;
+          new_outputs = true;
+        }
+        outputs = compiled_engine->allocated_outputs;
       }

-      for (auto output_indices : compiled_engine->out_binding_map) {
-        auto pyt_idx = output_indices.second;
-        std::string name = compiled_engine->out_binding_names[pyt_idx];
-        if (need_cudagraphs_record) {
-          // If we are recording the cuda graph then we need to update the persistent output buffer
-          compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
-        }
+      if (new_outputs) {
+        for (auto output_indices : compiled_engine->out_binding_map) {
+          auto pyt_idx = output_indices.second;
+          std::string name = compiled_engine->out_binding_names[pyt_idx];
+          if (need_cudagraphs_record) {
+            // If we are recording the cuda graph then we need to update the persistent output buffer
+            compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
+          }

-        if (cudagraphs_enabled) {
-          TORCHTRT_CHECK(
-              compiled_engine->exec_ctx->setTensorAddress(
-                  name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
-              "Error while setting the output tensor address");
-        } else {
-          TORCHTRT_CHECK(
-              compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
-              "Error while setting the output tensor address");
+          if (cudagraphs_enabled) {
+            TORCHTRT_CHECK(
+                compiled_engine->exec_ctx->setTensorAddress(
+                    name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
+                "Error while setting the output tensor address");
+          } else {
+            TORCHTRT_CHECK(
+                compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
+                "Error while setting the output tensor address");
+          }
         }
       }
     }

-    auto current_device_id = -1;
-    if (inputs.size() > 0) {
-      current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
-    } else if (outputs.size() > 0) {
-      current_device_id = outputs[0].device().index(); // Done this way to avoid a call to cudart
-    }
-
-    compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
-    if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) {
-      // Create a new stream if the engine stream is the default stream
-      compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
-    }
+    // auto current_device_id = -1;
+    // if (inputs.size() > 0) {
+    //   current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
+    //   if (current_device_id != compiled_engine->current_device_id) {
+    //     compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
+    //   }
+    // }

     { // Engine Execution (execute on engine stream)
-      c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);

-      std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
-      if (compiled_engine->profile_execution) {
-        enqueue_profiler_guard =
-            std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
-      }
+      // std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
+      // if (compiled_engine->profile_execution) {
+      //   enqueue_profiler_guard =
+      //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
+      // }
+

-      // Block engine stream until results are available on caller stream
-      at::cuda::CUDAEvent caller_exec_complete;
-      caller_exec_complete.record(compiled_engine->caller_stream);
-      caller_exec_complete.block(compiled_engine->engine_stream);

       if (!cudagraphs_enabled) {
         // Direct execution uses the caller buffers directly
-        compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
+        compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);
       } else {
         if (need_cudagraphs_record) {
           // If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph
-          c10::cuda::CUDAStream recording_stream = compiled_engine->engine_stream;
+          c10::cuda::CUDAStream recording_stream = compiled_engine->stream;
           compiled_engine->cudagraph.capture_begin();
           compiled_engine->exec_ctx->enqueueV3(recording_stream);
           compiled_engine->cudagraph.capture_end();
@@ -321,27 +320,22 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     } // End engine exeuction (resets to caller stream)

     // Create output buffer for next execution of graph or trt context.
-    if (compiled_engine->use_pre_allocated_outputs) {
-      compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
-    }
-
-    // Block caller stream until engine execution is complete
-    at::cuda::CUDAEvent trt_exec_complete;
-    trt_exec_complete.record(compiled_engine->engine_stream);
-    trt_exec_complete.block(compiled_engine->caller_stream);
-
-    if (cudagraphs_enabled) {
-      // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
-      for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
-        outputs[o].copy_(compiled_engine->output_buffers[o], false);
-      }
-    }
-
-    if (compiled_engine->profile_execution) {
-      LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler);
-      dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler);
-      compiled_engine->dump_engine_layer_info();
-    }
+    // if (compiled_engine->use_pre_allocated_outputs) {
+    //   compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
+    // }
+
+    // if (cudagraphs_enabled) {
+    //   // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
+    //   for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
+    //     outputs[o].copy_(compiled_engine->output_buffers[o], false);
+    //   }
+    // }
+
+    // if (compiled_engine->profile_execution) {
+    //   LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler);
+    //   dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler);
+    //   compiled_engine->dump_engine_layer_info();
+    // }

     return outputs;
   };
@@ -378,45 +372,31 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     auto current_device_id = -1;
     if (inputs.size() > 0) {
       current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
-    } else {
-      current_device_id = at::cuda::current_device();
-    }
+      if (current_device_id != compiled_engine->current_device_id) {
+        compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
+
+      }
+    }

-    compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
-    if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) {
-      // Create a new stream if the engine stream is the default stream
-      compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
-    }

     { // Engine Execution (execute on engine stream)
-      c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);
-
-      std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
-      if (compiled_engine->profile_execution) {
-        enqueue_profiler_guard =
-            std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
-      }

-      // Block engine stream until results are available on caller stream
-      at::cuda::CUDAEvent caller_exec_complete;
-      caller_exec_complete.record(compiled_engine->caller_stream);
-      caller_exec_complete.block(compiled_engine->engine_stream);
+      // std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
+      // if (compiled_engine->profile_execution) {
+      //   enqueue_profiler_guard =
+      //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
+      // }

       // Direct execution uses the caller buffers directly
-      compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
+      compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);

     } // End engine exeuction (resets to caller stream)

-    // Block caller stream until engine execution is complete
-    at::cuda::CUDAEvent trt_exec_complete;
-    trt_exec_complete.record(compiled_engine->engine_stream);
-    trt_exec_complete.block(compiled_engine->caller_stream);
-
-    std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
-    if (compiled_engine->profile_execution) {
-      output_profiler_guard =
-          std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
-    }
+    // std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
+    // if (compiled_engine->profile_execution) {
+    //   output_profiler_guard =
+    //       std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
+    // }
     std::vector<at::Tensor> outputs;
     for (size_t i = 0; i < compiled_engine->out_binding_names.size(); i++) {
       auto name = compiled_engine->out_binding_names[i];
@@ -476,45 +456,45 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->device_profile_path);
     }

-    RTDevice curr_device = get_current_device();
-    LOG_DEBUG("Current Device: " << curr_device);
-
-    // Generic Target Device Prefix
-    std::string target_device = "cuda:";
-
-    if (is_switch_required(curr_device, compiled_engine->device_info)) {
-      // Scan through available CUDA devices and set the CUDA device context correctly
-      RTDevice device =
-          select_rt_device(compiled_engine->device_info, curr_device, compiled_engine->hardware_compatible);
-      set_rt_device(device);
-
-      // Target device is new device
-      target_device += std::to_string(device.id);
-
-      for (auto& in : inputs) {
-        in = in.to(torch::Device(target_device));
-      }
-    } else {
-      // Target device is current device
-      target_device += std::to_string(curr_device.id);
-    }
-
-    // For each input, ensure its current device is the desired target device
-    for (size_t i = 0; i < inputs.size(); i++) {
-      at::Tensor* in = &inputs[i];
-      std::string current_tensor_device = in->device().str();
-
-      // If current device string does not match target device, display warning and move tensor accordingly
-      if (current_tensor_device != target_device) {
-        LOG_WARNING(
-            "Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device
-                     << " but should be on " << target_device << ". This tensor is being moved by the runtime but "
-                     << "for performance considerations, ensure your inputs are all on GPU "
-                     << "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this "
-                     << "warning persists.");
-        *in = in->to(torch::Device(target_device));
-      }
-    }
+    // RTDevice curr_device = get_current_device();
+    // LOG_DEBUG("Current Device: " << curr_device);
+
+    // // Generic Target Device Prefix
+    // std::string target_device = "cuda:";
+
+    // if (is_switch_required(curr_device, compiled_engine->device_info)) {
+    //   // Scan through available CUDA devices and set the CUDA device context correctly
+    //   RTDevice device =
+    //       select_rt_device(compiled_engine->device_info, curr_device, compiled_engine->hardware_compatible);
+    //   set_rt_device(device);
+
+    //   // Target device is new device
+    //   target_device += std::to_string(device.id);
+
+    //   for (auto& in : inputs) {
+    //     in = in.to(torch::Device(target_device));
+    //   }
+    // } else {
+    //   // Target device is current device
+    //   target_device += std::to_string(curr_device.id);
+    // }
+
+    // // For each input, ensure its current device is the desired target device
+    // for (size_t i = 0; i < inputs.size(); i++) {
+    //   at::Tensor* in = &inputs[i];
+    //   std::string current_tensor_device = in->device().str();
+
+    //   // If current device string does not match target device, display warning and move tensor accordingly
+    //   if (current_tensor_device != target_device) {
+    //     LOG_WARNING(
+    //         "Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device
+    //                  << " but should be on " << target_device << ". This tensor is being moved by the runtime but "
+    //                  << "for performance considerations, ensure your inputs are all on GPU "
+    //                  << "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this "
+    //                  << "warning persists.");
+    //     *in = in->to(torch::Device(target_device));
+    //   }
+    // }
   }

   if (compiled_engine->requires_output_allocator) { // engine requires OA
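
Note: the change above drops the caller_stream/engine_stream pair and its CUDAEvent record/block synchronization in favor of a single stream cached on the engine and refreshed only when the input device changes. A minimal C++ sketch of that pattern follows, using a hypothetical EngineState struct and enqueue_on_cached_stream helper; only the stream, current_device_id, and exec_ctx fields mirror the diff, and this is an illustration rather than the upstream TRTEngine API.

#include <ATen/core/Tensor.h>
#include <NvInferRuntime.h>
#include <c10/cuda/CUDAStream.h>

// Illustrative stand-in for the engine state touched by the diff (hypothetical type).
struct EngineState {
  nvinfer1::IExecutionContext* exec_ctx = nullptr; // owned by the real engine wrapper
  c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream();
  int64_t current_device_id = -1;
};

// Refresh the cached stream only when the input device changes, then enqueue
// directly on it; no separate engine stream or cross-stream events are used.
void enqueue_on_cached_stream(EngineState& eng, const at::Tensor& first_input) {
  auto device_id = first_input.device().index(); // avoids a cudart call
  if (device_id != eng.current_device_id) {
    eng.stream = c10::cuda::getCurrentCUDAStream(device_id);
    eng.current_device_id = device_id;
  }
  eng.exec_ctx->enqueueV3(eng.stream); // c10::cuda::CUDAStream converts to cudaStream_t
}

Enqueueing on the caller's current stream avoids a stream switch and two event record/block calls per execution, at the cost of no longer isolating TensorRT work on a dedicated engine stream.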