diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp
index 52a9b47c12..de70331818 100644
--- a/core/runtime/TRTEngine.cpp
+++ b/core/runtime/TRTEngine.cpp
@@ -61,7 +61,8 @@ TRTEngine::TRTEngine(
     const Platform& target_platform,
     bool hardware_compatible,
     bool requires_output_allocator,
-    const std::string& serialized_metadata)
+    const std::string& serialized_metadata,
+    const ResourceAllocationStrategy resource_allocation_strategy)
     : TRTEngine(
           "deserialized_trt",
           serialized_engine,
@@ -71,7 +72,8 @@ TRTEngine::TRTEngine(
           target_platform,
           hardware_compatible,
           requires_output_allocator,
-          serialized_metadata) {}
+          serialized_metadata,
+          resource_allocation_strategy) {}
 
 TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
     : TRTEngine(
@@ -83,7 +85,10 @@ TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
           Platform(serialized_info[TARGET_PLATFORM_IDX]),
           static_cast<bool>(std::stoi(serialized_info[HW_COMPATIBLE_IDX])),
           static_cast<bool>(std::stoi(serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX])),
-          serialized_info[SERIALIZED_METADATA_IDX]) {}
+          serialized_info[SERIALIZED_METADATA_IDX],
+          (static_cast<bool>(std::stoi(serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX]))
+               ? ResourceAllocationStrategy::kDynamic
+               : ResourceAllocationStrategy::kStatic)) {}
 
 TRTEngine::TRTEngine(
     const std::string& mod_name,
@@ -94,7 +99,8 @@ TRTEngine::TRTEngine(
     const Platform& target_platform,
     bool hardware_compatible,
     bool requires_output_allocator,
-    const std::string& serialized_metadata) {
+    const std::string& serialized_metadata,
+    const ResourceAllocationStrategy resource_allocation_strategy) {
   TORCHTRT_CHECK(
       is_supported_on_current_platform(target_platform),
       "This engine was not built to run on this platform (built for: " << target_platform << ", current platform: "
@@ -124,7 +130,16 @@ TRTEngine::TRTEngine(
     cuda_engine->setWeightStreamingBudgetV2(budget_bytes);
   }
 
-  exec_ctx = make_trt<nvinfer1::IExecutionContext>(cuda_engine->createExecutionContext());
+  this->resource_allocation_strategy = resource_allocation_strategy;
+  LOG_DEBUG(
+      "Resource allocation strategy: "
+      << (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "Dynamic" : "Static"));
+  if (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic) {
+    this->exec_ctx =
+        make_trt<nvinfer1::IExecutionContext>(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED));
+  } else {
+    this->exec_ctx = make_trt<nvinfer1::IExecutionContext>(cuda_engine->createExecutionContext());
+  }
   TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context");
 
   runtime_states.old_cudagraphs = CUDAGRAPHS_MODE;
@@ -393,6 +408,7 @@ std::string TRTEngine::to_str() const {
   ss << "  Device: " << device_info << std::endl;
   ss << "  Hardware Compatibility: " << (hardware_compatible ? "Enabled" : "Disabled") << std::endl;
   ss << "  Target Platform: " << target_platform << std::endl;
+  ss << "  Resource Allocation Strategy: " << (resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "Dynamic" : "Static") << std::endl;
   // clang-format on
   return ss.str();
 }
@@ -436,7 +452,8 @@ FlattenedState TRTEngine::__obj_flatten__() {
       std::tuple("hardware_compatible", serialized_info[HW_COMPATIBLE_IDX]),
       std::tuple("serialized_metadata", serialized_info[SERIALIZED_METADATA_IDX]),
       std::tuple("requires_output_allocator", serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX]),
-      std::tuple("target_platform", serialized_info[TARGET_PLATFORM_IDX]));
+      std::tuple("target_platform", serialized_info[TARGET_PLATFORM_IDX]),
+      std::tuple("resource_allocation_strategy", serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX]));
 }
 
 std::vector<std::string> TRTEngine::serialize() {
@@ -459,6 +476,8 @@ std::vector<std::string> TRTEngine::serialize() {
   serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX] = this->requires_output_allocator ? "1" : "0";
   serialized_info[SERIALIZED_METADATA_IDX] = this->serialized_metadata;
   serialized_info[TARGET_PLATFORM_IDX] = this->target_platform.serialize();
+  serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX] =
+      this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "1" : "0";
 
   return serialized_info;
 }
@@ -467,6 +486,20 @@ void TRTEngine::reset_captured_graph() {
   cudagraph.reset();
 }
 
+void TRTEngine::set_resource_allocation_strategy(TRTEngine::ResourceAllocationStrategy new_strategy) {
+  if (new_strategy != this->resource_allocation_strategy) {
+    this->resource_allocation_strategy = new_strategy;
+    if (this->resource_allocation_strategy == TRTEngine::ResourceAllocationStrategy::kDynamic) {
+      LOG_DEBUG("Setting resource allocation strategy to dynamic");
+      this->exec_ctx =
+          make_trt<nvinfer1::IExecutionContext>(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED));
+    } else {
+      LOG_DEBUG("Setting resource allocation strategy to static");
+      this->exec_ctx = make_trt<nvinfer1::IExecutionContext>(cuda_engine->createExecutionContext());
+    }
+  }
+}
+
 } // namespace runtime
 } // namespace core
 } // namespace torch_tensorrt
diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h
index 15d723ce4e..2ed07f0bcc 100644
--- a/core/runtime/TRTEngine.h
+++ b/core/runtime/TRTEngine.h
@@ -29,7 +29,8 @@ using FlattenedState = std::tuple<
     std::tuple<std::string, std::string>, // HW compatibility
     std::tuple<std::string, std::string>, // requires_output_allocator
     std::tuple<std::string, std::string>, // serialized metadata
-    std::tuple<std::string, std::string>>; // Platform
+    std::tuple<std::string, std::string>, // Platform
+    std::tuple<std::string, std::string>>; // Resource Allocation Strategy
 
 struct TorchTRTRuntimeStates {
   // Indicates whether CUDAGraphs were enabled in the previous execute_engine
@@ -98,6 +99,8 @@ class DynamicOutputAllocator : public nvinfer1::IOutputAllocator {
 };
 
 struct TRTEngine : torch::CustomClassHolder {
+  // Resource Allocation Strategy
+  typedef enum { kStatic = 0, kDynamic } ResourceAllocationStrategy;
   // Each engine needs it's own runtime object
   std::shared_ptr<nvinfer1::IRuntime> rt;
   std::shared_ptr<nvinfer1::ICudaEngine> cuda_engine;
@@ -128,7 +131,9 @@ struct TRTEngine : torch::CustomClassHolder {
       const Platform& target_platform = get_current_platform(),
       bool hardware_compatible = false,
       bool requires_output_allocator = false,
-      const std::string& serialized_metadata = "");
+      const std::string& serialized_metadata = "",
+      const TRTEngine::ResourceAllocationStrategy resource_allocation_strategy =
+          TRTEngine::ResourceAllocationStrategy::kStatic);
 
   TRTEngine(std::vector<std::string> serialized_info);
 
@@ -141,7 +146,9 @@ struct TRTEngine : torch::CustomClassHolder {
       const Platform& target_platform = get_current_platform(),
       bool hardware_compatible = false,
       bool requires_output_allocator = false,
-      const std::string& serialized_metadata = "");
+      const std::string& serialized_metadata = "",
+      const TRTEngine::ResourceAllocationStrategy resource_allocation_strategy =
+          TRTEngine::ResourceAllocationStrategy::kStatic);
 
   TRTEngine& operator=(const TRTEngine& other);
   std::string to_str() const;
@@ -200,6 +207,9 @@ struct TRTEngine : torch::CustomClassHolder {
   std::string cuda_graph_debug_path;
   std::mutex mu;
   std::unique_ptr<TRTEngineProfiler> trt_engine_profiler;
+  ResourceAllocationStrategy resource_allocation_strategy = kStatic;
+  void set_resource_allocation_strategy(ResourceAllocationStrategy new_strategy);
+  ResourceAllocationStrategy get_resource_allocation_strategy();
 };
 
 } // namespace runtime
diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp
index 64b111750f..d36cc98c80 100644
--- a/core/runtime/execute_engine.cpp
+++ b/core/runtime/execute_engine.cpp
@@ -201,6 +201,12 @@ void create_output_allocator(c10::intrusive_ptr<TRTEngine> compiled_engine) {
 }
 
 std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
+  torch::Tensor dynamic_workspace;
+  if (compiled_engine->resource_allocation_strategy == TRTEngine::ResourceAllocationStrategy::kDynamic) {
+    dynamic_workspace = torch::empty(compiled_engine->cuda_engine->getDeviceMemorySizeV2(), {torch::kCUDA});
+    compiled_engine->exec_ctx->setDeviceMemory(dynamic_workspace.data_ptr());
+  }
+
   auto run_standard_execution = [&]() {
     bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
     bool shape_changed = _validate_shapes(inputs, compiled_engine);
diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp
index 173ff8c35f..b6f2d5b6ba 100644
--- a/core/runtime/register_jit_hooks.cpp
+++ b/core/runtime/register_jit_hooks.cpp
@@ -90,6 +90,13 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
         .def("get_engine_layer_info", &TRTEngine::get_engine_layer_info)
         .def("infer_outputs", &TRTEngine::infer_outputs)
         .def("reset_captured_graph", &TRTEngine::reset_captured_graph)
+        .def(
+            "use_dynamically_allocated_resources",
+            [](const c10::intrusive_ptr<TRTEngine>& self, bool dynamic) -> void {
+              self->set_resource_allocation_strategy(
+                  dynamic ? TRTEngine::ResourceAllocationStrategy::kDynamic
+                          : TRTEngine::ResourceAllocationStrategy::kStatic);
+            })
         .def_readwrite("use_pre_allocated_outputs", &TRTEngine::use_pre_allocated_outputs)
         .def_readwrite("use_output_allocator_outputs", &TRTEngine::use_output_allocator_outputs)
         .def_property(
@@ -102,6 +109,10 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
             [](const c10::intrusive_ptr<TRTEngine>& self) -> std::vector<std::string> { return self->serialize(); },
             [](std::vector<std::string> serialized_info) -> c10::intrusive_ptr<TRTEngine> {
               serialized_info[ENGINE_IDX] = base64_decode(serialized_info[ENGINE_IDX]);
+              LOG_DEBUG(
+                  "Deserialized resource allocation strategy: "
+                  << (static_cast<bool>(std::stoi(serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX])) ? "Dynamic"
+                                                                                                      : "Static"));
               TRTEngine::verify_serialization_fmt(serialized_info);
               return c10::make_intrusive<TRTEngine>(serialized_info);
             });
@@ -135,6 +146,7 @@ TORCH_LIBRARY(tensorrt, m) {
   m.def("TARGET_PLATFORM_IDX", []() -> int64_t { return TARGET_PLATFORM_IDX; });
   m.def("REQUIRES_OUTPUT_ALLOCATOR_IDX", []() -> int64_t { return REQUIRES_OUTPUT_ALLOCATOR_IDX; });
   m.def("SERIALIZATION_LEN", []() -> int64_t { return SERIALIZATION_LEN; });
+  m.def("RESOURCE_ALLOCATION_STRATEGY_IDX", []() -> int64_t { return RESOURCE_ALLOCATION_STRATEGY_IDX; });
   m.def("_platform_linux_x86_64", []() -> std::string {
     auto it = get_platform_name_map().find(Platform::PlatformEnum::kLINUX_X86_64);
     return it->second;
diff --git a/core/runtime/runtime.h b/core/runtime/runtime.h
index 894df55bfe..d8f71683d3 100644
--- a/core/runtime/runtime.h
+++ b/core/runtime/runtime.h
@@ -16,7 +16,7 @@ namespace core {
 namespace runtime {
 
 using EngineID = int64_t;
-const std::string ABI_VERSION = "7";
+const std::string ABI_VERSION = "8";
 extern bool MULTI_DEVICE_SAFE_MODE;
 
 typedef enum {
@@ -38,6 +38,7 @@ typedef enum {
   SERIALIZED_METADATA_IDX,
   TARGET_PLATFORM_IDX,
   REQUIRES_OUTPUT_ALLOCATOR_IDX,
+  RESOURCE_ALLOCATION_STRATEGY_IDX,
   SERIALIZATION_LEN, // NEVER USED FOR DATA, USED TO DETERMINE LENGTH OF SERIALIZED INFO
 } SerializedInfoIndex;
 
diff --git a/examples/dynamo/dynamic_memory_allocation.py b/examples/dynamo/dynamic_memory_allocation.py
new file mode 100644
index 0000000000..fe64e0e3b7
--- /dev/null
+++ b/examples/dynamo/dynamic_memory_allocation.py
@@ -0,0 +1,45 @@
+# %%
+import gc
+import time
+
+import numpy as np
+import torch
+import torch_tensorrt as torch_trt
+import torchvision.models as models
+
+np.random.seed(5)
+torch.manual_seed(5)
+inputs = [torch.rand((100, 3, 224, 224)).to("cuda")]
+
+settings = {
+    "ir": "dynamo",
+    "use_python_runtime": False,
+    "enabled_precisions": {torch.float32},
+    "immutable_weights": False,
+    "lazy_engine_init": True,
+    "dynamically_allocate_resources": True,
+}
+
+model = models.resnet152(pretrained=True).eval().to("cuda")
+compiled_module = torch_trt.compile(model, inputs=inputs, **settings)
+print((torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / 1024**3)
+compiled_module(*inputs)
+
+
+time.sleep(30)
+with torch_trt.dynamo.runtime.ResourceAllocationStrategy(
+    compiled_module, dynamically_allocate_resources=False
+):
+    print(
+        "Memory used (GB):",
+        (torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / 1024**3,
+    )
+    compiled_module(*inputs)
+    gc.collect()
+    torch.cuda.empty_cache()
+    time.sleep(30)
+    print(
+        "Memory used (GB):",
+        (torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / 1024**3,
+    )
+    compiled_module(*inputs)
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
index c8ad938032..bb99606d34 100644
--- a/py/torch_tensorrt/dynamo/_compiler.py
+++ b/py/torch_tensorrt/dynamo/_compiler.py
@@ -104,6 +104,7 @@ def cross_compile_for_windows(
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
     use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
+    dynamically_allocate_resources: bool = _defaults.DYNAMICALLY_ALLOCATE_RESOURCES,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows
@@ -178,6 +179,7 @@ def cross_compile_for_windows(
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
+        dynamically_allocate_resources (bool): Dynamically allocate resources during engine execution.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -333,6 +335,7 @@ def cross_compile_for_windows(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "use_distributed_mode_trace": use_distributed_mode_trace,
+        "dynamically_allocate_resources": dynamically_allocate_resources,
     }
 
     # disable the following settings is not supported for cross compilation for windows feature
@@ -434,6 +437,7 @@ def compile(
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
     use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
+    dynamically_allocate_resources: bool = _defaults.DYNAMICALLY_ALLOCATE_RESOURCES,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -511,6 +515,7 @@ def compile(
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
         use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
+        dynamically_allocate_resources (bool): Dynamically allocate resources during engine execution.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -680,6 +685,7 @@ def compile(
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
         "use_distributed_mode_trace": use_distributed_mode_trace,
+        "dynamically_allocate_resources": dynamically_allocate_resources,
     }
 
     settings = CompilationSettings(**compilation_options)
diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py
index de970ecd81..b58d0a528b 100644
--- a/py/torch_tensorrt/dynamo/_defaults.py
+++ b/py/torch_tensorrt/dynamo/_defaults.py
@@ -57,6 +57,7 @@
 L2_LIMIT_FOR_TILING = -1
 USE_DISTRIBUTED_MODE_TRACE = False
 OFFLOAD_MODULE_TO_CPU = False
+DYNAMICALLY_ALLOCATE_RESOURCES = False
 
 if platform.system() == "Linux":
     import pwd
diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py
index d8f6809eae..e9f5174e2c 100644
--- a/py/torch_tensorrt/dynamo/_settings.py
+++ b/py/torch_tensorrt/dynamo/_settings.py
@@ -11,6 +11,7 @@
     DLA_GLOBAL_DRAM_SIZE,
     DLA_LOCAL_DRAM_SIZE,
     DLA_SRAM_SIZE,
+    DYNAMICALLY_ALLOCATE_RESOURCES,
     DRYRUN,
     ENABLE_CROSS_COMPILE_FOR_WINDOWS,
     ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
@@ -97,6 +98,8 @@ class CompilationSettings:
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
+        offload_module_to_cpu (bool): Offload the model to CPU to reduce memory footprint during compilation
+        dynamically_allocate_resources (bool): Dynamically allocate resources for TensorRT engines
     """
 
     enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS)
@@ -140,6 +143,7 @@ class CompilationSettings:
     l2_limit_for_tiling: int = L2_LIMIT_FOR_TILING
     use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE
     offload_module_to_cpu: bool = OFFLOAD_MODULE_TO_CPU
+    dynamically_allocate_resources: bool = DYNAMICALLY_ALLOCATE_RESOURCES
 
     def __getstate__(self) -> dict[str, Any]:
         from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
diff --git a/py/torch_tensorrt/dynamo/runtime/_ResourceAllocator.py b/py/torch_tensorrt/dynamo/runtime/_ResourceAllocator.py
new file mode 100644
index 0000000000..3e570f4d78
--- /dev/null
+++ b/py/torch_tensorrt/dynamo/runtime/_ResourceAllocator.py
@@ -0,0 +1,36 @@
+from typing import Any
+
+import torch
+
+
+class ResourceAllocationStrategy(torch.nn.Module):  # type: ignore[misc]
+    """
+    ResourceAllocationStrategy is a context manager module that temporarily switches the resource
+    allocation strategy of all TRT submodules of the given compiled_module. When entering the context,
+    it applies the requested resource allocation mode to these submodules. Upon exiting, it switches
+    them back to the opposite resource allocation mode.
+    """
+
+    def __init__(
+        self,
+        compiled_module: torch.nn.Module,
+        dynamically_allocate_resources: bool = True,
+    ) -> None:
+        super(ResourceAllocationStrategy, self).__init__()
+        self.compiled_module = compiled_module
+        self.dynamically_allocate_resources = dynamically_allocate_resources
+
+    def __enter__(self) -> None:
+        print("Entering resource allocator context")
+        for name, submodule in self.compiled_module.named_modules():
+            if "_run_on_acc" in name:
+                submodule.use_dynamically_allocated_resources(
+                    dynamically_allocate_resources=self.dynamically_allocate_resources
+                )
+
+    def __exit__(self, exc_type: Any, exc_value: Any, exc_tb: Any) -> None:
+        for name, submodule in self.compiled_module.named_modules():
+            if "_run_on_acc" in name:
+                submodule.use_dynamically_allocated_resources(
+                    dynamically_allocate_resources=not self.dynamically_allocate_resources
+                )
diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
index 95f1581881..bba5ca8c92 100644
--- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
@@ -50,7 +50,10 @@
     REQUIRES_OUTPUT_ALLOCATOR_IDX = (
         torch.ops.tensorrt.REQUIRES_OUTPUT_ALLOCATOR_IDX()
     )  # 9
-    SERIALIZATION_LEN = torch.ops.tensorrt.SERIALIZATION_LEN()  # 10
+    RESOURCE_ALLOCATION_STRATEGY_IDX = (
+        torch.ops.tensorrt.RESOURCE_ALLOCATION_STRATEGY_IDX()
+    )  # 10
+    SERIALIZATION_LEN = torch.ops.tensorrt.SERIALIZATION_LEN()  # 11
 
 
 @for_all_methods(needs_torch_tensorrt_runtime)
@@ -139,6 +142,7 @@ def __init__(
         self.serialized_engine = serialized_engine
         self.engine = None
         self.requires_output_allocator = requires_output_allocator
+        self.dynamically_allocate_resources = settings.dynamically_allocate_resources
 
         if (
             serialized_engine
@@ -184,6 +188,12 @@ def _pack_engine_info(self) -> List[str | bytes]:
         engine_info[REQUIRES_OUTPUT_ALLOCATOR_IDX] = str(
             int(self.requires_output_allocator)
         )
+        logger.info(
+            f"PROVIDED RESOURCE ALLOCATION STRATEGY: {self.dynamically_allocate_resources}"
+        )
+        engine_info[RESOURCE_ALLOCATION_STRATEGY_IDX] = str(
+            int(self.dynamically_allocate_resources)
+        )
 
         return engine_info
 
@@ -212,6 +222,14 @@ def set_device_memory_budget(self, budget_bytes: int) -> int:
 
     def _reset_captured_graph(self) -> None:
         self.engine.reset_captured_graph()
+
+    def use_dynamically_allocated_resources(
+        self, dynamically_allocate_resources: bool = False
+    ) -> None:
+        self.dynamically_allocate_resources = dynamically_allocate_resources
+        self.engine.use_dynamically_allocated_resources(
+            self.dynamically_allocate_resources
+        )
 
     def setup_engine(self) -> None:
         """
         Setup engine for a module which has deferred engine setup.
diff --git a/py/torch_tensorrt/dynamo/runtime/__init__.py b/py/torch_tensorrt/dynamo/runtime/__init__.py
index de47d942e9..0eb66b24b0 100644
--- a/py/torch_tensorrt/dynamo/runtime/__init__.py
+++ b/py/torch_tensorrt/dynamo/runtime/__init__.py
@@ -2,6 +2,9 @@
 from torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule import (  # noqa: F401
     PythonTorchTensorRTModule,
 )
+from torch_tensorrt.dynamo.runtime._ResourceAllocator import (  # noqa: F401
+    ResourceAllocationStrategy,
+)
 from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import (  # noqa: F401
     TorchTensorRTModule,
 )
diff --git a/tests/py/dynamo/runtime/test_005_dynamic_allocation.py b/tests/py/dynamo/runtime/test_005_dynamic_allocation.py
new file mode 100644
index 0000000000..efdc13c284
--- /dev/null
+++ b/tests/py/dynamo/runtime/test_005_dynamic_allocation.py
@@ -0,0 +1,52 @@
+import importlib
+import unittest
+
+import torch
+import torch.nn.functional as F
+import torch_tensorrt as torch_trt
+from torch import nn
+from torch.testing._internal.common_utils import TestCase
+from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity
+
+assertions = unittest.TestCase()
+
+
+class TestDynamicAllocation(TestCase):
+    def test_dynamic_allocation(self):
+
+        class net(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv1 = nn.Conv2d(3, 6, 3, 1)
+                self.conv2 = nn.Conv2d(6, 16, 3, 1)
+
+            def forward(self, x):
+                x = F.relu(self.conv1(x))
+                x = F.relu(self.conv2(x))
+                return x
+
+        inputs = [torch.rand((100, 3, 224, 224)).to("cuda")]
+
+        settings = {
+            "ir": "dynamo",
+            "use_python_runtime": False,
+            "enabled_precisions": {torch.float32},
+            "immutable_weights": False,
+            "lazy_engine_init": True,
+            "dynamically_allocate_resources": True,
+        }
+
+        model = net().eval().to("cuda")
+        compiled_module = torch_trt.compile(model, inputs=inputs, **settings)
+        compiled_module(*inputs)
+
+        # Inference on PyTorch model
+        model_output = model(*inputs)
+        cos_sim = cosine_similarity(model_output, compiled_module(*inputs))
+        assertions.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"Dynamically allocated TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
+
+        # Clean up model env
+        torch._dynamo.reset()
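For reviewers who want to exercise the change end to end, below is a condensed usage sketch of the user-facing APIs added in this diff (the dynamically_allocate_resources compile option and the ResourceAllocationStrategy context manager). It mirrors examples/dynamo/dynamic_memory_allocation.py; the model choice, batch size, and input shape are illustrative only and assume a CUDA-enabled build with this PR applied.

    # Sketch only: requires torch, torchvision, and torch_tensorrt with this PR.
    import torch
    import torch_tensorrt as torch_trt
    import torchvision.models as models

    model = models.resnet18().eval().to("cuda")  # weights are irrelevant to the allocation behavior
    inputs = [torch.rand((8, 3, 224, 224)).to("cuda")]

    # With dynamically_allocate_resources=True, the C++ runtime creates the execution
    # context with kUSER_MANAGED device memory and execute_engine allocates a workspace
    # tensor per call instead of holding it for the engine's lifetime.
    trt_module = torch_trt.compile(
        model,
        ir="dynamo",
        inputs=inputs,
        use_python_runtime=False,
        dynamically_allocate_resources=True,
    )
    trt_module(*inputs)

    # Temporarily switch the TRT submodules back to static (engine-owned) allocation,
    # e.g. for a latency-sensitive section; the opposite strategy is restored on exit.
    with torch_trt.dynamo.runtime.ResourceAllocationStrategy(
        trt_module, dynamically_allocate_resources=False
    ):
        trt_module(*inputs)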