Commit d6ed44e

reverted stream changes

1 parent edb8b43 commit d6ed44e

File tree

9 files changed: +286 −51 lines

core/runtime/TRTEngine.cpp

Lines changed: 14 additions & 0 deletions

@@ -237,6 +237,12 @@ TRTEngine::TRTEngine(
       out_binding_names[pyt_idx] = binding_name;
     }
     num_io = std::make_pair(inputs_size, outputs);
+
+    this->io_size = this->cuda_engine->getNbIOTensors();
+    for (int64_t i = 0; i < this->in_binding_names.size(); i++) {
+      this->isShapeInferenceIO[this->in_binding_names[i]] =
+          this->cuda_engine->isShapeInferenceIO(this->in_binding_names[i].c_str());
+    }
 }
 
 #ifndef NDEBUG

@@ -281,6 +287,14 @@ void TRTEngine::enable_profiling() {
   exec_ctx->setProfiler(trt_engine_profiler.get());
 }
 
+void TRTEngine::set_output_tensors_as_unowned(bool enable) {
+  this->output_tensors_are_unowned = enable;
+}
+
+bool TRTEngine::are_output_tensors_unowned() {
+  return this->output_tensors_are_unowned;
+}
+
 void TRTEngine::set_profile_format(std::string format) {
   if (format == "trex") {
     this->trt_engine_profiler->set_profile_format(TraceFormat::kTREX);
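The constructor change above hoists two TensorRT queries out of the per-inference hot path: the engine's I/O tensor count and the per-binding shape-inference check are now computed once and cached in the engine object. A minimal Python sketch of the same caching pattern, assuming only the standard TensorRT Python API (this is illustrative, not code from this commit):

import tensorrt as trt

def build_io_caches(engine: trt.ICudaEngine) -> tuple[int, dict[str, bool]]:
    # Query TensorRT once at setup time; execute-time code then reads these
    # caches instead of calling back into TensorRT on every forward pass.
    io_size = engine.num_io_tensors
    is_shape_inference_io = {}
    for i in range(io_size):
        name = engine.get_tensor_name(i)
        # True if TensorRT reads this tensor's values to infer shapes
        is_shape_inference_io[name] = engine.is_shape_inference_io(name)
    return io_size, is_shape_inference_io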

core/runtime/TRTEngine.h

Lines changed: 6 additions & 0 deletions

@@ -103,6 +103,9 @@ struct TRTEngine : torch::CustomClassHolder {
   std::shared_ptr<nvinfer1::ICudaEngine> cuda_engine;
   std::shared_ptr<nvinfer1::IExecutionContext> exec_ctx;
   std::pair<uint64_t, uint64_t> num_io;
+  uint64_t io_size;
+  std::map<std::string, bool> isShapeInferenceIO;
+  bool output_tensors_are_unowned = false;
   std::string name;
   RTDevice device_info;
 
@@ -159,6 +162,8 @@ struct TRTEngine : torch::CustomClassHolder {
   int64_t get_automatic_device_memory_budget();
   std::vector<at::Tensor> infer_outputs(std::vector<std::vector<int64_t>> input_shapes);
   void set_pre_allocated_outputs(bool enable);
+  void set_output_tensors_as_unowned(bool enable);
+  bool are_output_tensors_unowned();
   TorchTRTRuntimeStates runtime_states;
   friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
   static const char BINDING_DELIM = '%';

@@ -176,6 +181,7 @@ struct TRTEngine : torch::CustomClassHolder {
   std::string shape_key = "None";
   bool use_pre_allocated_outputs = false;
   std::vector<at::Tensor> pre_allocated_outputs;
+  std::vector<at::Tensor> allocated_outputs;
 
   // Output Allocator-Related Functionality
   bool requires_output_allocator = false; // engine requires output allocator

core/runtime/execute_engine.cpp

Lines changed: 32 additions & 25 deletions

@@ -117,7 +117,7 @@ void setup_input_tensors(
     auto shape = core::util::toVec(dims);
     LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
 
-    if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
+    if (compiled_engine->isShapeInferenceIO[name]) {
       // Shape tensor inputs are casted to int64 explicitly.
       // Refer to
       // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435

@@ -145,10 +145,10 @@ void setup_input_tensors(
         // Create a new persistent input buffer
         compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
       }
-
-      TORCHTRT_CHECK(
-          compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
-
+      if (need_cudagraphs_record) {
+        TORCHTRT_CHECK(
+            compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
+      }
       if (cudagraphs_enabled) {
         // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
         compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);

@@ -217,7 +217,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     compiled_engine->cudagraph.reset();
   }
 
-  std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
+  std::vector<at::Tensor> outputs;
 
   // Intialize inputs and outputs to be available throughout the succeeding scopes
   { // Input Setup

@@ -226,10 +226,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
       input_profiler_guard =
          std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
     }
-
     setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
     // Check if input shapes can be inferred.
-    int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
+    int32_t const io_size{compiled_engine->io_size};
     std::vector<char const*> names(io_size);
     int32_t const nbNames = compiled_engine->exec_ctx->inferShapes(names.size(), names.data());
     TORCHTRT_CHECK(

@@ -240,6 +239,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
   }
 
   { // Output Setup
+    bool new_outputs = false;
     std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
     if (compiled_engine->profile_execution) {
       output_profiler_guard =

@@ -248,26 +248,33 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     if (can_use_pre_allocated_outputs) {
       outputs = compiled_engine->pre_allocated_outputs;
     } else {
-      outputs = create_output_tensors(compiled_engine);
+      if (compiled_engine->allocated_outputs.size() == 0 or compiled_engine->output_tensors_are_unowned or
+          shape_changed) {
+        compiled_engine->allocated_outputs = create_output_tensors(compiled_engine);
+        new_outputs = true;
+      }
+      outputs = compiled_engine->allocated_outputs;
     }
 
-    for (auto output_indices : compiled_engine->out_binding_map) {
-      auto pyt_idx = output_indices.second;
-      std::string name = compiled_engine->out_binding_names[pyt_idx];
-      if (need_cudagraphs_record) {
-        // If we are recording the cuda graph then we need to update the persistent output buffer
-        compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
-      }
+    if (new_outputs) {
+      for (auto output_indices : compiled_engine->out_binding_map) {
+        auto pyt_idx = output_indices.second;
+        std::string name = compiled_engine->out_binding_names[pyt_idx];
+        if (need_cudagraphs_record) {
+          // If we are recording the cuda graph then we need to update the persistent output buffer
+          compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
+        }
 
-      if (cudagraphs_enabled) {
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setTensorAddress(
-                name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
-            "Error while setting the output tensor address");
-      } else {
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
-            "Error while setting the output tensor address");
+        if (cudagraphs_enabled) {
+          TORCHTRT_CHECK(
+              compiled_engine->exec_ctx->setTensorAddress(
+                  name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
+              "Error while setting the output tensor address");
+        } else {
+          TORCHTRT_CHECK(
+              compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
+              "Error while setting the output tensor address");
+        }
       }
     }
   }
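The rewritten output setup only reallocates and re-binds output tensors when it must. A hedged Python-style restatement of the C++ decision logic above (names mirror the C++ members; this is pseudocode, not a runtime API):

def select_outputs(engine):
    # Reallocate only when: nothing is cached yet, the caller may still hold
    # references to the previous outputs (unowned), or input shapes changed.
    new_outputs = False
    if (
        len(engine.allocated_outputs) == 0
        or engine.output_tensors_are_unowned
        or engine.shape_changed
    ):
        engine.allocated_outputs = engine.create_output_tensors()
        new_outputs = True  # setTensorAddress must be re-run on fresh storage
    return engine.allocated_outputs, new_outputs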

core/runtime/register_jit_hooks.cpp

Lines changed: 2 additions & 0 deletions

@@ -90,6 +90,8 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
         .def("get_engine_layer_info", &TRTEngine::get_engine_layer_info)
         .def("infer_outputs", &TRTEngine::infer_outputs)
         .def("reset_captured_graph", &TRTEngine::reset_captured_graph)
+        .def("set_output_tensors_as_unowned", &TRTEngine::set_output_tensors_as_unowned)
+        .def("are_output_tensors_unowned", &TRTEngine::are_output_tensors_unowned)
         .def_readwrite("use_pre_allocated_outputs", &TRTEngine::use_pre_allocated_outputs)
         .def_readwrite("use_output_allocator_outputs", &TRTEngine::use_output_allocator_outputs)
         .def_property(
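These two .def(...) lines expose the new setter and getter on the TorchScript custom class. A hedged usage sketch from Python, assuming trt_module.engine holds the registered custom-class instance:

engine = trt_module.engine  # custom-class handle (attribute path assumed)
engine.set_output_tensors_as_unowned(True)
assert engine.are_output_tensors_unowned()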

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 9 additions & 3 deletions

@@ -557,7 +557,7 @@ def compile(
             stacklevel=2,
         )
 
-    if kwargs.get("use_explicit_typing", False) == False:
+    if not kwargs.get("use_explicit_typing", False):
         warnings.warn(
             "`use_explicit_typing` is deprecated. This setting will be removed and you should enable autocast instead.",
             DeprecationWarning,

@@ -949,7 +949,7 @@ def preserve_module_specs(
     for attr in dir(gm):
         if attr.startswith("_frozen_param"):
             delattr(gm, attr)
-
+    trt_module = None
     for name, _ in partitioned_module.named_children():
         submodule = getattr(partitioned_module, name)
         # filter on the GraphModule

@@ -1070,14 +1070,20 @@ def preserve_module_specs(
             ) as f:
                 f.write(trt_module.get_layer_info())
 
+    # Only set the output_tensors_are_unowned flag on the last TRT module, since the user has access to its output tensors
+
     # Parse the graph I/O and store it in dryrun tracker
     parse_graph_io(gm, dryrun_tracker)
 
     # Replace all FX Modules with TRT Modules
     for name, trt_module in trt_modules.items():
         setattr(partitioned_module, name, trt_module)
         if settings.lazy_engine_init and not settings.enable_cross_compile_for_windows:
-            getattr(partitioned_module, name).setup_engine()
+            trt_module = getattr(partitioned_module, name)
+            trt_module.setup_engine()
+
+    if trt_module:
+        trt_module.set_output_tensors_as_unowned(True)
 
     # Reset settings object to user specification after fallback to global partitioning mode
     if fast_partitioner_failed:
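The compiler-side effect is user-visible: only the final TRT module in the partitioned graph is flagged, so the tensors handed back to the caller are freshly allocated on each call, while intermediate engines keep reusing their standing buffers. A hedged end-to-end sketch (MyModel and the input shape are placeholders, not from this commit):

import torch
import torch_tensorrt

model = MyModel().eval().cuda()  # placeholder user model
trt_gm = torch_tensorrt.compile(model, inputs=[torch.randn(1, 3, 224, 224).cuda()])

out_a = trt_gm(torch.randn(1, 3, 224, 224).cuda())
out_b = trt_gm(torch.randn(1, 3, 224, 224).cuda())
# out_a remains valid: the final module's outputs are unowned, so the second
# call wrote into new tensors instead of overwriting out_a in place.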

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 56 additions & 19 deletions

@@ -174,6 +174,7 @@ def __init__(
         self.cudagraph: Optional[torch.cuda.CUDAGraph] = None
         self._caller_stream: Optional[torch.cuda.Stream] = None
         self._engine_stream: Optional[torch.cuda.Stream] = None
+        self.output_tensors: Optional[List[torch.Tensor]] = None
 
         # TODO: Make the below a Dictionary {shape: cudagraph}
         self.shape_key: Optional[str] = None

@@ -218,10 +219,27 @@ def __init__(
         self.requires_output_allocator = requires_output_allocator
         self.output_allocator: Optional[DynamicOutputAllocator] = None
         self.use_output_allocator_outputs = False
-
+        self.device = torch.cuda.current_device()
+        self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
+        # If the output tensors are not owned by the engine (output_tensors_are_unowned=True), new output tensors must be created on each forward pass
+        self.output_tensors_are_unowned = False
         if self.serialized_engine is not None and not self.settings.lazy_engine_init:
             self.setup_engine()
 
+    def set_output_tensors_as_unowned(self, enabled: bool) -> None:
+        """
+        Set whether the output tensors of this engine are owned solely by the Torch-TensorRT runtime or may be shared with a user.
+        If the tensors are not owned by the runtime, they must be recreated on every forward call, which can affect performance.
+        Typically only the final engine in a graph needs unowned output tensors; intermediate engines gain performance by managing their own standing memory.
+        Therefore, set this to True only for the final module in a graph and leave it False for intermediate modules.
+
+        Args:
+            enabled: bool
+                Whether to set the flag to True.
+
+        """
+        self.output_tensors_are_unowned = enabled
+
     def get_streamable_device_memory_budget(self) -> Any:
         return self.engine.streamable_weights_size
 
@@ -288,16 +306,25 @@ def setup_engine(self) -> None:
             for output_name in self.output_names
         ]
         self.output_shapes = [
-            self.engine.get_tensor_shape(output_name)
+            tuple(self.context.get_tensor_shape(output_name))
             for output_name in self.output_names
         ]
 
+        self.shape_key = "".join(
+            str(tuple(t)).replace(" ", "") for t in self.input_shapes
+        )
+
         if self.requires_output_allocator:
             self.create_output_allocator()
 
         if torch_tensorrt.runtime.get_cudagraphs_mode():
             self.cudagraph = torch.cuda.CUDAGraph()
 
+        self.is_shape_inference_io = {
+            input_name: self.engine.is_shape_inference_io(input_name)
+            for input_name in self.input_names
+        }
+
     def _check_initialized(self) -> None:
         if not self.initialized:
             raise RuntimeError("PythonTorchTensorRTModule is not initialized.")

@@ -383,16 +410,17 @@ def setup_input_tensors(
 
             # For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers
             # as per TensorRT requirements
-            if self.engine.is_shape_inference_io(input_name):
+            if self.is_shape_inference_io[input_name]:
                 # Shape tensor inputs are casted to int64 explicitly
                 # Currently Torch CPU pointers are not working; numpy pointers are used instead
                 # to refer to underlying memory
                 inputs_cpu = contiguous_inputs[i].cpu().to(torch.int64).numpy().copy()
                 self.context.set_tensor_address(input_name, inputs_cpu.ctypes.data)
             else:
-                self.context.set_input_shape(
-                    input_name, tuple(contiguous_inputs[i].shape)
-                )
+                if need_cudagraphs_record:
+                    self.context.set_input_shape(
+                        input_name, tuple(contiguous_inputs[i].shape)
+                    )
                 if cudagraphs_enabled:
                     self._input_buffers[i].copy_(contiguous_inputs[i])
                     self.context.set_tensor_address(

@@ -411,7 +439,7 @@ def create_output_tensors(self) -> List[torch.Tensor]:
             output = torch.empty(
                 size=self.output_shapes[o],
                 dtype=self.output_dtypes[o],
-                device=torch.cuda.current_device(),
+                device=self.device,
             )
             outputs.append(output)
         return outputs

@@ -460,7 +488,9 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
             ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}."
 
             self.setup_input_tensors(
-                contiguous_inputs, self.cudagraphs_enabled, need_cudagraphs_record
+                contiguous_inputs,
+                self.cudagraphs_enabled,
+                need_cudagraphs_record,
             )
 
             if shape_changed:

@@ -482,15 +512,22 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 if can_use_pre_allocated_outputs:
                     outputs = self.pre_allocated_outputs
                 else:
-                    self.output_shapes = [
-                        tuple(self.context.get_tensor_shape(output_name))
-                        for output_name in self.output_names
-                    ]
+                    if shape_changed or self.output_tensors is None:
+                        self.output_shapes = [
+                            tuple(self.context.get_tensor_shape(output_name))
+                            for output_name in self.output_names
+                        ]
                     if DYNAMIC_DIM in self.output_shapes:
                         raise ValueError(
                             "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
                         )
-                    outputs = self.create_output_tensors()
+                    if (
+                        self.output_tensors is None
+                        or self.output_tensors_are_unowned
+                        or shape_changed
+                    ):
+                        self.output_tensors = self.create_output_tensors()
+                    outputs = self.output_tensors
 
                 for o, output_name in enumerate(self.output_names):
                     if need_cudagraphs_record:

@@ -751,13 +788,13 @@ def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
         # Representation of input shapes to a given model
         # Shapes are concatenated as so:
         # x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5)
-        tensor_inputs = []
-        for t in inputs:
-            if not isinstance(t, torch.Tensor):
-                return True
-            tensor_inputs.append(t)
+        if not all(isinstance(t, torch.Tensor) for t in inputs):
+            return True
+
         new_shape_key = "".join(
-            str(tuple(t.shape)).replace(" ", "") for t in tensor_inputs
+            str(tuple(t.shape)).replace(" ", "")
+            for t in inputs
+            if isinstance(t, torch.Tensor)
         )
 
         # If the new shape key differs from the existing one,
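The output_tensors cache plus the output_tensors_are_unowned escape hatch guard against an aliasing hazard: if a module reused its cached output tensor while the caller still held a reference to it, the next forward pass would overwrite the caller's result in place. A self-contained illustration of that hazard (plain PyTorch on CPU for portability; not this module's API):

import torch

cached = torch.empty(2, 2)  # standing buffer, analogous to self.output_tensors

def forward_reusing_cache(x: torch.Tensor) -> torch.Tensor:
    cached.copy_(x * 2)  # engine writes into its standing buffer
    return cached        # caller receives an alias of the cache

a = forward_reusing_cache(torch.ones(2, 2))
b = forward_reusing_cache(torch.full((2, 2), 3.0))
print(torch.equal(a, b))  # True: "a" was silently overwritten, which is why
                          # unowned outputs must be freshly allocated per call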

py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py

Lines changed: 13 additions & 0 deletions

@@ -156,6 +156,11 @@ def _pack_engine_info(self) -> List[str | bytes]:
         metadata = {
             "settings": self.settings,
             "weight_name_map": self.weight_name_map,
+            "output_tensors_are_unowned": (
+                False
+                if self.engine is None
+                else self.engine.are_output_tensors_unowned()
+            ),
         }
         target_platform = (
             Platform.current_platform()

@@ -284,6 +289,8 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None:
             metadata = TorchTensorRTModule.decode_metadata(serialized_metadata)
             self.settings = metadata["settings"]
             self.weight_name_map = metadata["weight_name_map"]
+            self.output_tensors_are_unowned = metadata["output_tensors_are_unowned"]
+            self.engine.set_output_tensors_as_unowned(self.output_tensors_are_unowned)
 
         else:
             self.engine = None

@@ -355,6 +362,12 @@ def enable_profiling(
         self.engine.enable_profiling()
         self.engine.set_profile_format(profile_format)
 
+    def set_output_tensors_as_unowned(self, enabled: bool) -> None:
+        self.engine.set_output_tensors_as_unowned(enabled)
+
+    def are_output_tensors_unowned(self) -> bool:
+        return self.engine.are_output_tensors_unowned()  # type: ignore[no-any-return]
+
     def disable_profiling(self) -> None:
         """Disable the profiler"""
         if self.engine is None:
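With the flag packed into the serialized metadata, the ownership behavior should survive a state_dict round trip. A hedged sketch, assuming get_extra_state packs the metadata via _pack_engine_info and that fresh_module is a hypothetical module of the same structure:

trt_module.set_output_tensors_as_unowned(True)
state = trt_module.state_dict()       # extra state carries the packed metadata
fresh_module.load_state_dict(state)   # set_extra_state restores the flag
assert fresh_module.are_output_tensors_unowned()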
