Commit d6ed44e

reverted stream changes

1 parent edb8b43 commit d6ed44e

File tree

9 files changed: +286 −51 lines

core/runtime/TRTEngine.cpp

Lines changed: 14 additions & 0 deletions

@@ -237,6 +237,12 @@ TRTEngine::TRTEngine(
       out_binding_names[pyt_idx] = binding_name;
     }
     num_io = std::make_pair(inputs_size, outputs);
+
+    this->io_size = this->cuda_engine->getNbIOTensors();
+    for (int64_t i = 0; i < this->in_binding_names.size(); i++) {
+      this->isShapeInferenceIO[this->in_binding_names[i]] =
+          this->cuda_engine->isShapeInferenceIO(this->in_binding_names[i].c_str());
+    }
 }
 
 #ifndef NDEBUG

@@ -281,6 +287,14 @@ void TRTEngine::enable_profiling() {
   exec_ctx->setProfiler(trt_engine_profiler.get());
 }
 
+void TRTEngine::set_output_tensors_as_unowned(bool enable) {
+  this->output_tensors_are_unowned = enable;
+}
+
+bool TRTEngine::are_output_tensors_unowned() {
+  return this->output_tensors_are_unowned;
+}
+
 void TRTEngine::set_profile_format(std::string format) {
   if (format == "trex") {
     this->trt_engine_profiler->set_profile_format(TraceFormat::kTREX);
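The constructor change above hoists two TensorRT queries out of the per-inference hot path: the engine's I/O tensor count and the per-binding shape-inference check are now computed once and cached in the engine object. A minimal Python sketch of the same caching pattern, assuming only the standard TensorRT Python API (this is illustrative, not code from this commit):

import tensorrt as trt

def build_io_caches(engine: trt.ICudaEngine) -> tuple[int, dict[str, bool]]:
    # Query TensorRT once at setup time; execute-time code then reads these
    # caches instead of calling back into TensorRT on every forward pass.
    io_size = engine.num_io_tensors
    is_shape_inference_io = {}
    for i in range(io_size):
        name = engine.get_tensor_name(i)
        # True if TensorRT reads this tensor's values to infer shapes
        is_shape_inference_io[name] = engine.is_shape_inference_io(name)
    return io_size, is_shape_inference_io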

core/runtime/TRTEngine.h

Lines changed: 6 additions & 0 deletions

@@ -103,6 +103,9 @@ struct TRTEngine : torch::CustomClassHolder {
   std::shared_ptr<nvinfer1::ICudaEngine> cuda_engine;
   std::shared_ptr<nvinfer1::IExecutionContext> exec_ctx;
   std::pair<uint64_t, uint64_t> num_io;
+  uint64_t io_size;
+  std::map<std::string, bool> isShapeInferenceIO;
+  bool output_tensors_are_unowned = false;
   std::string name;
   RTDevice device_info;
 
@@ -159,6 +162,8 @@ struct TRTEngine : torch::CustomClassHolder {
   int64_t get_automatic_device_memory_budget();
   std::vector<at::Tensor> infer_outputs(std::vector<std::vector<int64_t>> input_shapes);
   void set_pre_allocated_outputs(bool enable);
+  void set_output_tensors_as_unowned(bool enable);
+  bool are_output_tensors_unowned();
   TorchTRTRuntimeStates runtime_states;
   friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
   static const char BINDING_DELIM = '%';

@@ -176,6 +181,7 @@ struct TRTEngine : torch::CustomClassHolder {
   std::string shape_key = "None";
   bool use_pre_allocated_outputs = false;
   std::vector<at::Tensor> pre_allocated_outputs;
+  std::vector<at::Tensor> allocated_outputs;
 
   // Output Allocator-Related Functionality
   bool requires_output_allocator = false; // engine requires output allocator

core/runtime/execute_engine.cpp

Lines changed: 32 additions & 25 deletions

@@ -117,7 +117,7 @@ void setup_input_tensors(
     auto shape = core::util::toVec(dims);
     LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
 
-    if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
+    if (compiled_engine->isShapeInferenceIO[name]) {
       // Shape tensor inputs are casted to int64 explicitly.
       // Refer to
       // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435

@@ -145,10 +145,10 @@ void setup_input_tensors(
         // Create a new persistent input buffer
         compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
       }
-
-      TORCHTRT_CHECK(
-          compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
-
+      if (need_cudagraphs_record) {
+        TORCHTRT_CHECK(
+            compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
+      }
       if (cudagraphs_enabled) {
         // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
         compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);

@@ -217,7 +217,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     compiled_engine->cudagraph.reset();
   }
 
-  std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
+  std::vector<at::Tensor> outputs;
 
   // Intialize inputs and outputs to be available throughout the succeeding scopes
   { // Input Setup

@@ -226,10 +226,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
       input_profiler_guard =
          std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
     }
-
     setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
     // Check if input shapes can be inferred.
-    int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
+    int32_t const io_size{compiled_engine->io_size};
     std::vector<char const*> names(io_size);
     int32_t const nbNames = compiled_engine->exec_ctx->inferShapes(names.size(), names.data());
     TORCHTRT_CHECK(

@@ -240,6 +239,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
   }
 
   { // Output Setup
+    bool new_outputs = false;
     std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
     if (compiled_engine->profile_execution) {
       output_profiler_guard =

@@ -248,26 +248,33 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     if (can_use_pre_allocated_outputs) {
       outputs = compiled_engine->pre_allocated_outputs;
     } else {
-      outputs = create_output_tensors(compiled_engine);
+      if (compiled_engine->allocated_outputs.size() == 0 or compiled_engine->output_tensors_are_unowned or
+          shape_changed) {
+        compiled_engine->allocated_outputs = create_output_tensors(compiled_engine);
+        new_outputs = true;
+      }
+      outputs = compiled_engine->allocated_outputs;
     }
 
-    for (auto output_indices : compiled_engine->out_binding_map) {
-      auto pyt_idx = output_indices.second;
-      std::string name = compiled_engine->out_binding_names[pyt_idx];
-      if (need_cudagraphs_record) {
-        // If we are recording the cuda graph then we need to update the persistent output buffer
-        compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
-      }
+    if (new_outputs) {
+      for (auto output_indices : compiled_engine->out_binding_map) {
+        auto pyt_idx = output_indices.second;
+        std::string name = compiled_engine->out_binding_names[pyt_idx];
+        if (need_cudagraphs_record) {
+          // If we are recording the cuda graph then we need to update the persistent output buffer
+          compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
+        }
 
-      if (cudagraphs_enabled) {
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setTensorAddress(
-                name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
-            "Error while setting the output tensor address");
-      } else {
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
-            "Error while setting the output tensor address");
+        if (cudagraphs_enabled) {
+          TORCHTRT_CHECK(
+              compiled_engine->exec_ctx->setTensorAddress(
+                  name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
+              "Error while setting the output tensor address");
+        } else {
+          TORCHTRT_CHECK(
+              compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
+              "Error while setting the output tensor address");
+        }
       }
     }
   }
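The rewritten output setup only reallocates and re-binds output tensors when it must. A hedged Python-style restatement of the C++ decision logic above (names mirror the C++ members; this is pseudocode, not a runtime API):

def select_outputs(engine):
    # Reallocate only when: nothing is cached yet, the caller may still hold
    # references to the previous outputs (unowned), or input shapes changed.
    new_outputs = False
    if (
        len(engine.allocated_outputs) == 0
        or engine.output_tensors_are_unowned
        or engine.shape_changed
    ):
        engine.allocated_outputs = engine.create_output_tensors()
        new_outputs = True  # setTensorAddress must be re-run on fresh storage
    return engine.allocated_outputs, new_outputs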

core/runtime/register_jit_hooks.cpp

Lines changed: 2 additions & 0 deletions

@@ -90,6 +90,8 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
         .def("get_engine_layer_info", &TRTEngine::get_engine_layer_info)
         .def("infer_outputs", &TRTEngine::infer_outputs)
         .def("reset_captured_graph", &TRTEngine::reset_captured_graph)
+        .def("set_output_tensors_as_unowned", &TRTEngine::set_output_tensors_as_unowned)
+        .def("are_output_tensors_unowned", &TRTEngine::are_output_tensors_unowned)
         .def_readwrite("use_pre_allocated_outputs", &TRTEngine::use_pre_allocated_outputs)
         .def_readwrite("use_output_allocator_outputs", &TRTEngine::use_output_allocator_outputs)
         .def_property(
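These two .def(...) lines expose the new setter and getter on the TorchScript custom class. A hedged usage sketch from Python, assuming trt_module.engine holds the registered custom-class instance:

engine = trt_module.engine  # custom-class handle (attribute path assumed)
engine.set_output_tensors_as_unowned(True)
assert engine.are_output_tensors_unowned()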

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 9 additions & 3 deletions

@@ -557,7 +557,7 @@ def compile(
             stacklevel=2,
         )
 
-    if kwargs.get("use_explicit_typing", False) == False:
+    if not kwargs.get("use_explicit_typing", False):
         warnings.warn(
             "`use_explicit_typing` is deprecated. This setting will be removed and you should enable autocast instead.",
             DeprecationWarning,

@@ -949,7 +949,7 @@ def preserve_module_specs(
     for attr in dir(gm):
         if attr.startswith("_frozen_param"):
             delattr(gm, attr)
-
+    trt_module = None
     for name, _ in partitioned_module.named_children():
         submodule = getattr(partitioned_module, name)
         # filter on the GraphModule

@@ -1070,14 +1070,20 @@ def preserve_module_specs(
             ) as f:
                 f.write(trt_module.get_layer_info())
 
+    # Only set the output_tensors_are_unowned flag on the last TRT module, since the user has access to its output tensors
+
     # Parse the graph I/O and store it in dryrun tracker
     parse_graph_io(gm, dryrun_tracker)
 
     # Replace all FX Modules with TRT Modules
     for name, trt_module in trt_modules.items():
         setattr(partitioned_module, name, trt_module)
         if settings.lazy_engine_init and not settings.enable_cross_compile_for_windows:
-            getattr(partitioned_module, name).setup_engine()
+            trt_module = getattr(partitioned_module, name)
+            trt_module.setup_engine()
+
+    if trt_module:
+        trt_module.set_output_tensors_as_unowned(True)
 
     # Reset settings object to user specification after fallback to global partitioning mode
     if fast_partitioner_failed:
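The compiler-side effect is user-visible: only the final TRT module in the partitioned graph is flagged, so the tensors handed back to the caller are freshly allocated on each call, while intermediate engines keep reusing their standing buffers. A hedged end-to-end sketch (MyModel and the input shape are placeholders, not from this commit):

import torch
import torch_tensorrt

model = MyModel().eval().cuda()  # placeholder user model
trt_gm = torch_tensorrt.compile(model, inputs=[torch.randn(1, 3, 224, 224).cuda()])

out_a = trt_gm(torch.randn(1, 3, 224, 224).cuda())
out_b = trt_gm(torch.randn(1, 3, 224, 224).cuda())
# out_a remains valid: the final module's outputs are unowned, so the second
# call wrote into new tensors instead of overwriting out_a in place.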

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 56 additions & 19 deletions

@@ -174,6 +174,7 @@ def __init__(
         self.cudagraph: Optional[torch.cuda.CUDAGraph] = None
         self._caller_stream: Optional[torch.cuda.Stream] = None
         self._engine_stream: Optional[torch.cuda.Stream] = None
+        self.output_tensors: Optional[List[torch.Tensor]] = None
 
         # TODO: Make the below a Dictionary {shape: cudagraph}
         self.shape_key: Optional[str] = None

@@ -218,10 +219,27 @@ def __init__(
         self.requires_output_allocator = requires_output_allocator
         self.output_allocator: Optional[DynamicOutputAllocator] = None
         self.use_output_allocator_outputs = False
-
+        self.device = torch.cuda.current_device()
+        self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
+        # If the output tensors are not owned by the engine (output_tensors_are_unowned=True), new output tensors must be created on each forward pass
+        self.output_tensors_are_unowned = False
         if self.serialized_engine is not None and not self.settings.lazy_engine_init:
             self.setup_engine()
 
+    def set_output_tensors_as_unowned(self, enabled: bool) -> None:
+        """
+        Set whether the output tensors of this engine are owned solely by the Torch-TensorRT runtime or may be shared with a user.
+        If the tensors are not owned by the runtime, they must be recreated on every forward call, which can affect performance.
+        Typically only the final engine in a graph needs unowned output tensors; intermediate engines gain performance by managing their own standing memory.
+        Therefore, set this to True only for the final module in a graph and leave it False for intermediate modules.
+
+        Args:
+            enabled: bool
+                Whether to set the flag to True.
+
+        """
+        self.output_tensors_are_unowned = enabled
+
     def get_streamable_device_memory_budget(self) -> Any:
         return self.engine.streamable_weights_size
 
@@ -288,16 +306,25 @@ def setup_engine(self) -> None:
             for output_name in self.output_names
         ]
         self.output_shapes = [
-            self.engine.get_tensor_shape(output_name)
+            tuple(self.context.get_tensor_shape(output_name))
             for output_name in self.output_names
         ]
 
+        self.shape_key = "".join(
+            str(tuple(t)).replace(" ", "") for t in self.input_shapes
+        )
+
         if self.requires_output_allocator:
             self.create_output_allocator()
 
         if torch_tensorrt.runtime.get_cudagraphs_mode():
             self.cudagraph = torch.cuda.CUDAGraph()
 
+        self.is_shape_inference_io = {
+            input_name: self.engine.is_shape_inference_io(input_name)
+            for input_name in self.input_names
+        }
+
     def _check_initialized(self) -> None:
         if not self.initialized:
             raise RuntimeError("PythonTorchTensorRTModule is not initialized.")

@@ -383,16 +410,17 @@ def setup_input_tensors(
 
             # For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers
             # as per TensorRT requirements
-            if self.engine.is_shape_inference_io(input_name):
+            if self.is_shape_inference_io[input_name]:
                 # Shape tensor inputs are casted to int64 explicitly
                 # Currently Torch CPU pointers are not working; numpy pointers are used instead
                 # to refer to underlying memory
                 inputs_cpu = contiguous_inputs[i].cpu().to(torch.int64).numpy().copy()
                 self.context.set_tensor_address(input_name, inputs_cpu.ctypes.data)
             else:
-                self.context.set_input_shape(
-                    input_name, tuple(contiguous_inputs[i].shape)
-                )
+                if need_cudagraphs_record:
+                    self.context.set_input_shape(
+                        input_name, tuple(contiguous_inputs[i].shape)
+                    )
                 if cudagraphs_enabled:
                     self._input_buffers[i].copy_(contiguous_inputs[i])
                     self.context.set_tensor_address(

@@ -411,7 +439,7 @@ def create_output_tensors(self) -> List[torch.Tensor]:
             output = torch.empty(
                 size=self.output_shapes[o],
                 dtype=self.output_dtypes[o],
-                device=torch.cuda.current_device(),
+                device=self.device,
             )
             outputs.append(output)
         return outputs

@@ -460,7 +488,9 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
             ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}."
 
             self.setup_input_tensors(
-                contiguous_inputs, self.cudagraphs_enabled, need_cudagraphs_record
+                contiguous_inputs,
+                self.cudagraphs_enabled,
+                need_cudagraphs_record,
             )
 
             if shape_changed:

@@ -482,15 +512,22 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 if can_use_pre_allocated_outputs:
                     outputs = self.pre_allocated_outputs
                 else:
-                    self.output_shapes = [
-                        tuple(self.context.get_tensor_shape(output_name))
-                        for output_name in self.output_names
-                    ]
+                    if shape_changed or self.output_tensors is None:
+                        self.output_shapes = [
+                            tuple(self.context.get_tensor_shape(output_name))
+                            for output_name in self.output_names
+                        ]
                     if DYNAMIC_DIM in self.output_shapes:
                         raise ValueError(
                             "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
                         )
-                    outputs = self.create_output_tensors()
+                    if (
+                        self.output_tensors is None
+                        or self.output_tensors_are_unowned
+                        or shape_changed
+                    ):
+                        self.output_tensors = self.create_output_tensors()
+                    outputs = self.output_tensors
 
                 for o, output_name in enumerate(self.output_names):
                     if need_cudagraphs_record:

@@ -751,13 +788,13 @@ def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
         # Representation of input shapes to a given model
         # Shapes are concatenated as so:
         # x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5)
-        tensor_inputs = []
-        for t in inputs:
-            if not isinstance(t, torch.Tensor):
-                return True
-            tensor_inputs.append(t)
+        if not all(isinstance(t, torch.Tensor) for t in inputs):
+            return True
+
         new_shape_key = "".join(
-            str(tuple(t.shape)).replace(" ", "") for t in tensor_inputs
+            str(tuple(t.shape)).replace(" ", "")
+            for t in inputs
+            if isinstance(t, torch.Tensor)
         )
 
         # If the new shape key differs from the existing one,
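The output_tensors cache plus the output_tensors_are_unowned escape hatch guard against an aliasing hazard: if a module reused its cached output tensor while the caller still held a reference to it, the next forward pass would overwrite the caller's result in place. A self-contained illustration of that hazard (plain PyTorch on CPU for portability; not this module's API):

import torch

cached = torch.empty(2, 2)  # standing buffer, analogous to self.output_tensors

def forward_reusing_cache(x: torch.Tensor) -> torch.Tensor:
    cached.copy_(x * 2)  # engine writes into its standing buffer
    return cached        # caller receives an alias of the cache

a = forward_reusing_cache(torch.ones(2, 2))
b = forward_reusing_cache(torch.full((2, 2), 3.0))
print(torch.equal(a, b))  # True: "a" was silently overwritten, which is why
                          # unowned outputs must be freshly allocated per call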

py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py

Lines changed: 13 additions & 0 deletions

@@ -156,6 +156,11 @@ def _pack_engine_info(self) -> List[str | bytes]:
         metadata = {
             "settings": self.settings,
             "weight_name_map": self.weight_name_map,
+            "output_tensors_are_unowned": (
+                False
+                if self.engine is None
+                else self.engine.are_output_tensors_unowned()
+            ),
         }
         target_platform = (
             Platform.current_platform()

@@ -284,6 +289,8 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None:
             metadata = TorchTensorRTModule.decode_metadata(serialized_metadata)
             self.settings = metadata["settings"]
             self.weight_name_map = metadata["weight_name_map"]
+            self.output_tensors_are_unowned = metadata["output_tensors_are_unowned"]
+            self.engine.set_output_tensors_as_unowned(self.output_tensors_are_unowned)
 
         else:
             self.engine = None

@@ -355,6 +362,12 @@ def enable_profiling(
         self.engine.enable_profiling()
         self.engine.set_profile_format(profile_format)
 
+    def set_output_tensors_as_unowned(self, enabled: bool) -> None:
+        self.engine.set_output_tensors_as_unowned(enabled)
+
+    def are_output_tensors_unowned(self) -> bool:
+        return self.engine.are_output_tensors_unowned()  # type: ignore[no-any-return]
+
     def disable_profiling(self) -> None:
         """Disable the profiler"""
         if self.engine is None:
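With the flag packed into the serialized metadata, the ownership behavior should survive a state_dict round trip. A hedged sketch, assuming get_extra_state packs the metadata via _pack_engine_info and that fresh_module is a hypothetical module of the same structure:

trt_module.set_output_tensors_as_unowned(True)
state = trt_module.state_dict()       # extra state carries the packed metadata
fresh_module.load_state_dict(state)   # set_extra_state restores the flag
assert fresh_module.are_output_tensors_unowned()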
