Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/build-test-linux-x86_64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ jobs:
set -euo pipefail
pushd .
cd tests/py/dynamo
python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_converter_tests_results.xml --dist=loadscope --maxfail=20 conversion/
python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_converter_tests_results.xml --maxfail=20 conversion/
popd

L0-dynamo-core-tests:
Expand Down Expand Up @@ -236,6 +236,7 @@ jobs:
cd tests/py/dynamo
python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_core_tests_results.xml runtime/test_001_*
python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_core_partitioning_tests_results.xml partitioning/test_001_*
python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_hlo_tests_results.xml hlo/

popd

Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/build-test-linux-x86_64_rtx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ jobs:
set -euo pipefail
pushd .
cd tests/py/dynamo
python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_converter_tests_results.xml --dist=loadscope --maxfail=20 conversion/
python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_converter_tests_results.xml --maxfail=20 conversion/
popd

L0-dynamo-core-tests:
Expand Down Expand Up @@ -204,6 +204,7 @@ jobs:
pushd .
cd tests/py/dynamo
python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_core_tests_results.xml runtime/test_001_*
python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_hlo_tests_results.xml hlo/
popd

L1-dynamo-compile-tests:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/build-test-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ jobs:
cd tests/py/dynamo
../../../packaging/vc_env_helper.bat python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_core_tests_results.xml runtime/test_001_*
../../../packaging/vc_env_helper.bat python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_core_partitioning_tests_results.xml partitioning/test_001_*
../../../packaging/vc_env_helper.bat python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_hlo_tests_results.xml hlo/
popd

L1-dynamo-compile-tests:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/build-test-windows_rtx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ jobs:
pushd .
cd tests/py/dynamo
../../../packaging/vc_env_helper.bat python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_core_tests_results.xml runtime/test_001_*
../../../packaging/vc_env_helper.bat python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_hlo_tests_results.xml hlo/
popd

L1-dynamo-compile-tests:
Expand Down
58 changes: 12 additions & 46 deletions core/runtime/execute_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "c10/cuda/CUDAGuard.h"
#include "c10/cuda/CUDAStream.h"

#include "ATen/record_function.h"
#include "torch/csrc/jit/runtime/custom_operator.h"
#include "torch/torch.h"

Expand Down Expand Up @@ -238,12 +239,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr

// Initialize inputs and outputs to be available throughout the succeeding scopes
{ // Input Setup
std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
if (compiled_engine->profile_execution) {
input_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
}

RECORD_USER_SCOPE("torch_tensorrt_execute_engine::InputSetup");
setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record, inputShapeTensorValues);
// Check if input shapes can be inferred.
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
Expand All @@ -257,11 +253,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
}

{ // Output Setup
std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
if (compiled_engine->profile_execution) {
output_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
}
RECORD_USER_SCOPE("torch_tensorrt_execute_engine::OutputSetup");
if (can_use_pre_allocated_outputs) {
outputs = compiled_engine->pre_allocated_outputs;
} else {
Expand Down Expand Up @@ -303,14 +295,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
}

{ // Engine Execution (execute on engine stream)
RECORD_USER_SCOPE("torch_tensorrt_execute_engine::Enqueue");
c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);

std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
if (compiled_engine->profile_execution) {
enqueue_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
}

// Block engine stream until results are available on caller stream
at::cuda::CUDAEvent caller_exec_complete;
caller_exec_complete.record(compiled_engine->caller_stream);
Expand Down Expand Up @@ -371,12 +358,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
std::list<std::vector<int64_t>> inputShapeTensorValues;

{ // Input Setup
std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
if (compiled_engine->profile_execution) {
input_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
}

RECORD_USER_SCOPE("torch_tensorrt_execute_engine::InputSetup");
setup_input_tensors(inputs, compiled_engine, false, false, inputShapeTensorValues);
// Check if input shapes can be inferred.
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
Expand All @@ -390,11 +372,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
}

{ // OutputAllocator Setup
std::unique_ptr<torch::autograd::profiler::RecordProfile> output_allocator_profiler_guard;
if (compiled_engine->profile_execution) {
output_allocator_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
}
RECORD_USER_SCOPE("torch_tensorrt_execute_engine::OutputAllocatorSetup");
create_output_allocator(compiled_engine);
}

Expand All @@ -412,14 +390,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
}

{ // Engine Execution (execute on engine stream)
RECORD_USER_SCOPE("torch_tensorrt_execute_engine::Enqueue");
c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);

std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
if (compiled_engine->profile_execution) {
enqueue_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
}

// Block engine stream until results are available on caller stream
at::cuda::CUDAEvent caller_exec_complete;
caller_exec_complete.record(compiled_engine->caller_stream);
Expand All @@ -435,13 +408,10 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
trt_exec_complete.record(compiled_engine->engine_stream);
trt_exec_complete.block(compiled_engine->caller_stream);

std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
if (compiled_engine->profile_execution) {
output_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
}
std::vector<at::Tensor> outputs;
for (size_t i = 0; i < compiled_engine->out_binding_names.size(); i++) {
{ // Output Collection
RECORD_USER_SCOPE("torch_tensorrt_execute_engine::OutputCollection");
for (size_t i = 0; i < compiled_engine->out_binding_names.size(); i++) {
auto name = compiled_engine->out_binding_names[i];
auto dims = compiled_engine->output_allocator->getShapes().at(name);
auto dtype =
Expand All @@ -460,6 +430,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
output = output.reshape(-1).view(dtype).slice(0, 0, prod).reshape(shape);
outputs.push_back(output);
}
} // End Output Collection

if (compiled_engine->profile_execution) {
LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler);
Expand Down Expand Up @@ -493,12 +464,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);

if (MULTI_DEVICE_SAFE_MODE) {
std::unique_ptr<torch::autograd::profiler::RecordProfile> device_profiler_guard;
if (compiled_engine->profile_execution) {
device_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->device_profile_path);
}

RECORD_USER_SCOPE("torch_tensorrt_execute_engine::DeviceSelection");
RTDevice curr_device = get_current_device();
LOG_DEBUG("Current Device: " << curr_device);

Expand Down
Loading
Loading