Skip to content

Qualcomm AI Engine Direct - Intermediate Tensor Dump #5310

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion backends/qualcomm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ target_link_libraries(
)
target_link_libraries(
qnn_executorch_backend PRIVATE qnn_executorch_header qnn_schema qnn_manager
executorch_no_prim_ops qcir_utils
executorch_no_prim_ops qcir_utils extension_tensor
)
set_target_properties(
qnn_executorch_backend PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
Expand Down Expand Up @@ -246,6 +246,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
qnn_executorch_header
executorch
qcir_utils
extension_tensor
)
target_link_libraries(
PyQnnWrapperAdaptor PRIVATE pybind11::module pybind11::lto wrappers
Expand Down
6 changes: 4 additions & 2 deletions backends/qualcomm/runtime/QnnExecuTorchBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,10 @@ Error QnnExecuTorchBackend::execute(
}

ET_CHECK_OR_RETURN_ERROR(
qnn_manager->Execute(input_tensor_structs, output_tensor_structs) ==
Error::Ok,
qnn_manager->Execute(
input_tensor_structs,
output_tensor_structs,
context.event_tracer()) == Error::Ok,
Internal,
"Fail to execute graph");
ET_CHECK_OR_RETURN_ERROR(
Expand Down
41 changes: 19 additions & 22 deletions backends/qualcomm/runtime/QnnManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <executorch/backends/qualcomm/runtime/Utils.h>
#include <executorch/backends/qualcomm/runtime/backends/QnnBackendCommon.h>
#include <executorch/backends/qualcomm/runtime/backends/QnnImplementation.h>
#include <executorch/extension/tensor/tensor.h>
#include <algorithm>
#include <cstdlib>
#include <cstring>
Expand Down Expand Up @@ -57,9 +58,7 @@ QnnManager::QnnManager(
"backend_type: %s", EnumNameQnnExecuTorchBackendType(backend_type));
QNN_EXECUTORCH_LOG_INFO("graph_name: %s", options_->graph_name()->c_str());
QNN_EXECUTORCH_LOG_INFO("library_path: %s", library_path.c_str());
QNN_EXECUTORCH_LOG_INFO(
"tensor_dump_output_path: %s",
options_->tensor_dump_output_path()->c_str());
QNN_EXECUTORCH_LOG_INFO("dump intermediate outputs: %s", IsTensorDump());
QNN_EXECUTORCH_LOG_INFO(
"log_level: %s", EnumNameQnnExecuTorchLogLevel(options_->log_level()));
QNN_EXECUTORCH_LOG_INFO(
Expand Down Expand Up @@ -366,7 +365,8 @@ Error QnnManager::AllocateTensor(

Error QnnManager::Execute(
const std::vector<Qnn_Tensor_t>& input_tensor_structs,
std::vector<Qnn_Tensor_t>& output_tensor_structs) {
std::vector<Qnn_Tensor_t>& output_tensor_structs,
EventTracer* event_tracer) {
Qnn_ErrorHandle_t error = QNN_SUCCESS;

error = backend_params_ptr_->qnn_graph_ptr_->GraphExecute(
Expand All @@ -377,30 +377,27 @@ Error QnnManager::Execute(
"qnn_graph_execute failed. Error %d", QNN_GET_ERROR_CODE(error));
return Error::Internal;
}

if (IsTensorDump()) {
// TODO: Need to handle the graph which is partitioned.
// Maybe we could use graph name.
std::string dir = options_->tensor_dump_output_path()->str() + "/Result/";
CreateDirectory(dir);
QNN_EXECUTORCH_LOG_INFO("Dump tensor to the path: %s", dir.c_str());
for (std::size_t out_idx = 0; out_idx < output_tensor_structs.size();
++out_idx) {
const Qnn_Tensor_t& output_tensor = output_tensor_structs[out_idx];

std::string output_path =
dir + QNN_VER_PTR(output_tensor)->name + "_tensor.raw";

std::ofstream fout(output_path, std::ios::binary);
if (fout.fail()) {
QNN_EXECUTORCH_LOG_ERROR(
"Dump tensor name: %s Failed.", QNN_VER_PTR(output_tensor)->name);
return Error::Internal;
}

fout.write(
static_cast<const char*>(QNN_VER_PTR(output_tensor)->clientBuf.data),
QNN_VER_PTR(output_tensor)->clientBuf.dataSize);
std::vector<exec_aten::SizesType> sizes(
QNN_VER_PTR(output_tensor)->dimensions,
QNN_VER_PTR(output_tensor)->dimensions +
QNN_VER_PTR(output_tensor)->rank);

auto dump_tensor = executorch::extension::from_blob(
QNN_VER_PTR(output_tensor)->clientBuf.data,
sizes,
qnn_dtype_to_scalar_type_[QNN_VER_PTR(output_tensor)->dataType]);

torch::executor::event_tracer_log_output_delegate<exec_aten::Tensor>(
event_tracer,
QNN_VER_PTR(output_tensor)->name,
/*delegate_debug_id=*/static_cast<torch::executor::DebugHandle>(-1),
*dump_tensor);
}
}

Expand Down
5 changes: 3 additions & 2 deletions backends/qualcomm/runtime/QnnManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ class QnnManager {

Error Execute(
const std::vector<Qnn_Tensor_t>& input_tensor_structs,
std::vector<Qnn_Tensor_t>& output_tensor_structs);
std::vector<Qnn_Tensor_t>& output_tensor_structs,
EventTracer* event_tracer);

Error ProfileExecuteData(EventTracer* event_tracer);

Expand All @@ -52,7 +53,7 @@ class QnnManager {
}

bool IsTensorDump() {
return options_->tensor_dump_output_path()->size() > 0;
return options_->dump_intermediate_outputs();
}

bool IsNodeSupportedByBackend(
Expand Down
1 change: 0 additions & 1 deletion backends/qualcomm/runtime/backends/QnnProfiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
*/

#include <executorch/backends/qualcomm/runtime/backends/QnnProfiler.h>
#include <iostream>

namespace torch {
namespace executor {
Expand Down
1 change: 1 addition & 0 deletions backends/qualcomm/runtime/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -63,5 +63,6 @@ def define_common_targets():
"//executorch/backends/qualcomm/aot/wrappers:wrappers",
"//executorch/runtime/backend:interface",
"//executorch/runtime/core:core",
"//executorch/extension/tensor:tensor",
],
)
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ class QnnExecuTorchOptions:
library_path: str = ""
log_level: QnnExecuTorchLogLevel = QnnExecuTorchLogLevel.kLogOff
online_prepare: bool = False
tensor_dump_output_path: str = ""
dump_intermediate_outputs: bool = False
profile_level: QnnExecuTorchProfileLevel = QnnExecuTorchProfileLevel.kProfileOff
shared_buffer: bool = False
is_from_context_binary: bool = False
8 changes: 3 additions & 5 deletions backends/qualcomm/serialization/schema.fbs
Original file line number Diff line number Diff line change
Expand Up @@ -164,11 +164,9 @@ table QnnExecuTorchOptions {
/// Check if on-device graph construction. Default is false.
online_prepare:bool;

/// Tensor dump output path. If a path is given, Delegate would write
/// outputs of each OP there.
/// In ALL cases, we don't recommend to set this option.
/// This option exist just for debugging some accuracy issues.
tensor_dump_output_path:string;
/// If tensor dump is enabled, all intermediate tensor outputs will be dumped.
/// This option exists for debugging accuracy issues. Default is off.
dump_intermediate_outputs:bool;

/// Profiling level of the delegate and the backend. Default is off.
profile_level:QnnExecuTorchProfileLevel;
Expand Down
42 changes: 38 additions & 4 deletions backends/qualcomm/tests/test_qnn_delegate.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def setUp(self):
debug=False,
saver=False,
online_prepare=TestQNN.online_prepare,
tensor_dump_output_path="",
dump_intermediate_outputs=TestQNN.dump_intermediate_outputs,
profile=TestQNN.enable_profile,
shared_buffer=TestQNN.shared_buffer,
)
Expand Down Expand Up @@ -490,7 +490,7 @@ def setUp(self):
debug=False,
saver=False,
online_prepare=TestQNN.online_prepare,
tensor_dump_output_path="",
dump_intermediate_outputs=TestQNN.dump_intermediate_outputs,
profile=TestQNN.enable_profile,
shared_buffer=TestQNN.shared_buffer,
)
Expand Down Expand Up @@ -604,7 +604,7 @@ def setUp(self):
debug=False,
saver=False,
online_prepare=TestQNN.online_prepare,
tensor_dump_output_path="",
dump_intermediate_outputs=TestQNN.dump_intermediate_outputs,
profile=TestQNN.enable_profile,
shared_buffer=TestQNN.shared_buffer,
)
Expand Down Expand Up @@ -1121,7 +1121,7 @@ def setUp(self):
debug=False,
saver=False,
online_prepare=TestQNN.online_prepare,
tensor_dump_output_path="",
dump_intermediate_outputs=TestQNN.dump_intermediate_outputs,
profile=TestQNN.enable_profile,
shared_buffer=TestQNN.shared_buffer,
)
Expand Down Expand Up @@ -1287,6 +1287,22 @@ def setUp(self):
saver=False,
)

def test_qnn_backend_dump_intermediate_outputs(self):
backend_options = generate_htp_compiler_spec(use_fp16=True)
TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
soc_model=self.arch_table[TestQNN.model],
backend_options=backend_options,
dump_intermediate_outputs=True,
)
module = Relu() # noqa: F405
sample_input = (torch.randn([2, 5, 1, 3]),)
self.lower_module_and_test_output(
module,
sample_input,
expected_partitions=1,
expected_intermediate_events=3,
)

def test_qnn_backend_skip_node_id(self):
module = SimpleModel() # noqa: F405
sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
Expand Down Expand Up @@ -1442,6 +1458,23 @@ def setUp(self):
saver=False,
)

def test_qnn_backend_dump_intermediate_outputs(self):
backend_options = generate_htp_compiler_spec(use_fp16=False)
TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
soc_model=self.arch_table[TestQNN.model],
backend_options=backend_options,
dump_intermediate_outputs=True,
)
module = Relu() # noqa: F405
sample_input = (torch.randn([2, 5, 1, 3]),)
module = self.get_qdq_module(module, sample_input)
self.lower_module_and_test_output(
module,
sample_input,
expected_partitions=1,
expected_intermediate_events=5,
)

def test_qnn_backend_skip_node_id_partitioner(self):
module = SimpleModel() # noqa: F405
sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
Expand Down Expand Up @@ -2720,6 +2753,7 @@ def setup_environment():
TestQNN.oss_repo = args.oss_repo
TestQNN.shared_buffer = args.shared_buffer
TestQNN.enable_x86_64 = args.enable_x86_64
TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
return sys.argv[:1] + ns_args


Expand Down
45 changes: 39 additions & 6 deletions backends/qualcomm/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@
QcomChipset,
)
from executorch.backends.qualcomm.utils.utils import capture_program
from executorch.devtools import generate_etrecord
from executorch.devtools.inspector import Inspector
from executorch.devtools import generate_etrecord, Inspector
from executorch.examples.qualcomm.utils import (
generate_inputs,
make_output_dir,
Expand Down Expand Up @@ -181,13 +180,14 @@ def _save_model_and_expected_output(

return input_list, ref_outputs, pte_fname

def verify_output(
def verify_output( # noqa: C901
self,
module: torch.nn.Module,
sample_inputs: Tuple[torch.Tensor],
executorch_prog: ExecutorchProgram | LoweredBackendModule,
etrecord_path: str = "etrecord.bin",
expected_profile_events: int = -1,
expected_intermediate_events: int = -1,
):
with tempfile.TemporaryDirectory() as tmp_dir:
buffer = (
Expand All @@ -211,6 +211,7 @@ def verify_output(
output_dir = f"{tmp_dir}/outputs"
outputs = []
etdump_path = f"{tmp_dir}/etdump.etdp"
debug_output_path = f"{tmp_dir}/debug_output.bin"

def post_process():
for i, f in enumerate(sorted(os.listdir(output_dir))):
Expand All @@ -225,6 +226,16 @@ def validate_profile():
len(inspector.to_dataframe().index) == expected_profile_events
)

def validate_intermediate_tensor():
inspector = Inspector(
etdump_path=etdump_path, debug_buffer_path=debug_output_path
)
for event_block in inspector.event_blocks:
if event_block.name == "Execute":
self.assertTrue(
len(event_block.events) == expected_intermediate_events
)

if self.enable_x86_64:
generate_inputs(tmp_dir, "input_list.txt", [sample_inputs], input_list)
make_output_dir(output_dir)
Expand Down Expand Up @@ -277,6 +288,9 @@ def validate_profile():
# Verify the etdump
if expected_profile_events != -1:
validate_profile()

if expected_intermediate_events != -1:
validate_intermediate_tensor()
else:
adb = SimpleADB(
qnn_sdk=os.getenv("QNN_SDK_ROOT"),
Expand All @@ -287,6 +301,9 @@ def validate_profile():
host_id=self.host,
soc_model=self.model,
error_only=self.error_only,
dump_intermediate_outputs=(
True if expected_intermediate_events != -1 else False
),
)
adb.push(inputs=[sample_inputs], input_list=input_list)
adb.execute()
Expand All @@ -296,12 +313,20 @@ def validate_profile():
if expected_profile_events != -1:
adb.pull_etdump(etdump_path, callback=validate_profile)

if expected_intermediate_events != -1:
adb.pull_debug_output(
etdump_path,
debug_output_path,
callback=validate_intermediate_tensor,
)

def lower_module_and_test_output(
self,
module: torch.nn.Module,
sample_inputs: Tuple[torch.Tensor],
expected_partitions: int = 1,
expected_profile_events: int = -1,
expected_intermediate_events: int = -1,
assert_output_equal: bool = True,
skip_node_id_set: set = None,
skip_node_op_set: set = None,
Expand Down Expand Up @@ -346,11 +371,19 @@ def lower_module_and_test_output(
etrecord_path = "etrecord.bin"
if self.enable_profile:
generate_etrecord(etrecord_path, edge_copy, exec_prog)

# Check numerics
if assert_output_equal or expected_profile_events != -1:
if (
assert_output_equal
or expected_profile_events != -1
or expected_intermediate_events != -1
):
self.verify_output(
module, sample_inputs, exec_prog, etrecord_path, expected_profile_events
module,
sample_inputs,
exec_prog,
etrecord_path,
expected_profile_events,
expected_intermediate_events,
)

def get_qdq_module(
Expand Down
Loading
Loading