From 61b71d526452733b91c8a22efe25397faa6c6b96 Mon Sep 17 00:00:00 2001 From: haowhsu Date: Wed, 2 Jul 2025 00:26:53 +0800 Subject: [PATCH] lpai e2e & minimum inference runtime support --- backends/qualcomm/CMakeLists.txt | 34 +- backends/qualcomm/_passes/build_quant_io.py | 17 +- .../qualcomm/aot/wrappers/TensorWrapper.cpp | 10 + backends/qualcomm/fastrpc/CMakeLists.txt | 78 +++++ .../qualcomm/fastrpc/qnn_executor_runner.cpp | 148 +++++++++ backends/qualcomm/fastrpc/qnn_executorch.idl | 27 ++ .../qualcomm/fastrpc/qnn_executorch_impl.cpp | 314 ++++++++++++++++++ backends/qualcomm/runtime/Logging.cpp | 10 + .../qualcomm/runtime/QnnExecuTorchBackend.cpp | 2 + backends/qualcomm/runtime/Utils.cpp | 2 + .../qualcomm/runtime/backends/CMakeLists.txt | 35 +- .../runtime/backends/QnnBackendCache.cpp | 3 +- .../runtime/backends/QnnBackendFactory.cpp | 35 +- .../runtime/backends/QnnBackendFactory.h | 2 + .../backends/QnnBackendUnifiedRegistry.cpp | 14 +- .../backends/QnnBackendUnifiedRegistry.h | 2 +- .../runtime/backends/QnnCustomProtocol.cpp | 9 +- .../runtime/backends/QnnCustomProtocol.h | 1 + .../runtime/backends/QnnFunctionInterface.h | 2 + .../runtime/backends/QnnGraphCommon.h | 6 +- .../runtime/backends/QnnImplementation.cpp | 7 + .../backends/QnnSysFunctionInterface.h | 2 +- .../runtime/backends/lpai/LpaiBackend.cpp | 62 ++++ .../runtime/backends/lpai/LpaiBackend.h | 40 +++ .../backends/lpai/LpaiBackendCustomConfig.cpp | 66 ++++ .../backends/lpai/LpaiBackendCustomConfig.h | 45 +++ .../runtime/backends/lpai/LpaiContext.cpp | 56 ++++ .../runtime/backends/lpai/LpaiContext.h | 38 +++ .../backends/lpai/LpaiContextCustomConfig.h | 44 +++ .../runtime/backends/lpai/LpaiDevice.h | 31 ++ .../runtime/backends/lpai/LpaiGraph.h | 81 +++++ .../backends/lpai/LpaiGraphCustomConfig.h | 94 ++++++ .../lpai/aarch64/LpaiContextCustomConfig.cpp | 32 ++ .../backends/lpai/aarch64/LpaiDevice.cpp | 25 ++ .../backends/lpai/aarch64/LpaiGraph.cpp | 30 ++ .../lpai/aarch64/LpaiGraphCustomConfig.cpp | 95 ++++++ .../lpai/x86_64/LpaiContextCustomConfig.cpp | 22 ++ .../backends/lpai/x86_64/LpaiDevice.cpp | 21 ++ .../backends/lpai/x86_64/LpaiGraph.cpp | 21 ++ .../lpai/x86_64/LpaiGraphCustomConfig.cpp | 32 ++ backends/qualcomm/scripts/build.sh | 56 +++- backends/qualcomm/scripts/lpai_utils.sh | 90 +++++ backends/qualcomm/scripts/sqnr_verifier.py | 38 +++ .../serialization/qc_compiler_spec.fbs | 63 +++- backends/qualcomm/serialization/qc_schema.py | 62 +++- backends/qualcomm/tests/test_qnn_delegate.py | 97 +++++- backends/qualcomm/tests/utils.py | 28 +- backends/qualcomm/utils/utils.py | 78 ++++- .../executor_runner/qnn_executor_runner.cpp | 3 +- examples/qualcomm/utils.py | 23 +- extension/data_loader/file_data_loader.cpp | 57 ++-- third-party/CMakeLists.txt | 5 +- 52 files changed, 2110 insertions(+), 85 deletions(-) create mode 100644 backends/qualcomm/fastrpc/CMakeLists.txt create mode 100644 backends/qualcomm/fastrpc/qnn_executor_runner.cpp create mode 100644 backends/qualcomm/fastrpc/qnn_executorch.idl create mode 100644 backends/qualcomm/fastrpc/qnn_executorch_impl.cpp create mode 100644 backends/qualcomm/runtime/backends/lpai/LpaiBackend.cpp create mode 100644 backends/qualcomm/runtime/backends/lpai/LpaiBackend.h create mode 100644 backends/qualcomm/runtime/backends/lpai/LpaiBackendCustomConfig.cpp create mode 100644 backends/qualcomm/runtime/backends/lpai/LpaiBackendCustomConfig.h create mode 100644 backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp create mode 100644 
backends/qualcomm/runtime/backends/lpai/LpaiContext.h create mode 100644 backends/qualcomm/runtime/backends/lpai/LpaiContextCustomConfig.h create mode 100644 backends/qualcomm/runtime/backends/lpai/LpaiDevice.h create mode 100644 backends/qualcomm/runtime/backends/lpai/LpaiGraph.h create mode 100644 backends/qualcomm/runtime/backends/lpai/LpaiGraphCustomConfig.h create mode 100644 backends/qualcomm/runtime/backends/lpai/aarch64/LpaiContextCustomConfig.cpp create mode 100644 backends/qualcomm/runtime/backends/lpai/aarch64/LpaiDevice.cpp create mode 100644 backends/qualcomm/runtime/backends/lpai/aarch64/LpaiGraph.cpp create mode 100644 backends/qualcomm/runtime/backends/lpai/aarch64/LpaiGraphCustomConfig.cpp create mode 100644 backends/qualcomm/runtime/backends/lpai/x86_64/LpaiContextCustomConfig.cpp create mode 100644 backends/qualcomm/runtime/backends/lpai/x86_64/LpaiDevice.cpp create mode 100644 backends/qualcomm/runtime/backends/lpai/x86_64/LpaiGraph.cpp create mode 100644 backends/qualcomm/runtime/backends/lpai/x86_64/LpaiGraphCustomConfig.cpp create mode 100755 backends/qualcomm/scripts/lpai_utils.sh create mode 100644 backends/qualcomm/scripts/sqnr_verifier.py diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index 8ce1ce1bdbf..b9a23e243c6 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -239,11 +239,28 @@ target_link_libraries( shared_buffer qnn_dlc_manager ) -target_link_libraries( - qnn_executorch_backend - PRIVATE qnn_executorch_header qnn_schema qnn_manager executorch_core - extension_tensor qnn_backend_options -) +if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon) + link_directories( + $ENV{HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/$ENV{HEXAGON_ARCH}/G0/pic + ) + target_link_libraries( + qnn_executorch_backend + PRIVATE qnn_executorch_header + qnn_schema + qnn_manager + executorch_core + extension_tensor + qnn_backend_options + c + c++ + ) +else() + target_link_libraries( + qnn_executorch_backend + PRIVATE qnn_executorch_header qnn_schema qnn_manager executorch_core + extension_tensor qnn_backend_options + ) +endif() set_target_properties( qnn_executorch_backend PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" ) @@ -278,6 +295,13 @@ install( RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}/executorch/backends/qualcomm ) +if(DEFINED ENV{HEXAGON_SDK_ROOT}) + add_subdirectory( + ${QNN_EXECUTORCH_ROOT_DIR}/fastrpc + ${CMAKE_CURRENT_BINARY_DIR}/qnn_executorch/fastrpc + ) +endif() + # QNN pybind if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") add_subdirectory( diff --git a/backends/qualcomm/_passes/build_quant_io.py b/backends/qualcomm/_passes/build_quant_io.py index d43842e84a5..3bd90f8f4fc 100644 --- a/backends/qualcomm/_passes/build_quant_io.py +++ b/backends/qualcomm/_passes/build_quant_io.py @@ -5,9 +5,8 @@ # LICENSE file in the root directory of this source tree. 
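+# How the call_delegate override below derives its specs, as a minimal
+# sketch (dtypes are only examples): each delegate output keeps the shape of
+# the original module's output tensor but takes the dtype tagged by the
+# quantizer in node.meta[QCOM_QUANTIZED_IO], e.g.
+#   val = node.meta["val"]                     # fake tensor, e.g. float32
+#   io_dtype = node.meta[QCOM_QUANTIZED_IO]    # e.g. torch.uint16
+#   tensors.append(val.to(io_dtype))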
import torch from executorch.backends.qualcomm.utils.constants import QCOM_QUANTIZED_IO -from executorch.exir.delegate import executorch_call_delegate -from executorch.exir.pass_base import ExportPass, ProxyValue +from executorch.exir.pass_base import ExportPass from executorch.exir.tensor import TensorSpec from torch.utils import _pytree as pytree @@ -39,11 +38,17 @@ def call_getitem(self, value, key: int, meta): return super().call_getitem(value, key, meta) def call_delegate(self, lowered_module, args, kwargs, meta): - args_data, _ = pytree.tree_map_only( - ProxyValue, lambda x: x.data, (args, kwargs) - ) + output_node = [ + node + for node in lowered_module.original_module.graph.nodes + if node.target == "output" + ][0] + tensors = [ + node.meta["val"].to(node.meta[QCOM_QUANTIZED_IO]) + for node in output_node.args[0] + ] meta["spec"] = pytree.tree_map( self._make_spec, - executorch_call_delegate(lowered_module, *args_data), + tuple(tensors), ) return super().call_delegate(lowered_module, args, kwargs, meta) diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp index 17d76aac412..68dfa775469 100644 --- a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp +++ b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp @@ -118,7 +118,17 @@ TensorWrapper::TensorWrapper( Error TensorWrapper::FillDataBuffer(const void* data, bool copy_data) { if (data != nullptr) { QNN_TENSOR_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_RAW; +#ifdef __hexagon__ + // alignment is required + auto align_size = [](size_t alignment, size_t sz) { + return (sz + (alignment - 1)) & ~(alignment - 1); + }; + const size_t alignment = 64; + QNN_TENSOR_VER_PTR(tensor_)->clientBuf.dataSize = + align_size(alignment, bytes_); +#else QNN_TENSOR_VER_PTR(tensor_)->clientBuf.dataSize = bytes_; +#endif if (copy_data) { owned_data_ = std::make_unique(bytes_); const char* src_data = static_cast(data); diff --git a/backends/qualcomm/fastrpc/CMakeLists.txt b/backends/qualcomm/fastrpc/CMakeLists.txt new file mode 100644 index 00000000000..d49a8003f3f --- /dev/null +++ b/backends/qualcomm/fastrpc/CMakeLists.txt @@ -0,0 +1,78 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# Copyright 2025 Arm Limited and/or its affiliates. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
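+
+# The qaic compiler below turns qnn_executorch.idl into three artifacts:
+#   qnn_executorch.h       - interface header shared by both sides
+#   qnn_executorch_stub.c  - CPU-side marshalling, linked into the runner
+#   qnn_executorch_skel.c  - DSP-side dispatch, linked into the skel library
+# Equivalent manual invocation (mirroring the custom command below, with
+# HEXAGON_SDK_ROOT set and an output directory of your choice):
+#   $HEXAGON_SDK_ROOT/ipc/fastrpc/qaic/bin/qaic \
+#     -I $HEXAGON_SDK_ROOT/incs -I $HEXAGON_SDK_ROOT/incs/stddef \
+#     -o <out_dir> backends/qualcomm/fastrpc/qnn_executorch.idl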
+ +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +set(_qnn_fastrpc__dir ${CMAKE_BINARY_DIR}/backends/qualcomm/fastrpc) +set(_qnn_fastrpc__srcs ${CMAKE_CURRENT_LIST_DIR}/qnn_executorch.idl) +set(_qnn_fastrpc__outputs + ${_qnn_fastrpc__dir}/qnn_executorch.h + ${_qnn_fastrpc__dir}/qnn_executorch_stub.c + ${_qnn_fastrpc__dir}/qnn_executorch_skel.c +) + +if(DEFINED ENV{HEXAGON_SDK_ROOT}) + add_custom_command( + OUTPUT ${_qnn_fastrpc__outputs} + COMMAND mkdir -p ${_qnn_fastrpc__dir} + COMMAND + $ENV{HEXAGON_SDK_ROOT}/ipc/fastrpc/qaic/bin/qaic -I + $ENV{HEXAGON_SDK_ROOT}/incs -I $ENV{HEXAGON_SDK_ROOT}/incs/stddef -o + ${_qnn_fastrpc__dir} ${_qnn_fastrpc__srcs} + WORKING_DIRECTORY ${EXECUTORCH_SOURCE_DIR} + DEPENDS qnn_executorch_backend + COMMENT "Codegen for fastrpc files" + ) + add_custom_target( + fastrpc_codegen + DEPENDS ${_qnn_fastrpc__outputs} + COMMENT "Codegen for fastrpc files" + ) + +endif() + +if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon) + add_library( + qnn_executorch_skel SHARED + ${_qnn_fastrpc__dir}/qnn_executorch.h + ${_qnn_fastrpc__dir}/qnn_executorch_skel.c qnn_executorch_impl.cpp + ) + target_include_directories(qnn_executorch_skel PRIVATE ${_qnn_fastrpc__dir}) + target_link_libraries( + qnn_executorch_skel PRIVATE extension_data_loader qnn_executorch_backend + c++ c + ) + add_dependencies(qnn_executorch_skel fastrpc_codegen) +endif() + +if(${CMAKE_SYSTEM_PROCESSOR} MATCHES aarch64) + include_directories( + $ENV{HEXAGON_SDK_ROOT}/incs $ENV{HEXAGON_SDK_ROOT}/incs/stddef + ${_qnn_fastrpc__dir} + ) + link_directories( + $ENV{HEXAGON_SDK_ROOT}/ipc/fastrpc/remote/ship/android_aarch64 + ) + add_library( + qnn_executorch_stub SHARED ${_qnn_fastrpc__dir}/qnn_executorch.h + ${_qnn_fastrpc__dir}/qnn_executorch_stub.c + ) + # TODO: support cdsp if necessary + target_link_libraries(qnn_executorch_stub PRIVATE adsprpc) + add_dependencies(qnn_executorch_stub fastrpc_codegen) + + # build minimum example app + add_executable(qnn_executor_runner qnn_executor_runner.cpp) + target_link_libraries( + qnn_executor_runner PRIVATE executorch_core gflags qnn_executorch_stub + adsprpc + ) + # TODO: support cdsp if necessary + target_link_libraries(qnn_executor_runner PRIVATE adsprpc) +endif() diff --git a/backends/qualcomm/fastrpc/qnn_executor_runner.cpp b/backends/qualcomm/fastrpc/qnn_executor_runner.cpp new file mode 100644 index 00000000000..7e5b53e34da --- /dev/null +++ b/backends/qualcomm/fastrpc/qnn_executor_runner.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
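+ *
+ * Host-side driver for the DSP-resident ExecuTorch runtime: it opens a
+ * FastRPC session, loads the .pte on the DSP, queries input/output sizes,
+ * then streams inputs from input_list.txt and writes outputs as raw files.
+ * Typical invocation (using the flags declared below):
+ *   qnn_executor_runner --model_path model.pte \
+ *     --input_list_path input_list.txt --output_folder_path outputs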
+ */ + +#include +#include +#include +#include + +#include +#include + +#include "qnn_executorch.h" + +DEFINE_string( + model_path, + "model.pte", + "Model serialized in flatbuffer format."); +DEFINE_string( + output_folder_path, + ".", + "Executorch inference data output path."); +DEFINE_string(input_list_path, "input_list.txt", "Model input list path."); + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (argc != 1) { + std::string msg = "extra commandline args:"; + for (int i = 1 /* skip argv[0] (program name) */; i < argc; i++) { + msg += std::string(" ") + argv[i]; + } + ET_LOG(Error, "%s", msg.c_str()); + return 1; + } + + // fastrpc related + // adsp + const int adsp_domain_id = 0; + // signed PD + const int enable_unsigned_pd = 0; + // domain uri + std::string domain_uri(qnn_executorch_URI); + domain_uri += "&_dom=adsp"; + // init session + struct remote_rpc_control_unsigned_module data; + data.domain = adsp_domain_id; + data.enable = enable_unsigned_pd; + int err = AEE_SUCCESS; + ET_CHECK_MSG( + AEE_SUCCESS == + (err = remote_session_control( + DSPRPC_CONTROL_UNSIGNED_MODULE, (void*)&data, sizeof(data))), + "remote_session_control failed: 0x%x", + err); + // start session + remote_handle64 handle = -1; + ET_CHECK_MSG( + AEE_SUCCESS == (err = qnn_executorch_open(domain_uri.data(), &handle)), + "qnn_executorch_open failed: 0x%x", + err); + // load model + const char* model_path = FLAGS_model_path.c_str(); + qnn_executorch_load(handle, model_path); + + // prepare io + std::vector> input_data, output_data; + std::vector input_tensor, output_tensor; + for (int i = 0;; ++i) { + int nbytes = 0; + qnn_executorch_get_input_size(handle, model_path, i, &nbytes); + if (nbytes == -1) { + break; + } + input_data.emplace_back(std::vector(nbytes)); + input_tensor.emplace_back( + tensor({input_data.back().data(), (int)input_data.back().size()})); + } + for (int i = 0;; ++i) { + int nbytes = 0; + qnn_executorch_get_output_size(handle, model_path, i, &nbytes); + if (nbytes == -1) { + break; + } + output_data.emplace_back(std::vector(nbytes)); + output_tensor.emplace_back( + tensor({output_data.back().data(), (int)output_data.back().size()})); + } + + // prepare input data + std::ifstream input_list(FLAGS_input_list_path); + // TODO: should check IO info via fastrpc first + if (input_list.is_open()) { + auto split = [](std::string s, std::string delimiter) { + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + std::string token; + std::vector res; + + while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + res.push_back(s.substr(pos_start)); + return res; + }; + + std::string file_path; + int inference_index = 0; + while (std::getline(input_list, file_path)) { + auto input_files = split(file_path, " "); + if (input_files.size() == 0) { + break; + } + size_t num_inputs = input_files.size(); + for (int i = 0; i < num_inputs; ++i) { + std::ifstream fin(input_files[i], std::ios::binary); + fin.seekg(0, fin.end); + size_t file_size = fin.tellg(); + fin.seekg(0, fin.beg); + fin.read((char*)input_data[i].data(), file_size); + fin.close(); + } + qnn_executorch_set_input( + handle, model_path, input_tensor.data(), input_tensor.size()); + qnn_executorch_execute(handle, model_path); + qnn_executorch_get_output( + handle, model_path, output_tensor.data(), output_tensor.size()); + for (size_t i = 0; i < output_tensor.size(); 
i++) { + auto output_file_name = FLAGS_output_folder_path + "/output_" + + std::to_string(inference_index) + "_" + std::to_string(i) + ".raw"; + std::ofstream fout(output_file_name.c_str(), std::ios::binary); + fout.write( + (const char*)output_tensor[i].data, output_tensor[i].dataLen); + fout.close(); + } + } + } + + // unload model + qnn_executorch_unload(handle, model_path); + // tear down + qnn_executorch_close(handle); + return 0; +} diff --git a/backends/qualcomm/fastrpc/qnn_executorch.idl b/backends/qualcomm/fastrpc/qnn_executorch.idl new file mode 100644 index 00000000000..fb1d5cc342f --- /dev/null +++ b/backends/qualcomm/fastrpc/qnn_executorch.idl @@ -0,0 +1,27 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "AEEStdDef.idl" +#include "remote.idl" + +/// Enabling stub-skel mismatch check feature in the auto-gen files. +/// Please refer to the IDL documentation for more details on the feature. +/// It is fully supported only on Kailua and later targets. +const string IDL_VERSION = "0.0.0"; + +typedef sequence tensor; + +interface qnn_executorch : remote_handle64 { + long load(in string pte_path); + long get_input_size(in string pte_path, in long index, rout long nbytes); + long set_input(in string pte_path, in sequence tensors); + long execute(in string pte_path); + long get_output_size(in string pte_path, in long index, rout long nbytes); + long get_output(in string pte_path, rout sequence tensors); + long unload(in string pte_path); +}; diff --git a/backends/qualcomm/fastrpc/qnn_executorch_impl.cpp b/backends/qualcomm/fastrpc/qnn_executorch_impl.cpp new file mode 100644 index 00000000000..5fafb3794a0 --- /dev/null +++ b/backends/qualcomm/fastrpc/qnn_executorch_impl.cpp @@ -0,0 +1,314 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include "System/QnnSystemInterface.h" +#include "qnn_executorch.h" + +#include "HAP_farf.h" + +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::extension::FileDataLoader; +using executorch::runtime::Error; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; + +class SimpleWrapper { + public: + SimpleWrapper(const char* pte_path) { + auto loader = FileDataLoader::from(pte_path, 256); + if (!loader.ok()) { + FARF( + RUNTIME_ERROR, + "FileDataLoader::from() failed: 0x%x", + (int)loader.error()); + return; + } + loader_ = std::make_unique(std::move(loader.get())); + + auto program = Program::load(loader_.get()); + if (!program.ok()) { + FARF(RUNTIME_ERROR, "failed to parse model file %s", pte_path); + return; + } + program_ = std::make_unique(std::move(program.get())); + + auto method_name = program_->get_method_name(0); + if (!method_name.ok()) { + FARF(RUNTIME_ERROR, "program has no methods"); + return; + } + FARF(RUNTIME_HIGH, "using method %s", *method_name); + + auto method_meta = program_->method_meta(*method_name); + if (!method_meta.ok()) { + FARF( + RUNTIME_ERROR, + "failed to get method_meta for %s: 0x%x", + *method_name, + (unsigned int)method_meta.error()); + return; + } + method_meta_ = 
        std::make_unique<MethodMeta>(std::move(method_meta.get()));
+
+    method_allocator_ = std::make_unique<MemoryAllocator>(
+        sizeof(method_allocator_pool_), method_allocator_pool_);
+
+    for (size_t id = 0; id < method_meta_->num_memory_planned_buffers(); ++id) {
+      size_t buffer_size = static_cast<size_t>(
+          method_meta_->memory_planned_buffer_size(id).get());
+      planned_buffers_.push_back(std::make_unique<uint8_t[]>(buffer_size));
+      planned_spans_.push_back({planned_buffers_.back().get(), buffer_size});
+    }
+    planned_memory_ = std::make_unique<HierarchicalAllocator>(
+        Span<Span<uint8_t>>{planned_spans_.data(), planned_spans_.size()});
+
+    memory_manager_ = std::make_unique<MemoryManager>(
+        method_allocator_.get(), planned_memory_.get());
+
+    auto method = program_->load_method(*method_name, memory_manager_.get());
+    if (!method.ok()) {
+      FARF(
+          RUNTIME_ERROR,
+          "loading of method %s failed with status 0x%x",
+          *method_name,
+          (int)method.error());
+    }
+    method_ = std::make_unique<Method>(std::move(method.get()));
+
+    input_tensors_.resize(method_->inputs_size());
+    // reserve up front: each Tensor keeps a raw pointer into this vector,
+    // and a reallocation during emplace_back would leave earlier pointers
+    // dangling
+    input_tensor_impls_.reserve(method_->inputs_size());
+    for (int i = 0; i < input_tensors_.size(); ++i) {
+      auto tensor_meta = method_meta_->input_tensor_meta(i);
+      input_tensors_[i].resize(padded_size(tensor_meta->nbytes()));
+      input_tensor_impls_.emplace_back(TensorImpl(
+          tensor_meta->scalar_type(),
+          tensor_meta->sizes().size(),
+          const_cast<TensorImpl::SizesType*>(tensor_meta->sizes().data()),
+          align_ptr(input_tensors_[i].data()),
+          const_cast<TensorImpl::DimOrderType*>(
+              tensor_meta->dim_order().data())));
+      Error ret = method_->set_input(Tensor(&input_tensor_impls_.back()), i);
+      if (ret != Error::Ok) {
+        FARF(RUNTIME_ERROR, "failed to set input tensor: %d", (int)ret);
+        return;
+      }
+    }
+    output_tensors_.resize(method_->outputs_size());
+    for (int i = 0; i < output_tensors_.size(); ++i) {
+      auto tensor_meta = method_meta_->output_tensor_meta(i);
+      output_tensors_[i].resize(padded_size(tensor_meta->nbytes()));
+      Error ret = method_->set_output_data_ptr(
+          align_ptr(output_tensors_[i].data()), tensor_meta->nbytes(), i);
+      if (ret != Error::Ok) {
+        FARF(RUNTIME_ERROR, "failed to set output tensor: %d", (int)ret);
+        return;
+      }
+    }
+  }
+
+  size_t padded_size(size_t sz) {
+    // reserve headroom so align_ptr() can shift the base by up to alignment_
+    size_t new_sz = alignment_ + sz;
+    return new_sz;
+  }
+
+  void* align_ptr(void* ptr) {
+    void* addr = reinterpret_cast<void*>(
+        ((size_t)ptr + (alignment_ - 1)) & ~(alignment_ - 1));
+    return addr;
+  }
+
+  int get_input_size(const int index) {
+    if (index < input_tensors_.size()) {
+      auto tensor_meta = method_meta_->input_tensor_meta(index);
+      return tensor_meta.ok() ? tensor_meta->nbytes() : -1;
+    }
+    return -1;
+  }
+
+  void set_input(int index, const tensor& t) {
+    if (padded_size(t.dataLen) > input_tensors_[index].size()) {
+      FARF(
+          RUNTIME_ERROR,
+          "input tensor %d size mismatched: %d vs %d",
+          index,
+          input_tensors_[index].size(),
+          t.dataLen);
+      return;
+    }
+    std::memcpy(align_ptr(input_tensors_[index].data()), t.data, t.dataLen);
+  }
+
+  int get_output_size(const int index) {
+    if (index < output_tensors_.size()) {
+      auto tensor_meta = method_meta_->output_tensor_meta(index);
+      return tensor_meta.ok() ?
tensor_meta->nbytes() : -1; + } + return -1; + } + + void get_output(int index, tensor& t) { + if (padded_size(t.dataLen) > output_tensors_[index].size()) { + FARF( + RUNTIME_ERROR, + "output tensor %d size mismatched: %d vs %d", + index, + output_tensors_[index].size(), + t.dataLen); + return; + } + std::memcpy(t.data, align_ptr(output_tensors_[index].data()), t.dataLen); + } + + void execute() { + Error status = method_->execute(); + if (status != Error::Ok) { + FARF( + RUNTIME_ERROR, + "Execution of method failed with status 0x%x", + (int)status); + } + } + + private: + uint8_t method_allocator_pool_[4 * 1024U]; + const size_t alignment_ = 64; + std::unique_ptr loader_; + std::unique_ptr planned_memory_; + std::unique_ptr method_meta_; + std::unique_ptr method_allocator_; + std::unique_ptr memory_manager_; + std::unique_ptr method_; + std::unique_ptr program_; + std::vector> planned_buffers_; + std::vector> planned_spans_; + std::vector> input_tensors_; + std::vector> output_tensors_; + std::vector input_tensor_impls_; + std::vector output_tensor_impls_; +}; + +std::unordered_map> + g_cached_request; + +AEEResult qnn_executorch_open(const char* uri, remote_handle64* h) { + FARF(RUNTIME_HIGH, __func__); + executorch::runtime::runtime_init(); + return 0; +} + +AEEResult qnn_executorch_close(remote_handle64 h) { + FARF(RUNTIME_HIGH, __func__); + g_cached_request.clear(); + return 0; +} + +AEEResult qnn_executorch_load(remote_handle64 _h, const char* pte_path) { + FARF(RUNTIME_HIGH, __func__); + std::string key(pte_path); + if (!g_cached_request.count(key)) { + g_cached_request[key] = std::make_unique(pte_path); + } + return 0; +} + +AEEResult qnn_executorch_get_input_size( + remote_handle64 _h, + const char* pte_path, + const int index, + int* nbytes) { + FARF(RUNTIME_HIGH, __func__); + std::string key(pte_path); + *nbytes = -1; + if (g_cached_request.count(key)) { + *nbytes = g_cached_request[key]->get_input_size(index); + } + return 0; +} + +AEEResult qnn_executorch_set_input( + remote_handle64 _h, + const char* pte_path, + const tensor* tensors, + int tensorsLen) { + FARF(RUNTIME_HIGH, __func__); + std::string key(pte_path); + if (g_cached_request.count(key)) { + auto& wrapper = g_cached_request[key]; + for (int i = 0; i < tensorsLen; ++i) { + wrapper->set_input(i, tensors[i]); + } + } + return 0; +} + +AEEResult qnn_executorch_execute(remote_handle64 _h, const char* pte_path) { + FARF(RUNTIME_HIGH, __func__); + std::string key(pte_path); + if (g_cached_request.count(key)) { + auto before_exec = std::chrono::high_resolution_clock::now(); + g_cached_request[key]->execute(); + auto after_exec = std::chrono::high_resolution_clock::now(); + double interval_infs = + std::chrono::duration_cast( + after_exec - before_exec) + .count() / + 1000.0; + FARF(RUNTIME_HIGH, "inferences took %f ms", interval_infs); + } + return 0; +} + +AEEResult qnn_executorch_get_output_size( + remote_handle64 _h, + const char* pte_path, + const int index, + int* nbytes) { + FARF(RUNTIME_HIGH, __func__); + std::string key(pte_path); + *nbytes = -1; + if (g_cached_request.count(key)) { + *nbytes = g_cached_request[key]->get_output_size(index); + } + return 0; +} + +AEEResult qnn_executorch_get_output( + remote_handle64 _h, + const char* pte_path, + tensor* tensors, + int tensorsLen) { + FARF(RUNTIME_HIGH, __func__); + std::string key(pte_path); + if (g_cached_request.count(key)) { + auto& wrapper = g_cached_request[key]; + for (int i = 0; i < tensorsLen; ++i) { + wrapper->get_output(i, tensors[i]); + } + } + return 0; 
+} + +AEEResult qnn_executorch_unload(remote_handle64 _h, const char* pte_path) { + FARF(RUNTIME_HIGH, __func__); + std::string key(pte_path); + if (g_cached_request.count(key)) { + g_cached_request.erase(key); + } + return 0; +} diff --git a/backends/qualcomm/runtime/Logging.cpp b/backends/qualcomm/runtime/Logging.cpp index acd39c52e08..44824bf31e8 100644 --- a/backends/qualcomm/runtime/Logging.cpp +++ b/backends/qualcomm/runtime/Logging.cpp @@ -11,6 +11,9 @@ #ifdef __ANDROID__ #include #endif +#ifdef __hexagon__ +#include "HAP_farf.h" +#endif namespace executorch { namespace backends { namespace qnn { @@ -58,10 +61,17 @@ void Log(QnnExecuTorchLogLevel log_level, const char* format, ...) { } __android_log_vprint(android_severity, "[Qnn ExecuTorch]", format, args); #endif +#ifndef __hexagon__ fprintf(stderr, "[%s] [Qnn ExecuTorch]: ", serverity_name); vfprintf(stderr, format, args); va_end(args); fputc('\n', stderr); +#else + char buf[128] = {0}; + vsprintf(buf, format, args); + va_end(args); + FARF(RUNTIME_HIGH, "[%s] [Qnn ExecuTorch]: %s\n", serverity_name, buf); +#endif } } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index 41c2370e4cb..f0f9ac26d3c 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -113,7 +113,9 @@ Result QnnExecuTorchBackend::init( } add_cached_delegate(signature, qnn_manager); // This backend does not need its processed data after Init. +#ifndef __hexagon__ processed->Free(); +#endif return qnn_manager; } diff --git a/backends/qualcomm/runtime/Utils.cpp b/backends/qualcomm/runtime/Utils.cpp index f11e25c4ec2..b70040fc792 100644 --- a/backends/qualcomm/runtime/Utils.cpp +++ b/backends/qualcomm/runtime/Utils.cpp @@ -13,6 +13,7 @@ namespace backends { namespace qnn { void CreateDirectory(const std::string& path) { +#ifndef __hexagon__ // Create any recursive directory if (path.empty()) { QNN_EXECUTORCH_LOG_ERROR("Create folder shouldn't be empty"); @@ -29,6 +30,7 @@ void CreateDirectory(const std::string& path) { std::string err_msg = "Failed to create " + subdir + " folder\n"; QNN_EXECUTORCH_LOG_ERROR(err_msg.c_str()); } +#endif } } // namespace qnn diff --git a/backends/qualcomm/runtime/backends/CMakeLists.txt b/backends/qualcomm/runtime/backends/CMakeLists.txt index d0f025bfbaa..19298e3eb2f 100644 --- a/backends/qualcomm/runtime/backends/CMakeLists.txt +++ b/backends/qualcomm/runtime/backends/CMakeLists.txt @@ -43,13 +43,16 @@ target_sources( ${CMAKE_CURRENT_LIST_DIR}/QnnProfiler.cpp ) -set(HOST_ARCHITECTURE_GPU - ${CMAKE_CURRENT_LIST_DIR}/gpu/${CMAKE_SYSTEM_PROCESSOR} -) -set(HOST_ARCHITECTURE_HTP - ${CMAKE_CURRENT_LIST_DIR}/htp/${CMAKE_SYSTEM_PROCESSOR} -) -set(HOST_ARCHITECTURE_IR ${CMAKE_CURRENT_LIST_DIR}/ir/${CMAKE_SYSTEM_PROCESSOR}) +# quick workaround for hexagon target +set(target_platform aarch64) +if(${CMAKE_SYSTEM_PROCESSOR} MATCHES x86_64) + set(target_platform x86_64) +endif() + +set(HOST_ARCHITECTURE_GPU ${CMAKE_CURRENT_LIST_DIR}/gpu/${target_platform}) +set(HOST_ARCHITECTURE_HTP ${CMAKE_CURRENT_LIST_DIR}/htp/${target_platform}) +set(HOST_ARCHITECTURE_IR ${CMAKE_CURRENT_LIST_DIR}/ir/${target_platform}) +set(HOST_ARCHITECTURE_LPAI ${CMAKE_CURRENT_LIST_DIR}/lpai/${target_platform}) # qnn_device target_sources( @@ -57,6 +60,7 @@ target_sources( PUBLIC ${CMAKE_CURRENT_LIST_DIR}/QnnDeviceCommon.h ${CMAKE_CURRENT_LIST_DIR}/gpu/GpuDevice.h 
${CMAKE_CURRENT_LIST_DIR}/htp/HtpDevice.h + ${CMAKE_CURRENT_LIST_DIR}/lpai/LpaiDevice.h PRIVATE ${CMAKE_CURRENT_LIST_DIR}/QnnDeviceCommon.cpp ${CMAKE_CURRENT_LIST_DIR}/htp/HtpDevice.cpp ${CMAKE_CURRENT_LIST_DIR}/htp/HtpDevicePlatformInfoConfig.h @@ -65,6 +69,7 @@ target_sources( # platform infomation and SocModel to Qnn ${HOST_ARCHITECTURE_HTP}/HtpDevicePlatformInfoConfig.cpp ${HOST_ARCHITECTURE_HTP}/HtpDeviceCustomConfig.cpp + ${HOST_ARCHITECTURE_LPAI}/LpaiDevice.cpp ) # qnn_context @@ -74,13 +79,17 @@ target_sources( ${CMAKE_CURRENT_LIST_DIR}/htp/HtpContext.h ${CMAKE_CURRENT_LIST_DIR}/ir/IrContext.h ${CMAKE_CURRENT_LIST_DIR}/gpu/GpuContext.h + ${CMAKE_CURRENT_LIST_DIR}/lpai/LpaiContext.h PRIVATE ${CMAKE_CURRENT_LIST_DIR}/QnnContextCommon.cpp - ${CMAKE_CURRENT_LIST_DIR}/htp/HtpContext.cpp - ${CMAKE_CURRENT_LIST_DIR}/htp/HtpContextCustomConfig.h ${CMAKE_CURRENT_LIST_DIR}/gpu/GpuContext.cpp ${CMAKE_CURRENT_LIST_DIR}/gpu/GpuContextCustomConfig.h + ${CMAKE_CURRENT_LIST_DIR}/htp/HtpContext.cpp + ${CMAKE_CURRENT_LIST_DIR}/htp/HtpContextCustomConfig.h + ${CMAKE_CURRENT_LIST_DIR}/lpai/LpaiContext.cpp + ${CMAKE_CURRENT_LIST_DIR}/lpai/LpaiContextCustomConfig.h ${HOST_ARCHITECTURE_GPU}/GpuContextCustomConfig.cpp ${HOST_ARCHITECTURE_HTP}/HtpContextCustomConfig.cpp + ${HOST_ARCHITECTURE_LPAI}/LpaiContextCustomConfig.cpp ${HOST_ARCHITECTURE_IR}/IrContext.cpp ) @@ -99,6 +108,7 @@ target_sources( PUBLIC ${CMAKE_CURRENT_LIST_DIR}/QnnGraphCommon.h ${CMAKE_CURRENT_LIST_DIR}/gpu/GpuGraph.h ${CMAKE_CURRENT_LIST_DIR}/htp/HtpGraph.h + ${CMAKE_CURRENT_LIST_DIR}/lpai/LpaiGraph.h PRIVATE ${CMAKE_CURRENT_LIST_DIR}/QnnGraphCommon.cpp ${CMAKE_CURRENT_LIST_DIR}/gpu/GpuGraph.cpp ${CMAKE_CURRENT_LIST_DIR}/gpu/GpuGraphCustomConfig.h @@ -107,6 +117,9 @@ target_sources( ${CMAKE_CURRENT_LIST_DIR}/htp/HtpGraphCustomConfig.h ${CMAKE_CURRENT_LIST_DIR}/htp/HtpGraphCustomConfig.cpp ${HOST_ARCHITECTURE_HTP}/HtpGraphCustomConfig.cpp + ${CMAKE_CURRENT_LIST_DIR}/lpai/LpaiGraphCustomConfig.h + ${HOST_ARCHITECTURE_LPAI}/LpaiGraph.cpp + ${HOST_ARCHITECTURE_LPAI}/LpaiGraphCustomConfig.cpp ) # qnn_op_package_manager @@ -123,10 +136,14 @@ target_sources( ${CMAKE_CURRENT_LIST_DIR}/gpu/GpuBackend.h ${CMAKE_CURRENT_LIST_DIR}/htp/HtpBackend.h ${CMAKE_CURRENT_LIST_DIR}/ir/IrBackend.h + ${CMAKE_CURRENT_LIST_DIR}/lpai/LpaiBackend.h PRIVATE ${CMAKE_CURRENT_LIST_DIR}/QnnBackendCommon.cpp ${CMAKE_CURRENT_LIST_DIR}/gpu/GpuBackend.cpp ${CMAKE_CURRENT_LIST_DIR}/gpu/GpuBackendCustomConfig.h ${CMAKE_CURRENT_LIST_DIR}/gpu/GpuBackendCustomConfig.cpp + ${CMAKE_CURRENT_LIST_DIR}/lpai/LpaiBackend.cpp + ${CMAKE_CURRENT_LIST_DIR}/lpai/LpaiBackendCustomConfig.h + ${CMAKE_CURRENT_LIST_DIR}/lpai/LpaiBackendCustomConfig.cpp ) # qnn_mem_manager diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp index 3dd1738d33b..011ac68c33d 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp @@ -22,11 +22,10 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary( std::uint32_t num_graphs; QnnSystemContext_GraphInfo_t* graphs = nullptr; const QnnSystemContext_BinaryInfo_t* binaryinfo{nullptr}; - Qnn_ContextBinarySize_t binaryinfo_size = 0; Qnn_ErrorHandle_t error = QNN_SUCCESS; error = qnn_sys_interface.qnn_system_context_get_binary_info( - sys_context_handle_, buffer, nbytes, &binaryinfo, &binaryinfo_size); + sys_context_handle_, buffer, nbytes, &binaryinfo); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_WARN( diff --git 
a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index 9c559d83fcc..9b5e22c1d64 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -119,7 +119,40 @@ std::unique_ptr QnnBackendFactory::Create( options->profile_level(), gpu_options); } break; - case QnnExecuTorchBackendType::kDspBackend: + case QnnExecuTorchBackendType::kLpaiBackend: { + auto lpai_options = options->backend_options()->lpai_options(); + if (options->log_level() >= QnnExecuTorchLogLevel::kLogLevelInfo) { + QNN_EXECUTORCH_LOG_INFO("fps in lpai_options: %d", lpai_options->fps()); + QNN_EXECUTORCH_LOG_INFO( + "ftrt_ratio in lpai_options: %d", lpai_options->ftrt_ratio()); + QNN_EXECUTORCH_LOG_INFO( + "client_perf_type in lpai_options: %s", + EnumNameQnnExecuTorchLpaiClientPerf( + lpai_options->client_perf_type())); + QNN_EXECUTORCH_LOG_INFO( + "affinity in lpai_options: %s", + QnnExecuTorchLpaiCoreAffinity(lpai_options->affinity())); + QNN_EXECUTORCH_LOG_INFO( + "core_selection in lpai_options: %d", + lpai_options->core_selection()); + } + backend_params->qnn_backend_cache_ptr_ = + std::make_unique(qnn_context_blob); + + backend_params->qnn_context_ptr_ = std::make_unique( + implementation_ptr, + qnn_backend_ptr, + qnn_device_ptr, + backend_params->qnn_backend_cache_ptr_.get(), + qnn_dlc_manager); + + backend_params->qnn_graph_ptr_ = std::make_unique( + implementation_ptr, + qnn_backend_ptr, + backend_params->qnn_context_ptr_.get(), + options->profile_level(), + lpai_options); + } break; case QnnExecuTorchBackendType::kUndefinedBackend: default: return nullptr; diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.h b/backends/qualcomm/runtime/backends/QnnBackendFactory.h index c125d5ffca4..753d8cf3007 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.h +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.h @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include namespace executorch { diff --git a/backends/qualcomm/runtime/backends/QnnBackendUnifiedRegistry.cpp b/backends/qualcomm/runtime/backends/QnnBackendUnifiedRegistry.cpp index 8b1dcdf7a9d..35e58d22d78 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendUnifiedRegistry.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendUnifiedRegistry.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include @@ -56,7 +58,10 @@ Error QnnBackendUnifiedRegistry::GetOrCreateBackendBundle( current_lib_path = gpu_library_name_; break; } - case QnnExecuTorchBackendType::kDspBackend: + case QnnExecuTorchBackendType::kLpaiBackend: { + current_lib_path = lpai_library_name_; + break; + } case QnnExecuTorchBackendType::kUndefinedBackend: default: QNN_EXECUTORCH_LOG_ERROR( @@ -118,7 +123,12 @@ Error QnnBackendUnifiedRegistry::GetOrCreateBackendBundle( device = std::make_unique(implementation.get(), logger.get()); break; } - case QnnExecuTorchBackendType::kDspBackend: + case QnnExecuTorchBackendType::kLpaiBackend: { + backend = std::make_unique( + implementation.get(), logger.get(), options->soc_info()); + device = std::make_unique(implementation.get(), logger.get()); + break; + } case QnnExecuTorchBackendType::kUndefinedBackend: default: return Error::NotFound; diff --git a/backends/qualcomm/runtime/backends/QnnBackendUnifiedRegistry.h b/backends/qualcomm/runtime/backends/QnnBackendUnifiedRegistry.h index b2549a3356c..b401688ac59 100644 --- 
a/backends/qualcomm/runtime/backends/QnnBackendUnifiedRegistry.h +++ b/backends/qualcomm/runtime/backends/QnnBackendUnifiedRegistry.h @@ -73,7 +73,7 @@ class QnnBackendUnifiedRegistry { static constexpr const char* htp_library_name_ = "libQnnHtp.so"; static constexpr const char* gpu_library_name_ = "libQnnGpu.so"; - static constexpr const char* dsp_library_name_ = "libQnnDsp.so"; + static constexpr const char* lpai_library_name_ = "libQnnLpai.so"; std::unique_ptr GetImplementationConfig( const QnnExecuTorchOptions* options) { diff --git a/backends/qualcomm/runtime/backends/QnnCustomProtocol.cpp b/backends/qualcomm/runtime/backends/QnnCustomProtocol.cpp index b01d7ab6d80..bb0a77d033c 100644 --- a/backends/qualcomm/runtime/backends/QnnCustomProtocol.cpp +++ b/backends/qualcomm/runtime/backends/QnnCustomProtocol.cpp @@ -20,8 +20,7 @@ void QnnContextCustomProtocol::BuildContextCustomBuffer() { uint8_t magic_number_proto_size = sizeof(magic_number_); uint8_t binary_proto_size = sizeof(binary_size_); uint8_t signature_proto_size = sizeof(signature_); - uint64_t buffer_size = magic_number_proto_size + signature_proto_size + - binary_proto_size + binary_size_; + uint64_t buffer_size = alignment_ + binary_size_; qnn_custom_buffer_.resize(buffer_size, 0); size_t pos = 0; @@ -62,6 +61,8 @@ QnnContextCustomProtocol::DeserializeContextCustomBuffer(void* processed_data) { uint8_t magic_number_proto_size = sizeof(magic_number_); uint8_t binary_proto_size = sizeof(binary_size_); uint8_t signature_proto_size = sizeof(signature_); + uint32_t padding_size = alignment_ - magic_number_proto_size - + binary_proto_size - signature_proto_size; uint32_t magic_number; std::memcpy(&magic_number, ptr, magic_number_proto_size); @@ -80,13 +81,13 @@ QnnContextCustomProtocol::DeserializeContextCustomBuffer(void* processed_data) { uint64_t binary_size; std::memcpy(&binary_size, ptr, binary_proto_size); - ptr += binary_proto_size; + ptr += binary_proto_size + padding_size; return {status, signature_, binary_size, static_cast(ptr)}; } uint64_t QnnContextCustomProtocol::GetContextBinaryOffset() { - return sizeof(magic_number_) + sizeof(signature_) + sizeof(binary_size_); + return alignment_; } } // namespace qnn diff --git a/backends/qualcomm/runtime/backends/QnnCustomProtocol.h b/backends/qualcomm/runtime/backends/QnnCustomProtocol.h index 3cc6a6e25dc..4007c7ccabd 100644 --- a/backends/qualcomm/runtime/backends/QnnCustomProtocol.h +++ b/backends/qualcomm/runtime/backends/QnnCustomProtocol.h @@ -83,6 +83,7 @@ class QnnContextCustomProtocol : public QnnCustomProtocol { static constexpr uint32_t magic_number_ = 0x5678ABCD; int64_t signature_{0}; uint64_t binary_size_{0}; + uint32_t alignment_{256}; }; } // namespace qnn diff --git a/backends/qualcomm/runtime/backends/QnnFunctionInterface.h b/backends/qualcomm/runtime/backends/QnnFunctionInterface.h index 0e1e4727aa3..2a49505a672 100644 --- a/backends/qualcomm/runtime/backends/QnnFunctionInterface.h +++ b/backends/qualcomm/runtime/backends/QnnFunctionInterface.h @@ -65,6 +65,8 @@ class QnnInterface { DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_set_config, graphSetConfig); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_get_property, graphGetProperty); // --------- QnnLog --------- DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); diff --git 
a/backends/qualcomm/runtime/backends/QnnGraphCommon.h b/backends/qualcomm/runtime/backends/QnnGraphCommon.h index fbb5ab80140..ea83bfcab1b 100644 --- a/backends/qualcomm/runtime/backends/QnnGraphCommon.h +++ b/backends/qualcomm/runtime/backends/QnnGraphCommon.h @@ -34,7 +34,7 @@ class QnnGraph { virtual ~QnnGraph(){}; - executorch::runtime::Error Configure(const std::string& graph_name); + virtual executorch::runtime::Error Configure(const std::string& graph_name); Qnn_ErrorHandle_t GraphExecute( const std::string& graph_name, @@ -81,10 +81,10 @@ class QnnGraph { std::vector& config) { return executorch::runtime::Error::Ok; }; - - private: std::unordered_map handle_; QnnImplementation* implementation_; + + private: QnnBackend* backend_; QnnContext* context_; QnnExecuTorchProfileLevel profile_level_; diff --git a/backends/qualcomm/runtime/backends/QnnImplementation.cpp b/backends/qualcomm/runtime/backends/QnnImplementation.cpp index 246800791e6..7900b5cc5e3 100644 --- a/backends/qualcomm/runtime/backends/QnnImplementation.cpp +++ b/backends/qualcomm/runtime/backends/QnnImplementation.cpp @@ -25,6 +25,7 @@ struct DlCloser { Error QnnImplementation::InitBackend( void* const lib_handle, const QnnSaver_Config_t** saver_config) { +#ifndef __hexagon__ Qnn_ErrorHandle_t error = QNN_SUCCESS; // saver_config must be set before backend initialization auto saver_initialize = @@ -39,6 +40,7 @@ Error QnnImplementation::InitBackend( return Error::Internal; } } +#endif return Error::Ok; } @@ -50,6 +52,10 @@ const QnnInterface_t* QnnImplementation::StartBackend( const std::string& lib_path, const QnnSaver_Config_t** saver_config) { Qnn_ErrorHandle_t error = QNN_SUCCESS; +#ifdef __hexagon__ + std::unique_ptr lib_handle( + dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL)); +#else // If the library is already loaded, return the handle. std::unique_ptr lib_handle( dlopen(lib_path.c_str(), RTLD_NOW | RTLD_NOLOAD)); @@ -57,6 +63,7 @@ const QnnInterface_t* QnnImplementation::StartBackend( lib_handle = std::unique_ptr( dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL)); } +#endif if (lib_handle == nullptr) { QNN_EXECUTORCH_LOG_ERROR( "Cannot Open QNN library %s, with error: %s", diff --git a/backends/qualcomm/runtime/backends/QnnSysFunctionInterface.h b/backends/qualcomm/runtime/backends/QnnSysFunctionInterface.h index b77c7c2903e..28c3ed733f4 100644 --- a/backends/qualcomm/runtime/backends/QnnSysFunctionInterface.h +++ b/backends/qualcomm/runtime/backends/QnnSysFunctionInterface.h @@ -40,7 +40,7 @@ class QnnSystemInterface { systemContextCreate); DEFINE_SHIM_FUNCTION_SYS_INTERFACE( system_context_get_binary_info, - systemContextGetBinaryInfo); + systemContextGetMetaData); DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); private: diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiBackend.cpp b/backends/qualcomm/runtime/backends/lpai/LpaiBackend.cpp new file mode 100644 index 00000000000..c5d7294492b --- /dev/null +++ b/backends/qualcomm/runtime/backends/lpai/LpaiBackend.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
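+ *
+ * QNN consumes backend configs as a null-terminated array of
+ * QnnBackend_Config_t pointers. MakeConfig() below builds that array from
+ * the LPAI custom configs; with a single custom config the layout is:
+ *   config[0] -> { option = QNN_BACKEND_CONFIG_OPTION_CUSTOM, customConfig }
+ *   config[1] == nullptr  (terminator)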
+ */
+
+#include <executorch/backends/qualcomm/runtime/backends/lpai/LpaiBackend.h>
+
+#include "LPAI/QnnLpaiCommon.h"
+
+namespace executorch {
+namespace backends {
+namespace qnn {
+
+using executorch::runtime::Error;
+
+LpaiBackend::LpaiBackend(
+    QnnImplementation* implementation,
+    QnnLogger* logger,
+    const SocInfo* soc_info)
+    : QnnBackend(implementation, logger) {
+  lpai_backend_custom_config_ =
+      std::make_unique<LpaiBackendCustomConfig>(soc_info);
+}
+
+Qnn_Version_t LpaiBackend::GetExpectedBackendVersion() const {
+  Qnn_Version_t backend_version;
+  backend_version.major = QNN_LPAI_API_VERSION_MAJOR;
+  backend_version.minor = QNN_LPAI_API_VERSION_MINOR;
+  backend_version.patch = QNN_LPAI_API_VERSION_PATCH;
+  return backend_version;
+}
+
+bool LpaiBackend::IsProfileEventTypeParentOfNodeTime(
+    QnnProfile_EventType_t event_type) {
+  return (event_type == QNN_PROFILE_EVENTTYPE_EXECUTE);
+}
+
+Error LpaiBackend::MakeConfig(
+    std::vector<const QnnBackend_Config_t*>& config) {
+  const std::vector<QnnBackend_CustomConfig_t>& backend_custom_config =
+      lpai_backend_custom_config_->CreateBackendCustomConfig();
+
+  uint32_t num_custom_configs = backend_custom_config.size();
+  backend_config_.resize(num_custom_configs);
+  // +1 for the null terminator
+  config.reserve(num_custom_configs + 1);
+
+  for (std::size_t i = 0; i < num_custom_configs; ++i) {
+    backend_config_[i].option = QNN_BACKEND_CONFIG_OPTION_CUSTOM;
+    backend_config_[i].customConfig = backend_custom_config[i];
+    config.push_back(&backend_config_[i]);
+  }
+
+  config.push_back(nullptr);
+  return Error::Ok;
+}
+
+} // namespace qnn
+} // namespace backends
+} // namespace executorch
diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiBackend.h b/backends/qualcomm/runtime/backends/lpai/LpaiBackend.h
new file mode 100644
index 00000000000..1b5b0ffa779
--- /dev/null
+++ b/backends/qualcomm/runtime/backends/lpai/LpaiBackend.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/qualcomm/runtime/backends/lpai/LpaiBackendCustomConfig.h>
+
+namespace executorch {
+namespace backends {
+namespace qnn {
+
+class LpaiBackend : public QnnBackend {
+ public:
+  LpaiBackend(
+      QnnImplementation* implementation,
+      QnnLogger* logger,
+      const SocInfo* soc_info);
+
+  Qnn_Version_t GetExpectedBackendVersion() const override;
+
+  bool IsProfileEventTypeParentOfNodeTime(
+      QnnProfile_EventType_t event_type) override;
+
+ protected:
+  executorch::runtime::Error MakeConfig(
+      std::vector<const QnnBackend_Config_t*>& config) override;
+
+ private:
+  std::vector<QnnBackend_Config_t> backend_config_;
+  std::unique_ptr<LpaiBackendCustomConfig> lpai_backend_custom_config_;
+};
+
+} // namespace qnn
+} // namespace backends
+} // namespace executorch
diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiBackendCustomConfig.cpp b/backends/qualcomm/runtime/backends/lpai/LpaiBackendCustomConfig.cpp
new file mode 100644
index 00000000000..30647f61211
--- /dev/null
+++ b/backends/qualcomm/runtime/backends/lpai/LpaiBackendCustomConfig.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
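+ *
+ * CreateBackendCustomConfig() below maps the LpaiHardwareVersion carried in
+ * the serialized SocInfo onto the matching QNN_LPAI_BACKEND_HW_VERSION_*
+ * constant and pins the target to QNN_LPAI_BACKEND_TARGET_ADSP; versions
+ * missing from the map keep the UNKNOWN default set by AllocHwInfo().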
+ */ + +#include + +namespace executorch { +namespace backends { +namespace qnn { + +LpaiBackendCustomConfig::LpaiBackendCustomConfig(const SocInfo* soc_info) + : soc_info_(soc_info) {} + +QnnLpaiBackend_CustomConfig_t* +LpaiBackendCustomConfig::AllocBackendCustomConfig() { + lpai_backend_config_.emplace_back( + std::make_unique()); + lpai_backend_config_.back()->option = QNN_LPAI_BACKEND_CUSTOM_CFG_UNDEFINED; + return lpai_backend_config_.back().get(); +} + +QnnLpaiBackend_CustomConfigHwInfo_t* LpaiBackendCustomConfig::AllocHwInfo() { + lpai_hw_info_.emplace_back( + std::make_unique()); + lpai_hw_info_.back()->hwVersion = QNN_LPAI_BACKEND_HW_VERSION_UNKNOWN; + lpai_hw_info_.back()->lpaiTarget = QNN_LPAI_BACKEND_TARGET_UNKNOWN; + return lpai_hw_info_.back().get(); +} + +std::vector +LpaiBackendCustomConfig::CreateBackendCustomConfig() { + std::vector ret; + QnnLpaiBackend_CustomConfig_t* p_custom_config = nullptr; + + std::unordered_map + lpai_hw_ver = { + {LpaiHardwareVersion::V1, QNN_LPAI_BACKEND_HW_VERSION_V1}, + {LpaiHardwareVersion::V2, QNN_LPAI_BACKEND_HW_VERSION_V2}, + {LpaiHardwareVersion::V3, QNN_LPAI_BACKEND_HW_VERSION_V3}, + {LpaiHardwareVersion::V4, QNN_LPAI_BACKEND_HW_VERSION_V4}, + {LpaiHardwareVersion::V5, QNN_LPAI_BACKEND_HW_VERSION_V5}, + {LpaiHardwareVersion::V5_1, QNN_LPAI_BACKEND_HW_VERSION_V5_1}, + {LpaiHardwareVersion::V6, QNN_LPAI_BACKEND_HW_VERSION_V6}, + {LpaiHardwareVersion::V7, QNN_LPAI_BACKEND_HW_VERSION_V7}, + }; + + p_custom_config = AllocBackendCustomConfig(); + auto p_hw_info = AllocHwInfo(); + p_custom_config->option = QNN_LPAI_BACKEND_CUSTOM_CFG_HW_INFO; + auto lpai_info = soc_info_->lpai_info(); + if (lpai_info && lpai_hw_ver.count(lpai_info->lpai_hardware_version())) { + p_hw_info->hwVersion = lpai_hw_ver[lpai_info->lpai_hardware_version()]; + } + p_hw_info->lpaiTarget = QNN_LPAI_BACKEND_TARGET_ADSP; + p_custom_config->config = p_hw_info; + ret.push_back(static_cast(p_custom_config)); + return ret; +} + +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiBackendCustomConfig.h b/backends/qualcomm/runtime/backends/lpai/LpaiBackendCustomConfig.h new file mode 100644 index 00000000000..3abbbaa8508 --- /dev/null +++ b/backends/qualcomm/runtime/backends/lpai/LpaiBackendCustomConfig.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
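+ *
+ * Ownership note: QNN only borrows the custom config structs it receives,
+ * so the Alloc*() helpers stash each struct in a member vector of
+ * unique_ptr and hand out raw pointers that stay valid for the lifetime of
+ * this object.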
+ */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include
+#include
+#include
+
+#include "LPAI/QnnLpaiBackend.h"
+
+namespace executorch {
+namespace backends {
+namespace qnn {
+
+using namespace qnn_delegate;
+
+class LpaiBackendCustomConfig {
+ public:
+  explicit LpaiBackendCustomConfig(const SocInfo* soc_info);
+
+  std::vector<QnnBackend_CustomConfig_t> CreateBackendCustomConfig();
+
+ private:
+  QnnLpaiBackend_CustomConfig_t* AllocBackendCustomConfig();
+  std::vector<std::unique_ptr<QnnLpaiBackend_CustomConfig_t>>
+      lpai_backend_config_;
+  const SocInfo* soc_info_;
+
+  std::vector<std::unique_ptr<QnnLpaiBackend_CustomConfigHwInfo_t>>
+      lpai_hw_info_;
+  QnnLpaiBackend_CustomConfigHwInfo_t* AllocHwInfo();
+};
+
+} // namespace qnn
+} // namespace backends
+} // namespace executorch
diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp b/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp
new file mode 100644
index 00000000000..d5203898f6b
--- /dev/null
+++ b/backends/qualcomm/runtime/backends/lpai/LpaiContext.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/qualcomm/runtime/backends/lpai/LpaiContext.h>
+
+namespace executorch {
+namespace backends {
+namespace qnn {
+
+using executorch::runtime::Error;
+
+LpaiContext::LpaiContext(
+    QnnImplementation* implementation,
+    QnnBackend* backend,
+    QnnDevice* device,
+    QnnBackendCache* cache,
+    QnnDlcManager* qnn_dlc_manager)
+    : QnnContext(implementation, backend, device, cache, qnn_dlc_manager) {
+  lpai_context_custom_config_ = std::make_unique<LpaiContextCustomConfig>();
+}
+
+Error LpaiContext::MakeConfig(
+    std::vector<const QnnContext_Config_t*>& config) {
+  const std::vector<QnnContext_CustomConfig_t>& context_custom_config =
+      lpai_context_custom_config_->CreateContextCustomConfig();
+
+  uint32_t num_custom_configs = context_custom_config.size();
+  // reserve one extra slot so the push_back in the __hexagon__ branch cannot
+  // reallocate and invalidate the element pointers already stored in config
+  context_config_.reserve(num_custom_configs + 1);
+  context_config_.resize(num_custom_configs);
+  // +1 for the null terminator
+  config.reserve(num_custom_configs + 1);
+
+  for (std::size_t i = 0; i < num_custom_configs; ++i) {
+    context_config_[i].option = QNN_CONTEXT_CONFIG_OPTION_CUSTOM;
+    context_config_[i].customConfig = context_custom_config[i];
+    config.push_back(&context_config_[i]);
+  }
+
+#ifdef __hexagon__
+  QnnContext_Config_t adsp_context_config;
+  adsp_context_config.option = QNN_CONTEXT_CONFIG_PERSISTENT_BINARY;
+  adsp_context_config.isPersistentBinary = 1;
+  context_config_.push_back(adsp_context_config);
+  config.push_back(&context_config_.back());
+#endif
+
+  config.push_back(nullptr);
+  return Error::Ok;
+}
+
+} // namespace qnn
+} // namespace backends
+} // namespace executorch
diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiContext.h b/backends/qualcomm/runtime/backends/lpai/LpaiContext.h
new file mode 100644
index 00000000000..b05dac469bf
--- /dev/null
+++ b/backends/qualcomm/runtime/backends/lpai/LpaiContext.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
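+ *
+ * On the on-device (__hexagon__) build, MakeConfig() additionally requests
+ * QNN_CONTEXT_CONFIG_PERSISTENT_BINARY so the context keeps reading the
+ * delegate blob in place; this pairs with QnnExecuTorchBackend::init()
+ * skipping processed->Free() under __hexagon__.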
+ */ + +#pragma once + +#include + +namespace executorch { +namespace backends { +namespace qnn { + +class QnnDlcManager; +class LpaiContext : public QnnContext { + public: + LpaiContext( + QnnImplementation* implementation, + QnnBackend* backend, + QnnDevice* device, + QnnBackendCache* cache, + QnnDlcManager* qnn_dlc_manager); + + protected: + executorch::runtime::Error MakeConfig( + std::vector& config) override; + + private: + std::vector context_config_; + std::unique_ptr lpai_context_custom_config_; +}; + +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiContextCustomConfig.h b/backends/qualcomm/runtime/backends/lpai/LpaiContextCustomConfig.h new file mode 100644 index 00000000000..d415eeb51df --- /dev/null +++ b/backends/qualcomm/runtime/backends/lpai/LpaiContextCustomConfig.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include + +#include "LPAI/QnnLpaiContext.h" + +namespace executorch { +namespace backends { +namespace qnn { + +using namespace qnn_delegate; + +class LpaiContextCustomConfig { + public: + explicit LpaiContextCustomConfig() {} + + std::vector CreateContextCustomConfig(); + + private: + QnnLpaiContext_CustomConfig_t* AllocContextCustomConfig() { + lpai_context_config_.emplace_back( + std::make_unique()); + lpai_context_config_.back()->option = QNN_LPAI_CONTEXT_SET_CFG_UNDEFINED; + return lpai_context_config_.back().get(); + } + std::vector> + lpai_context_config_; +}; + +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiDevice.h b/backends/qualcomm/runtime/backends/lpai/LpaiDevice.h new file mode 100644 index 00000000000..14edcb2d66a --- /dev/null +++ b/backends/qualcomm/runtime/backends/lpai/LpaiDevice.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include + +namespace executorch { +namespace backends { +namespace qnn { + +using executorch::runtime::Error; + +class LpaiDevice : public QnnDevice { + public: + LpaiDevice(QnnImplementation* implementation, QnnLogger* logger) + : QnnDevice(implementation, logger){}; + + executorch::runtime::Error Configure() override; + + private: + std::vector device_config_; +}; + +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiGraph.h b/backends/qualcomm/runtime/backends/lpai/LpaiGraph.h new file mode 100644 index 00000000000..cf1cce90295 --- /dev/null +++ b/backends/qualcomm/runtime/backends/lpai/LpaiGraph.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
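+ *
+ * LPAI graph configs can only be applied once the graph object exists, so
+ * Configure() below runs in three steps:
+ *   1. QnnGraph::Configure(graph_name)   // create or retrieve the graph
+ *   2. qnn_graph_set_config(...)         // push LPAI custom configs
+ *   3. AfterConfigure(graph_name)        // platform hook, e.g. re-finalize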
+ */
+
+#pragma once
+
+#include <executorch/backends/qualcomm/runtime/backends/lpai/LpaiGraphCustomConfig.h>
+
+namespace executorch {
+namespace backends {
+namespace qnn {
+
+class LpaiGraph : public QnnGraph {
+ public:
+  LpaiGraph(
+      QnnImplementation* implementation,
+      QnnBackend* backend,
+      QnnContext* context,
+      const QnnExecuTorchProfileLevel& profile_level,
+      const QnnExecuTorchLpaiBackendOptions* lpai_options)
+      : QnnGraph(implementation, backend, context, profile_level) {
+    lpai_graph_custom_config_ =
+        std::make_unique<LpaiGraphCustomConfig>(lpai_options, this);
+  };
+
+  executorch::runtime::Error Configure(const std::string& graph_name) override {
+    Error configure_status = QnnGraph::Configure(graph_name);
+    if (configure_status != Error::Ok) {
+      return configure_status;
+    }
+    const std::vector<QnnGraph_CustomConfig_t>& graph_custom_config =
+        lpai_graph_custom_config_->CreateGraphCustomConfig(graph_name);
+
+    std::vector<const QnnGraph_Config_t*> config;
+    uint32_t num_custom_configs = graph_custom_config.size();
+    graph_config_.resize(num_custom_configs);
+    // +1 for the null terminator
+    config.reserve(num_custom_configs + 1);
+
+    for (std::size_t i = 0; i < num_custom_configs; ++i) {
+      graph_config_[i].option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+      graph_config_[i].customConfig = graph_custom_config[i];
+      config.push_back(&graph_config_[i]);
+    }
+    config.push_back(nullptr);
+
+    // LPAI-specific: these configs can only be set after graph creation
+    const QnnInterface& qnn_interface = implementation_->GetQnnInterface();
+    Qnn_ErrorHandle_t error =
+        qnn_interface.qnn_graph_set_config(handle_[graph_name], config.data());
+    if (error != QNN_SUCCESS) {
+      QNN_EXECUTORCH_LOG_ERROR(
+          "qnn_graph_set_config failed. Error %d", QNN_GET_ERROR_CODE(error));
+      return Error::Internal;
+    }
+
+    // platform specific behavior
+    return AfterConfigure(graph_name);
+  }
+
+  friend LpaiGraphCustomConfig;
+
+ protected:
+  executorch::runtime::Error MakeConfig(
+      std::vector<const QnnGraph_Config_t*>& config) override {
+    // graph configs are applied in Configure() above, after graph creation
+    return executorch::runtime::Error::Ok;
+  }
+
+ private:
+  executorch::runtime::Error AfterConfigure(const std::string& graph_name);
+  std::vector<QnnGraph_Config_t> graph_config_;
+  std::unique_ptr<LpaiGraphCustomConfig> lpai_graph_custom_config_;
+};
+
+} // namespace qnn
+} // namespace backends
+} // namespace executorch
diff --git a/backends/qualcomm/runtime/backends/lpai/LpaiGraphCustomConfig.h b/backends/qualcomm/runtime/backends/lpai/LpaiGraphCustomConfig.h
new file mode 100644
index 00000000000..67c1bca1279
--- /dev/null
+++ b/backends/qualcomm/runtime/backends/lpai/LpaiGraphCustomConfig.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
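+ *
+ * Each Alloc*() helper below appends an UNDEFINED/zero-initialized struct
+ * to a member vector of unique_ptr and returns a stable raw pointer, so
+ * every config survives until QNN has consumed it; the perf fields (fps,
+ * ftrtRatio, clientType) and the core affinity are filled in from the
+ * QnnExecuTorchLpaiBackendOptions captured at construction.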
+ */ + +#pragma once + +#include +#include + +#include +#include +#include + +#include "LPAI/QnnLpaiGraph.h" +#include "LPAI/QnnLpaiGraphPrepare.h" + +namespace executorch { +namespace backends { +namespace qnn { + +using namespace qnn_delegate; + +class LpaiGraph; +class LpaiGraphCustomConfig { + public: + explicit LpaiGraphCustomConfig( + const QnnExecuTorchLpaiBackendOptions* lpai_options, + LpaiGraph* graph) + : graph_(graph), lpai_options_(lpai_options){}; + + std::vector CreateGraphCustomConfig( + const std::string& graph_name); + + private: + [[maybe_unused]] LpaiGraph* graph_; + [[maybe_unused]] const QnnExecuTorchLpaiBackendOptions* lpai_options_; + + std::vector> lpai_mem_; + QnnLpaiGraph_Mem_t* AllocMem() { + lpai_mem_.emplace_back(std::make_unique()); + lpai_mem_.back()->memType = QNN_LPAI_MEM_TYPE_UNDEFINED; + lpai_mem_.back()->size = 0; + lpai_mem_.back()->addr = nullptr; + return lpai_mem_.back().get(); + } + + std::vector> lpai_graph_config_; + QnnLpaiGraph_CustomConfig_t* AllocGraphCustomConfig() { + lpai_graph_config_.emplace_back( + std::make_unique()); + lpai_graph_config_.back()->option = QNN_LPAI_GRAPH_SET_CFG_UNDEFINED; + return lpai_graph_config_.back().get(); + } + + std::vector> lpai_perf_cfg_; + QnnLpaiGraph_PerfCfg_t* AllocPerfCfg() { + lpai_perf_cfg_.emplace_back(std::make_unique()); + lpai_perf_cfg_.back()->fps = 0; + lpai_perf_cfg_.back()->ftrtRatio = 0; + lpai_perf_cfg_.back()->clientType = + QNN_LPAI_GRAPH_CLIENT_PERF_TYPE_UNDEFINED; + return lpai_perf_cfg_.back().get(); + } + + std::vector> lpai_core_affinity_; + QnnLpaiGraph_CoreAffinity_t* AllocCoreAffinity() { + lpai_core_affinity_.emplace_back( + std::make_unique()); + lpai_core_affinity_.back()->affinity = + QNN_LPAI_GRAPH_CORE_AFFINITY_UNDEFINED; + lpai_core_affinity_.back()->coreSelection = 0; + return lpai_core_affinity_.back().get(); + } + + std::vector> + lpai_prepare_; + QnnLpaiGraph_CustomConfigPrepare_t* AllocPrepare() { + lpai_prepare_.emplace_back( + std::make_unique()); + lpai_prepare_.back()->enablePerLayer = 0; + lpai_prepare_.back()->enableCoreSelection = nullptr; + return lpai_prepare_.back().get(); + } + + std::vector scratch_buf_, persistent_buf_; +}; + +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/runtime/backends/lpai/aarch64/LpaiContextCustomConfig.cpp b/backends/qualcomm/runtime/backends/lpai/aarch64/LpaiContextCustomConfig.cpp new file mode 100644 index 00000000000..3f578ff0636 --- /dev/null +++ b/backends/qualcomm/runtime/backends/lpai/aarch64/LpaiContextCustomConfig.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace executorch { +namespace backends { +namespace qnn { + +std::vector +LpaiContextCustomConfig::CreateContextCustomConfig() { + std::vector ret; +#ifndef __hexagon__ + QnnLpaiContext_CustomConfig_t* p_custom_config = nullptr; + + // TODO: support graph based execution + p_custom_config = AllocContextCustomConfig(); + p_custom_config->option = QNN_LPAI_CONTEXT_SET_CFG_ENABLE_ISLAND; + p_custom_config->config = nullptr; + ret.push_back(static_cast(p_custom_config)); +#endif + return ret; +} + +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/runtime/backends/lpai/aarch64/LpaiDevice.cpp b/backends/qualcomm/runtime/backends/lpai/aarch64/LpaiDevice.cpp new file mode 100644 index 00000000000..4bcba99f5e3 --- /dev/null +++ b/backends/qualcomm/runtime/backends/lpai/aarch64/LpaiDevice.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace executorch { +namespace backends { +namespace qnn { + +Error LpaiDevice::Configure() { +#ifndef __hexagon__ + return QnnDevice::Configure(); +#else + return Error::Ok; +#endif +} + +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/runtime/backends/lpai/aarch64/LpaiGraph.cpp b/backends/qualcomm/runtime/backends/lpai/aarch64/LpaiGraph.cpp new file mode 100644 index 00000000000..8ed8df998e3 --- /dev/null +++ b/backends/qualcomm/runtime/backends/lpai/aarch64/LpaiGraph.cpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace executorch { +namespace backends { +namespace qnn { + +Error LpaiGraph::AfterConfigure(const std::string& graph_name) { + // LPAI does not support online prepare and require graph to be finalized + // again + Qnn_ErrorHandle_t error = GraphFinalize(graph_name); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR( + "Failed to finalize Qnn Graph with error: %d", + QNN_GET_ERROR_CODE(error)); + return Error::Internal; + } + return Error::Ok; +} + +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/runtime/backends/lpai/aarch64/LpaiGraphCustomConfig.cpp b/backends/qualcomm/runtime/backends/lpai/aarch64/LpaiGraphCustomConfig.cpp new file mode 100644 index 00000000000..1d0f231b99e --- /dev/null +++ b/backends/qualcomm/runtime/backends/lpai/aarch64/LpaiGraphCustomConfig.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace executorch { +namespace backends { +namespace qnn { + +std::vector +LpaiGraphCustomConfig::CreateGraphCustomConfig(const std::string& graph_name) { + std::vector configs; + QnnLpaiGraph_CustomConfig_t* p_custom_config = nullptr; + +#ifdef __hexagon__ + uint32_t scratch_size = 0; + uint32_t persistent_size = 0; + QnnLpaiGraph_CustomProperty_t custom_props[2]; + custom_props[0].option = QNN_LPAI_GRAPH_GET_PROP_SCRATCH_MEM_SIZE; + custom_props[0].property = &scratch_size; + custom_props[1].option = QNN_LPAI_GRAPH_GET_PROP_PERSISTENT_MEM_SIZE; + custom_props[1].property = &persistent_size; + + QnnGraph_Property_t graph_props[2]; + graph_props[0].option = QNN_GRAPH_PROPERTY_OPTION_CUSTOM; + graph_props[0].customProperty = &custom_props[0]; + graph_props[1].option = QNN_GRAPH_PROPERTY_OPTION_CUSTOM; + graph_props[1].customProperty = &custom_props[1]; + QnnGraph_Property_t* graph_prop_ptrs[3] = {0}; + graph_prop_ptrs[0] = &graph_props[0]; + graph_prop_ptrs[1] = &graph_props[1]; + + const QnnInterface& qnn_interface = + graph_->implementation_->GetQnnInterface(); + Qnn_ErrorHandle_t error = qnn_interface.qnn_graph_get_property( + graph_->handle_[graph_name], graph_prop_ptrs); + + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR( + "failed to get graph property: %d", QNN_GET_ERROR_CODE(error)); + return {}; + } + + scratch_buf_.resize(scratch_size); + p_custom_config = AllocGraphCustomConfig(); + p_custom_config->option = QNN_LPAI_GRAPH_SET_CFG_SCRATCH_MEM; + auto p_scratch_config = AllocMem(); + p_scratch_config->memType = QNN_LPAI_MEM_TYPE_DDR; + p_scratch_config->size = scratch_size; + p_scratch_config->addr = scratch_buf_.data(); + p_custom_config->config = p_scratch_config; + configs.push_back(p_custom_config); + + persistent_buf_.resize(persistent_size); + p_custom_config = AllocGraphCustomConfig(); + p_custom_config->option = QNN_LPAI_GRAPH_SET_CFG_PERSISTENT_MEM_DEFAULT; + auto p_persistent_config = AllocMem(); + p_persistent_config->memType = QNN_LPAI_MEM_TYPE_DDR; + p_persistent_config->size = persistent_size; + p_persistent_config->addr = persistent_buf_.data(); + p_custom_config->config = p_persistent_config; + configs.push_back(p_custom_config); + // TODO: figure out how to add perf control (internal enum required) + // e.g. 
QNN_LPAI_GRAPH_SET_ENPU_CLOCK +#endif + // perf config + p_custom_config = AllocGraphCustomConfig(); + auto p_perf_cfg = AllocPerfCfg(); + p_custom_config->option = QNN_LPAI_GRAPH_SET_CFG_PERF_CFG; + p_perf_cfg->fps = lpai_options_->fps(); + p_perf_cfg->ftrtRatio = lpai_options_->ftrt_ratio(); + p_perf_cfg->clientType = static_cast( + lpai_options_->client_perf_type()); + p_custom_config->config = p_perf_cfg; + configs.push_back(p_custom_config); + // core affinity + p_custom_config = AllocGraphCustomConfig(); + auto p_core_affinity = AllocCoreAffinity(); + p_custom_config->option = QNN_LPAI_GRAPH_SET_CFG_CORE_AFFINITY; + p_core_affinity->affinity = + static_cast(lpai_options_->affinity()); + p_core_affinity->coreSelection = lpai_options_->core_selection(); + p_custom_config->config = p_core_affinity; + configs.push_back(p_custom_config); + return configs; +} + +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/runtime/backends/lpai/x86_64/LpaiContextCustomConfig.cpp b/backends/qualcomm/runtime/backends/lpai/x86_64/LpaiContextCustomConfig.cpp new file mode 100644 index 00000000000..7f6ece6e1df --- /dev/null +++ b/backends/qualcomm/runtime/backends/lpai/x86_64/LpaiContextCustomConfig.cpp @@ -0,0 +1,22 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace executorch { +namespace backends { +namespace qnn { + +std::vector +LpaiContextCustomConfig::CreateContextCustomConfig() { + return {}; +} + +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/runtime/backends/lpai/x86_64/LpaiDevice.cpp b/backends/qualcomm/runtime/backends/lpai/x86_64/LpaiDevice.cpp new file mode 100644 index 00000000000..739159e95b2 --- /dev/null +++ b/backends/qualcomm/runtime/backends/lpai/x86_64/LpaiDevice.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace executorch { +namespace backends { +namespace qnn { + +Error LpaiDevice::Configure() { + return Error::Ok; +} + +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/runtime/backends/lpai/x86_64/LpaiGraph.cpp b/backends/qualcomm/runtime/backends/lpai/x86_64/LpaiGraph.cpp new file mode 100644 index 00000000000..391bbcc83fc --- /dev/null +++ b/backends/qualcomm/runtime/backends/lpai/x86_64/LpaiGraph.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
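On the Hexagon side, the code above first asks the graph for its scratch and persistent memory requirements, then backs each requirement with a host-owned DDR buffer whose lifetime must span every execution of the graph (hence the scratch_buf_/persistent_buf_ members). A condensed sketch of that query-then-provide flow with stand-in types:

#include <cstdint>
#include <vector>

struct MemConfig { // stand-in for QnnLpaiGraph_Mem_t
  uint32_t size = 0;
  void* addr = nullptr;
};

// Sizes come from a graph-property query; the caller-owned vectors keep the
// memory alive for as long as the graph may run.
void ProvideMemory(
    uint32_t scratch_size,
    uint32_t persistent_size,
    std::vector<uint8_t>& scratch_buf,
    std::vector<uint8_t>& persistent_buf,
    MemConfig& scratch_cfg,
    MemConfig& persistent_cfg) {
  scratch_buf.resize(scratch_size);
  scratch_cfg = MemConfig{scratch_size, scratch_buf.data()};

  persistent_buf.resize(persistent_size);
  persistent_cfg = MemConfig{persistent_size, persistent_buf.data()};
}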
+ */ + +#include + +namespace executorch { +namespace backends { +namespace qnn { + +Error LpaiGraph::AfterConfigure(const std::string& graph_name) { + return Error::Ok; +} + +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/runtime/backends/lpai/x86_64/LpaiGraphCustomConfig.cpp b/backends/qualcomm/runtime/backends/lpai/x86_64/LpaiGraphCustomConfig.cpp new file mode 100644 index 00000000000..5720fb19997 --- /dev/null +++ b/backends/qualcomm/runtime/backends/lpai/x86_64/LpaiGraphCustomConfig.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace executorch { +namespace backends { +namespace qnn { + +std::vector +LpaiGraphCustomConfig::CreateGraphCustomConfig(const std::string& graph_name) { + std::vector configs; + QnnLpaiGraph_CustomConfig_t* p_custom_config = nullptr; + + p_custom_config = AllocGraphCustomConfig(); + auto p_core_prepare = AllocPrepare(); + static char core_selection = lpai_options_->core_selection() + '0'; + p_custom_config->option = QNN_LPAI_GRAPH_SET_CFG_PREPARE; + p_core_prepare->enableCoreSelection = &core_selection; + p_custom_config->config = p_core_prepare; + configs.push_back(static_cast(p_custom_config)); + return configs; +} + +} // namespace qnn +} // namespace backends +} // namespace executorch diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index b8f366d2f7c..e8fd342bc61 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -44,6 +44,8 @@ BUILD_ANDROID="true" CMAKE_ANDROID="build-android" BUILD_OE_LINUX="false" CMAKE_OE_LINUX="build-oe-linux" +BUILD_HEXAGON="false" +CMAKE_HEXAGON="build-hexagon" CLEAN="true" BUILD_TYPE="RelWithDebInfo" BUILD_JOB_NUMBER="16" @@ -56,7 +58,7 @@ if [ -z BUCK2 ]; then BUCK2="buck2" fi -long_options=skip_x86_64,skip_linux_android,skip_linux_embedded,enable_linux_embedded,no_clean,release,job_number: +long_options=skip_x86_64,skip_linux_android,skip_linux_embedded,skip_hexagon,enable_linux_embedded,enable_hexagon,no_clean,release,job_number: parsed_args=$(getopt -a --options '' --longoptions $long_options --name "$0" -- "$@") eval set -- "$parsed_args" @@ -68,6 +70,8 @@ while true ; do --skip_linux_android) BUILD_ANDROID="false"; shift;; --skip_linux_embedded) BUILD_OE_LINUX="false"; shift;; --enable_linux_embedded) BUILD_ANDROID="false"; BUILD_OE_LINUX="true"; shift;; + --skip_hexagon) BUILD_HEXAGON="false"; shift;; + --enable_hexagon) BUILD_HEXAGON="true"; shift;; --no_clean) CLEAN="false"; shift;; --release) BUILD_TYPE="Release"; shift;; --job_number) BUILD_JOB_NUMBER="$2"; shift 2;; @@ -239,6 +243,56 @@ if [ "$BUILD_OE_LINUX" = true ]; then cmake --build $LLAMA_EXAMPLE_ROOT -j$BUILD_JOB_NUMBER fi +if [ "$BUILD_HEXAGON" = true ]; then + if [[ -z ${ANDROID_NDK_ROOT} ]]; then + echo "Please export ANDROID_NDK_ROOT=/path/to/android_ndkXX" + exit -1 + fi + if [[ -z ${HEXAGON_SDK_ROOT} ]]; then + echo "Please export HEXAGON_SDK_ROOT=/path/to/hexagon-sdk-x.x.x" + exit -1 + fi + if [[ -z ${HEXAGON_TOOLS_ROOT} ]]; then + echo "Please export HEXAGON_TOOLS_ROOT=/path/to/hexagon-sdk-x.x.x/tools/HEXAGON_Tools/x.x.x" + exit -1 + fi + if [[ -z ${HEXAGON_ARCH} ]]; then + echo "Please export HEXAGON_ARCH=xx. e.g. 
SM8750=v79" + exit -1 + fi + + BUILD_ROOT=$PRJ_ROOT/$CMAKE_HEXAGON + if [ "$CLEAN" = true ]; then + rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT + else + # force rebuild flatccrt for the correct platform + cd $BUILD_ROOT/third-party/flatcc && make clean + fi + cd $BUILD_ROOT + cmake .. \ + -DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ + -DEXECUTORCH_BUILD_QNN=ON \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DFLATCC_ALLOW_WERROR=OFF \ + -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ + -DHEXAGON_SDK_ROOT=$HEXAGON_SDK_ROOT \ + -DHEXAGON_TOOLS_ROOT=$HEXAGON_TOOLS_ROOT \ + -DHEXAGON_ARCH=$HEXAGON_ARCH \ + -DCMAKE_TOOLCHAIN_FILE=$HEXAGON_SDK_ROOT/build/cmake/hexagon_toolchain.cmake \ + -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ + -B$BUILD_ROOT + + cmake --build $BUILD_ROOT -j$BUILD_JOB_NUMBER --target install +fi + if [ "$BUILD_X86_64" = true ]; then BUILD_ROOT=$PRJ_ROOT/$CMAKE_X86_64 if [ "$CLEAN" = true ]; then diff --git a/backends/qualcomm/scripts/lpai_utils.sh b/backends/qualcomm/scripts/lpai_utils.sh new file mode 100755 index 00000000000..13c8f8c2430 --- /dev/null +++ b/backends/qualcomm/scripts/lpai_utils.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +if [[ -z $HEXAGON_SDK_ROOT || -z $QNN_SDK_ROOT ]]; then + echo "please export HEXAGON_SDK_ROOT and QNN_SDK_ROOT" + exit -1 +fi + +usage() { echo "usage: $0 [--serial abc] [--workspace /data/tmp/local/xxx] [--direct] [--lpai v6] [--hexagon v81] [--artifact /path/to/artifacts]" 1>&2; exit 1; } + +short=H:,s:,w:,l:,x:,a:,d,h, +long=host:,serial:,workspace:,lpai:,hexagon:,artifact:,direct,help +args=$(getopt -a -o $short -l $long -n $0 -- $@) +eval set -- $args + +host="" +serial="" +workspace="" +mode="hlos" +lpai="" +hexagon="" +artifact="" +while true; do + case $1 in + -H | --host) host=$2; shift 2;; + -s | --serial) serial=$2; shift 2;; + -w | --workspace) workspace=$2; shift 2;; + -l | --lpai) lpai=$2; shift 2;; + -x | --hexagon) hexagon=$2; shift 2;; + -a | --artifact) artifact=$2; shift 2;; + -d | --direct) mode="direct"; shift;; + -h | --help) usage;; + --) shift; break;; + *) echo "unknown keyword: $1"; usage;; + esac +done + +if [[ -z $lpai ]]; then + echo "please specify lpai version" + usage +elif [[ $mode == "direct" && -z $workspace ]]; then + echo "please provide device serial and workspace while using direct mode" + usage +fi + +signed_folder=$QNN_SDK_ROOT/lib/lpai-$lpai/signed +signer=$HEXAGON_SDK_ROOT/tools/elfsigner/elfsigner.py +mkdir -p $signed_folder + +if [[ $mode == "hlos" ]]; then + yes 2>/dev/null | python $signer -i $QNN_SDK_ROOT/lib/lpai-$lpai/unsigned/libQnnLpaiSkel.so -o $signed_folder +else + if [[ -z $hexagon ]]; then + echo "please specify hexagon arch" + fi + adb_args="" + if [[ ! -z $host ]]; then + adb_args="$adb_args -H $host" + fi + if [[ ! 
-z $serial ]]; then + adb_args="$adb_args -s $serial" + fi + adb $adb_args shell mkdir -p $workspace + yes 2>/dev/null | python $signer -i $QNN_SDK_ROOT/lib/lpai-$lpai/unsigned/libQnnLpai.so -o $signed_folder + yes 2>/dev/null | python $signer -i $QNN_SDK_ROOT/lib/hexagon-$hexagon/unsigned/libQnnSystem.so -o $signed_folder + yes 2>/dev/null | python $signer -i $HEXAGON_TOOLS_ROOT/Tools/target/hexagon/lib/$hexagon/G0/pic/libc++.so.1.0 -o $signed_folder + yes 2>/dev/null | python $signer -i build-hexagon/backends/qualcomm/qnn_executorch/fastrpc/libqnn_executorch_skel.so -o $signed_folder + yes 2>/dev/null | python $signer -i build-hexagon/backends/qualcomm/libqnn_executorch_backend.so -o $signed_folder + if [[ ! -z $artifact ]]; then + adb $adb_args push $(find $QNN_SDK_ROOT/lib/lpai-$lpai/signed/ -name 'lib*' ! -name '*LpaiSkel*') $workspace + adb $adb_args push build-android/backends/qualcomm/qnn_executorch/fastrpc/libqnn_executorch_stub.so $workspace + adb $adb_args push build-android/backends/qualcomm/qnn_executorch/fastrpc/qnn_executor_runner $workspace + adb $adb_args shell "cd $workspace && rm -f libc++.so.1 && ln -s libc++.so.1.0 libc++.so.1" + pte=$(find $artifact -type f -name "*.pte") + input_list=$(find $artifact -type f -name "input*.txt") + input_data=$(find $artifact -type f -name "input*.raw") + output_folder=output_direct + adb $adb_args shell "rm -rf $workspace/$output_folder && mkdir -p $workspace/$output_folder" + adb $adb_args push $pte $input_list $input_data $workspace + adb $adb_args shell "cd $workspace && \ + export LD_LIBRARY_PATH=. && export ADSP_LIBRARY_PATH=. \ + && echo 0x0C > qnn_executor_runner.farf && logcat -c \ + && ./qnn_executor_runner --model_path $(basename $pte) --output_folder_path $output_folder" + adb $adb_args pull $workspace/$output_folder $artifact + fi +fi diff --git a/backends/qualcomm/scripts/sqnr_verifier.py b/backends/qualcomm/scripts/sqnr_verifier.py new file mode 100644 index 00000000000..77ecfd97097 --- /dev/null +++ b/backends/qualcomm/scripts/sqnr_verifier.py @@ -0,0 +1,38 @@ +import argparse + +import numpy as np +import torch +from torchao.quantization.utils import compute_error + +parser = argparse.ArgumentParser() +parser.add_argument( + "-g", + "--golden", + nargs="+", + type=str, +) +parser.add_argument( + "-o", + "--output", + nargs="+", + type=str, +) +parser.add_argument( + "-d", + "--dtype", + type=str, +) +parser.add_argument( + "-e", + "--encoding", + type=str, +) +args = parser.parse_args() +golden = [np.fromfile(f, dtype=np.float32) for f in args.golden] +output = [np.fromfile(f, dtype=eval(f"np.{args.dtype}")) for f in args.output] + +with open(args.encoding, "r") as f: + for i, g in enumerate(golden): + enc = [float(x) for x in f.readline().split()] + o = torch.from_numpy(output[i]).to(torch.float).sub(enc[1]).mul(enc[0]) + print(f"SQNR_{i}: {compute_error(torch.from_numpy(g), o)}") diff --git a/backends/qualcomm/serialization/qc_compiler_spec.fbs b/backends/qualcomm/serialization/qc_compiler_spec.fbs index c75c5cb3662..56e5af0c7b6 100644 --- a/backends/qualcomm/serialization/qc_compiler_spec.fbs +++ b/backends/qualcomm/serialization/qc_compiler_spec.fbs @@ -29,6 +29,24 @@ table HtpInfo { vtcm_size_in_mb:uint; } +/// Defines the LPAI hardware architecture available for LPAI backend. 
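For reference, sqnr_verifier.py dequantizes each raw output with the scale/zero-point pair recorded in encoding.txt and scores it against the float golden using torchao's compute_error, which reports the signal-to-quantization-noise ratio in decibels (higher is better):

\hat{x} = s\,(q - z), \qquad
\mathrm{SQNR}(x, \hat{x}) = 20 \log_{10} \frac{\lVert x \rVert_2}{\lVert x - \hat{x} \rVert_2}\ \text{dB}

where q is the raw quantized output, s the scale, z the zero point, and x the golden float tensor.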
+enum LpaiHardwareVersion: int {
+  NONE = 0,
+  V1,
+  V2,
+  V3,
+  V4,
+  V5,
+  V5_1 = 0x10005,
+  V6 = 6,
+  V7,
+}
+
+table LpaiInfo {
+  /// Represents the LPAI hardware version.
+  lpai_hardware_version:LpaiHardwareVersion;
+}
+
 /// You could refer to Qualcomm AI Engine Direct SDK
 /// to get SoC Model in supported snapdragon devices
 enum QcomChipset: int {
@@ -60,6 +78,9 @@ table SocInfo {

   /// Identifies the htp information of the specified SoC.
   htp_info:HtpInfo;
+
+  /// Identifies the lpai information of the specified SoC.
+  lpai_info:LpaiInfo;
 }

 /// Defines performance modes available for GPU backend.
@@ -133,7 +154,7 @@ enum QnnExecuTorchBackendType: int {
   kUndefinedBackend = 0,
   kGpuBackend,
   kHtpBackend,
-  kDspBackend,
+  kLpaiBackend,
 }

 /// Defines pd sessions available for HTP backend.
@@ -181,7 +202,43 @@ table QnnExecuTorchHtpBackendOptions {
   use_weight_sharing:bool;
 }

-/// Logging level of the delegate and QNN backend.
+/// Real-time: Indicates that the model is intended for real-time use cases, where a specific performance threshold must be met.
+/// Non-real-time: Refers to models without strict performance requirements.
+enum QnnExecuTorchLpaiClientPerf: int {
+  /// kUndefined is reserved for compatibility with older flatbuffers
+  kUndefined = 0,
+  kRealTime,
+  kNonRealTime,
+}
+
+/// Offloaded ops shall be executed on the core with the affinity specified.
+enum QnnExecuTorchLpaiCoreAffinity: int {
+  /// kUndefined is reserved for compatibility with older flatbuffers
+  kUndefined = 0,
+  kSoft,
+  kHard,
+}
+
+/// Specifies the backend options for the LPAI backend.
table QnnExecuTorchLpaiBackendOptions {
+  /// Specifies how frequently inference must be completed.
+  fps:int;
+
+  /// Determines the hardware configuration to meet the latency requirement for inference.
+  /// To ensure inference completes within this reduced time window, the eNPU must be boosted.
+  ftrt_ratio:int;
+
+  /// Refers to models with or without strict performance requirements.
+  client_perf_type:QnnExecuTorchLpaiClientPerf;
+
+  /// Offloaded ops shall be executed on the core with the affinity specified.
+  affinity:QnnExecuTorchLpaiCoreAffinity;
+
+  /// Selects which core to use for execution.
+  core_selection:int;
+}
+
+/// Logging level of the delegate and QNN backend.
enum QnnExecuTorchLogLevel: int { kLogOff = 0, kLogLevelError, @@ -251,6 +308,8 @@ table QnnExecuTorchBackendOptions { htp_options:QnnExecuTorchHtpBackendOptions; gpu_options:QnnExecuTorchGpuBackendOptions; + + lpai_options:QnnExecuTorchLpaiBackendOptions; } table QnnExecuTorchOptions { diff --git a/backends/qualcomm/serialization/qc_schema.py b/backends/qualcomm/serialization/qc_schema.py index 7967e80d18b..24c45c5f3e3 100644 --- a/backends/qualcomm/serialization/qc_schema.py +++ b/backends/qualcomm/serialization/qc_schema.py @@ -9,7 +9,7 @@ """ from dataclasses import dataclass, field -from enum import IntEnum, unique +from enum import auto, IntEnum, unique from typing import List, Optional @@ -36,6 +36,25 @@ class HtpInfo: vtcm_size_in_mb: int = 0 +@unique +class LpaiHardwareVersion(IntEnum): + NONE = 0 + V1 = auto() + V2 = auto() + V3 = auto() + V4 = auto() + V5 = auto() + V5_1 = 0x10005 + V6 = 6 + V7 = auto() + + +@dataclass +class LpaiInfo: + lpai_hardware_version: LpaiHardwareVersion = LpaiHardwareVersion.NONE + # TODO: see if we need to expose num_cores of adsp + + @unique class QcomChipset(IntEnum): UNKNOWN_SM = 0 @@ -63,6 +82,7 @@ class QcomChipset(IntEnum): class SocInfo: soc_model: QcomChipset = QcomChipset.UNKNOWN_SM htp_info: HtpInfo = field(default_factory=HtpInfo) + lpai_info: Optional[LpaiInfo] = None _soc_info_table = { @@ -73,15 +93,21 @@ class SocInfo: QcomChipset.SM8550: SocInfo(QcomChipset.SM8550, HtpInfo(HtpArch.V73, 8)), QcomChipset.SA8255: SocInfo(QcomChipset.SA8255, HtpInfo(HtpArch.V73, 8)), QcomChipset.SM8650: SocInfo(QcomChipset.SM8650, HtpInfo(HtpArch.V75, 8)), - QcomChipset.SM8750: SocInfo(QcomChipset.SM8750, HtpInfo(HtpArch.V79, 8)), - QcomChipset.SM8850: SocInfo(QcomChipset.SM8850, HtpInfo(HtpArch.V81, 8)), + QcomChipset.SM8750: SocInfo( + QcomChipset.SM8750, HtpInfo(HtpArch.V79, 8), LpaiInfo(LpaiHardwareVersion.V5) + ), + QcomChipset.SM8850: SocInfo( + QcomChipset.SM8850, HtpInfo(HtpArch.V81, 8), LpaiInfo(LpaiHardwareVersion.V6) + ), QcomChipset.SSG2115P: SocInfo(QcomChipset.SSG2115P, HtpInfo(HtpArch.V73, 2)), QcomChipset.SSG2125P: SocInfo(QcomChipset.SSG2125P, HtpInfo(HtpArch.V73, 2)), QcomChipset.SXR1230P: SocInfo(QcomChipset.SXR1230P, HtpInfo(HtpArch.V73, 2)), QcomChipset.SXR2230P: SocInfo(QcomChipset.SXR2230P, HtpInfo(HtpArch.V69, 8)), QcomChipset.SXR2330P: SocInfo(QcomChipset.SXR2330P, HtpInfo(HtpArch.V79, 8)), QcomChipset.QCS9100: SocInfo(QcomChipset.QCS9100, HtpInfo(HtpArch.V73, 8)), - QcomChipset.SAR2230P: SocInfo(QcomChipset.SAR2230P, HtpInfo(HtpArch.V81, 4)), + QcomChipset.SAR2230P: SocInfo( + QcomChipset.SAR2230P, HtpInfo(HtpArch.V81, 4), LpaiInfo(LpaiHardwareVersion.V6) + ), QcomChipset.SW6100: SocInfo(QcomChipset.SW6100, HtpInfo(HtpArch.V81, 4)), QcomChipset.QCM6490: SocInfo(QcomChipset.QCM6490, HtpInfo(HtpArch.V68, 2)), } @@ -146,7 +172,7 @@ class QnnExecuTorchBackendType(IntEnum): kUndefinedBackend = 0 kGpuBackend = 1 kHtpBackend = 2 - kDspBackend = 3 + kLpaiBackend = 3 @dataclass @@ -164,6 +190,31 @@ class QnnExecuTorchHtpBackendOptions: use_weight_sharing: bool = False +@unique +class QnnExecuTorchLpaiClientPerf(IntEnum): + kUndefined = 0 + kRealTime = 1 + kNonRealTime = 2 + + +@unique +class QnnExecuTorchLpaiCoreAffinity(IntEnum): + kUndefined = 0 + kSoft = 1 + kHard = 2 + + +@dataclass +class QnnExecuTorchLpaiBackendOptions: + fps: int = 1 + ftrt_ratio: int = 10 + client_perf_type: QnnExecuTorchLpaiClientPerf = ( + QnnExecuTorchLpaiClientPerf.kRealTime + ) + affinity: QnnExecuTorchLpaiCoreAffinity = QnnExecuTorchLpaiCoreAffinity.kSoft + 
core_selection: int = 0 + + @unique class QnnExecuTorchLogLevel(IntEnum): kLogOff = 0 @@ -187,6 +238,7 @@ class QnnExecuTorchBackendOptions: backend_type: QnnExecuTorchBackendType htp_options: Optional[QnnExecuTorchHtpBackendOptions] = None gpu_options: Optional[QnnExecuTorchGpuBackendOptions] = None + lpai_options: Optional[QnnExecuTorchLpaiBackendOptions] = None @unique diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index c57dbbcc332..50e0be974fb 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -51,6 +51,7 @@ from_context_binary, generate_gpu_compiler_spec, generate_htp_compiler_spec, + generate_lpai_compiler_spec, generate_qnn_executorch_compiler_spec, is_qnn_sdk_version_less_than, PyQnnManagerAdaptor, @@ -2088,9 +2089,15 @@ def test_qnn_backend_example_models(self): class TestQNNQuantizedOperator(TestQNN): # TODO: refactor to support different backends def setUp(self): + match self.get_backend_type(): + case QnnExecuTorchBackendType.kHtpBackend: + backend_options = generate_htp_compiler_spec(use_fp16=False) + case QnnExecuTorchBackendType.kLpaiBackend: + backend_options = generate_lpai_compiler_spec() + case _: + raise ValueError("Backend is not implemented yet") TestQNN.atol = 1e-1 TestQNN.rtol = 1 - backend_options = generate_htp_compiler_spec(use_fp16=False) TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( soc_model=self.chipset_table[TestQNN.model], backend_options=backend_options, @@ -2474,6 +2481,90 @@ def test_qnn_backend_conv1d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_lpai(self): + from executorch.backends.qualcomm._passes.build_quant_io import BuildQuantIo + from executorch.backends.qualcomm.utils.constants import ( + QCOM_DTYPE, + QCOM_QUANT_ATTRS, + ) + from executorch.exir.capture._config import ExecutorchBackendConfig + from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass + + """ fp_module = Conv2dSequential() # noqa: F405 + input_shape = (1, 1, 3, 3) + sample_input = (torch.randn(input_shape),) """ + """ fp_module = torch.load( + "/local2/mnt/workspace/executorch_artifacts/meta_models/four_class.pt", + weights_only=False, + ).eval() + input_shape = (1,1,372,496) + sample_input = (torch.randn(input_shape),) """ + fp_module = torch.load( + "/local2/mnt/workspace/executorch_artifacts/meta_models/ocr_haptic.pt", + weights_only=False, + ).eval() + input_shape = (1,1,372,496) + sample_input = (torch.randn(input_shape),) + """ fp_module = Add() # noqa: F405 + input_shape = (1, 1, 256, 256) + sample_input = (torch.randn(input_shape), torch.randn(input_shape)) """ + with torch.no_grad(): + module = self.get_qdq_module( + fp_module, + sample_input, + quant_dtype=QuantDtype.use_8a8w, + ) + # strip unsupported quantize / dequantize ops generated in preprocess + pass_jobs = get_capture_program_passes() + pass_jobs[TagQuantIO][QCOM_PASS_ACTIVATE_KEY] = True + pass_jobs[TagQuantIO][QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY][ + "get_quant_io_dtype_fn" + ] = lambda n: ( + torch.uint8 + # if n.name in {"x", "aten_convolution_default_1"} + if n.name in { + "x_1", "aten_permute_copy_default", "aten_permute_copy_default_1" + } + # if n.name in {"x", "y", "aten_add_tensor"} + else None + ) + edge_prog_mgr = to_edge_transform_and_lower_to_qnn( + module, + sample_input, + self.compiler_specs, + passes_job=pass_jobs, + ) + + # collect encodings for ios + 
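+        # Each placeholder carries QCOM_QUANT_ATTRS (scale / zero point) after
+        # quantization; forcing QCOM_DTYPE to torch.uint8 here and in the
+        # TagQuantIO hook above keeps graph IO in the raw quantized domain, so
+        # the runner exchanges uint8 buffers directly and dequantization
+        # happens on the host with the recorded encodings.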
input_encodings, output_encodings = [], [] + for n in edge_prog_mgr.exported_program().graph.nodes: + if n.op == "placeholder": + input_encodings.append(n.meta[QCOM_QUANT_ATTRS]) + input_encodings[-1][QCOM_DTYPE] = torch.uint8 + elif n.op == "output": + output_encodings = n.meta[QCOM_QUANT_ATTRS_MAP].values() + for output_encoding in output_encodings: + output_encoding[QCOM_DTYPE] = torch.uint8 + + exec_prog = edge_prog_mgr.to_executorch( + ExecutorchBackendConfig( + passes=[BuildQuantIo()], + memory_planning_pass=MemoryPlanningPass( + alloc_graph_input=False, + alloc_graph_output=False, + ), + segment_alignment=256, + ) + ) + self.verify_output( + fp_module, + sample_input, + exec_prog, + input_encodings=tuple(input_encodings), + output_encodings=tuple(output_encodings), + artifact_dir=self.artifact_dir, + ) + def test_qnn_backend_conv2d(self): modules = [Conv2dSequential(), Conv2dSequential(bias=False)] # noqa: F405 sample_input = (torch.randn([1, 1, 3, 3]),) @@ -8906,8 +8997,8 @@ def setup_environment(): ) parser.add_argument( "--backend", - help="Backend to be deployed ('htp'/'gpu' are currently supported).", - choices=["htp", "gpu"], + help="Backend to be deployed ('htp'/'gpu'/'lpai' are currently supported).", + choices=["htp", "gpu", "lpai"], default="htp", type=str, ) diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index f4b9339e1c2..a05d01d765e 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -160,6 +160,7 @@ class TestQNN(unittest.TestCase): compiler_specs: List[CompileSpec] = None chipset_table = get_soc_to_chipset_map() error_only = False + oss_repo: str = "" ip = "localhost" port = 8080 executorch_root: str = "" @@ -168,6 +169,7 @@ class TestQNN(unittest.TestCase): qa_dataset: str = "" sentence_dataset: str = "" pretrained_weight: str = "" + model_name: str = "" enable_profile: bool = False op_package_dir: str = "" target: str = "" @@ -216,6 +218,9 @@ def _save_model_and_expected_output( else: ref_outputs.append(ref_output.detach()) + for i, output in enumerate(ref_output): + output.numpy().tofile(f"{dir_name}/golden_{0}_{i}.raw") + pte_fname = f"{dir_name}/qnn_executorch_test.pte" with open(pte_fname, "wb") as file: file.write(buffer) @@ -253,8 +258,12 @@ def verify_output( # noqa: C901 save_inference_speed: bool = False, expected_compared_events: int = -1, qnn_intermediate_debugger: QNNIntermediateDebugger = None, + artifact_dir: str = None, ): with tempfile.TemporaryDirectory() as tmp_dir: + tmp_dir = artifact_dir if artifact_dir is not None else tmp_dir + if artifact_dir is not None: + make_output_dir(artifact_dir) ( ref_outputs, pte_fname, @@ -293,6 +302,9 @@ def post_process(): .sub(enc[QCOM_ZERO_POINT]) .mul(enc[QCOM_SCALE]) ) + from torchao.quantization.utils import compute_error + + print(f"SQNR:{compute_error(ref_outputs[i], output)}") outputs.append(output) def validate_profile(): @@ -332,6 +344,10 @@ def validate_intermediate_tensor(): ) processed_inputs = list(sample_inputs) + with open(f"{tmp_dir}/encoding.txt", "w") as f: + for enc in output_encodings: + f.write(f"{enc[QCOM_SCALE]} {enc[QCOM_ZERO_POINT]}\n") + for i, enc in enumerate(input_encodings): processed_inputs[i] = ( processed_inputs[i] @@ -458,6 +474,7 @@ def validate_intermediate_tensor(): adb.push( inputs=[processed_inputs], files=op_package_paths, + artifact_dir=tmp_dir, ) adb.extra_cmds += extra_cmds if save_inference_speed: @@ -591,9 +608,14 @@ def get_qdq_module( block_size_map: Dict[str, Tuple] = None, submodule_qconfig_list: 
Optional[List[Tuple[Callable, ModuleQConfig]]] = None, ) -> torch.fx.GraphModule: - m = torch.export.export( - module, inputs, dynamic_shapes=dynamic_shapes, strict=True - ).module() + if isinstance(module, torch.jit.ScriptModule): + from torch._export.converter import TS2EPConverter + + m = TS2EPConverter(module, inputs).convert().module() + else: + m = torch.export.export( + module, inputs, dynamic_shapes=dynamic_shapes, strict=True + ).module() quantizer = make_quantizer( quant_dtype=quant_dtype, diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index cccf198e924..e603f1b234e 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -31,6 +31,7 @@ from executorch.backends.qualcomm.serialization.qc_schema import ( _soc_info_table, HtpArch, + LpaiHardwareVersion, QcomChipset, QnnExecuTorchBackendOptions, QnnExecuTorchBackendType, @@ -40,6 +41,9 @@ QnnExecuTorchHtpPerformanceMode, QnnExecuTorchHtpPrecision, QnnExecuTorchLogLevel, + QnnExecuTorchLpaiBackendOptions, + QnnExecuTorchLpaiClientPerf, + QnnExecuTorchLpaiCoreAffinity, QnnExecuTorchOpPackageOptions, QnnExecuTorchOptions, QnnExecuTorchProfileLevel, @@ -957,7 +961,7 @@ def generate_gpu_compiler_spec( Returns: QnnExecuTorchGpuBackendOptions: backend options for QNN GPU. """ - # TODO: enable performance hint mechanism in runtime and make this as an option + # TODO: enable power config mechanism in runtime and make this as an option gpu_options = QnnExecuTorchGpuBackendOptions() gpu_options.precision = precision gpu_options.use_memory_optimizations = use_memory_optimizations @@ -1014,6 +1018,68 @@ def generate_htp_compiler_spec( ) +def generate_lpai_compiler_spec( + fps: int = 1, + ftrt_ratio: int = 10, + client_perf_type: QnnExecuTorchLpaiClientPerf = QnnExecuTorchLpaiClientPerf.kRealTime, + affinity: QnnExecuTorchLpaiCoreAffinity = QnnExecuTorchLpaiCoreAffinity.kSoft, + core_selection: int = 0, +) -> QnnExecuTorchBackendOptions: + """ + Helper function generating backend options for QNN LPAI + + Args: + fps: + Specifies how frequently inference must be completed. + This sets the overall time budget for each frame, including pre-processing, + inference, and post-processing. + ftrt_ratio: + Determines the hardware configuration to meet the latency requirement for inference. + Setting ftrt_ratio = 50 applies a multiplication factor of 5.0 to the base clock frequency, + helping the eNPU meet the tighter latency constraint. + client_perf_type: + kRealtime - Indicates that the model is intended for real-time use cases, + where a specific performance threshold must be met. + If the required performance cannot be achieved, the finalize function will return an error. + kNonRealTime - Refers to models without strict performance requirements. + In these cases, LPAI will make a best-effort attempt to accommodate the workload, + and finalize will not fail due to performance limitations. + affinity: + kSoft - Default affinity. Scheduler will assign jobs to requested cores when feasible + kHard - Scheduler will honour affinity requested by the client + core_selection: + A bit mask for core selection. 
Each bit corresponds to a core, set the bit to use the core + Note that all zeros and all ones mean any core can be used for the eAI instance + + Example for 2 cores: + +--------+--------+---------------+-------------------------------------------------------------------------+ + | bit 1 | bit 0 | affinity | scheduler behavior | + +--------+--------+---------------+-------------------------------------------------------------------------+ + | 0 | 0 | any | Default affinity, scheduler will pick any core based on load | + | 1 | 1 | any | Same as default affinity | + | 0 | 1 | hard | All jobs will only be sent to core 0 | + | 1 | 0 | hard | All jobs will only be sent to core 1 | + | 0 | 1 | soft | Scheduler will attempt to send jobs to core 0 | + | 1 | 0 | soft | Scheduler will attempt to send jobs to core 1 | + +--------+--------+---------------+-------+-----------------------------------------------------------------+ + + Returns: + QnnExecuTorchBackendOptions: backend options for QNN LPAI. + """ + # TODO: enable power config mechanism in runtime and make this as an option + lpai_options = QnnExecuTorchLpaiBackendOptions() + lpai_options.fps = fps + lpai_options.ftrt_ratio = ftrt_ratio + lpai_options.client_perf_type = client_perf_type + lpai_options.affinity = affinity + lpai_options.core_selection = core_selection + + return QnnExecuTorchBackendOptions( + backend_type=QnnExecuTorchBackendType.kLpaiBackend, + lpai_options=lpai_options, + ) + + def generate_qnn_executorch_compiler_spec( soc_model: QcomChipset, backend_options: QnnExecuTorchBackendOptions, @@ -1126,7 +1192,7 @@ def generate_qnn_executorch_compiler_spec( ] -def get_soc_to_arch_map(): +def get_soc_to_htp_arch_map(): return { "SA8295": HtpArch.V68, "SM8350": HtpArch.V68, @@ -1149,6 +1215,14 @@ def get_soc_to_arch_map(): } +def get_soc_to_lpai_hw_ver_map(): + return { + "SM8750": LpaiHardwareVersion.V5, + "SM8850": LpaiHardwareVersion.V6, + "SAR2230P": LpaiHardwareVersion.V6, + } + + def get_soc_to_chipset_map(): return { "SA8295": QcomChipset.SA8295, diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index bd48c553698..59bb1645a3d 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -465,7 +465,8 @@ int main(int argc, char** argv) { if (expected_input_shapes.empty()) { ET_CHECK_MSG( - file_size == tensor_meta->nbytes(), + // workaround for LPAI (== → <=), should figure out root cause of graph without io QDQ + file_size <= tensor_meta->nbytes(), "Input(%d) size mismatch. 
file bytes: %zu, tensor bytes: %zu", input_index, file_size, diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index ca1d655c0db..e849daf9a6a 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -39,7 +39,8 @@ from executorch.backends.qualcomm.utils.utils import ( generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, - get_soc_to_arch_map, + get_soc_to_htp_arch_map, + get_soc_to_lpai_hw_ver_map, to_edge_transform_and_lower_to_qnn, ) from executorch.exir.backend.utils import get_delegates @@ -103,7 +104,8 @@ def __init__( self.dump_intermediate_outputs = dump_intermediate_outputs self.debug_output_path = f"{self.workspace}/debug_output.bin" self.output_folder = f"{self.workspace}/outputs" - self.htp_arch = get_soc_to_arch_map()[soc_model] + self.htp_arch = get_soc_to_htp_arch_map()[soc_model] + self.soc_model = soc_model self.error_only = error_only self.shared_buffer = shared_buffer self.runner = runner @@ -130,7 +132,9 @@ def _adb(self, cmd, output_callback: Optional[Callable[[str], None]] = None): cmds, stdout=subprocess.DEVNULL if self.error_only else sys.stdout ) - def push(self, inputs=None, input_list=None, files=None, init_env=True): + def push( + self, inputs=None, input_list=None, files=None, init_env=True, artifact_dir=None + ): artifacts = [] if init_env: self._adb(["shell", f"rm -rf {self.workspace}"]) @@ -153,6 +157,15 @@ def push(self, inputs=None, input_list=None, files=None, init_env=True): QnnExecuTorchBackendType.kGpuBackend: [ f"{self.qnn_sdk}/lib/{self.target}/libQnnGpu.so", ], + # please note that users need to sign LPAI related libs manually + QnnExecuTorchBackendType.kLpaiBackend: [ + f"{self.qnn_sdk}/lib/{self.target}/libQnnLpai.so", + ( + f"{self.qnn_sdk}/lib/lpai-v{get_soc_to_lpai_hw_ver_map()[self.soc_model]}/" + f"signed/libQnnLpaiSkel.so" + ), + f"{self.qnn_sdk}/lib/{self.target}/libQnnLpaiStub.so", + ], }[self.backend] artifacts.extend( @@ -165,6 +178,7 @@ def push(self, inputs=None, input_list=None, files=None, init_env=True): ] ) with tempfile.TemporaryDirectory() as tmp_dir: + tmp_dir = artifact_dir if artifact_dir is not None else tmp_dir input_list_file, input_files = generate_inputs( tmp_dir, self.input_list_filename, inputs ) @@ -229,6 +243,8 @@ def execute( qnn_executor_runner_cmds = " ".join( [ f"cd {self.workspace} &&", + "export ADSP_LIBRARY_PATH=. &&", + "export LD_LIBRARY_PATH=. &&", "chmod +x ./qnn_executor_runner &&", f"./qnn_executor_runner {qnn_executor_runner_args}", ] @@ -519,6 +535,7 @@ def make_output_dir(path: str): if os.path.exists(path): shutil.rmtree(path, ignore_errors=True) os.makedirs(path) + os.chmod(path, 0o777) def topk_accuracy(predictions, targets, k): diff --git a/extension/data_loader/file_data_loader.cpp b/extension/data_loader/file_data_loader.cpp index 92c4cd61eea..f64b84894f3 100644 --- a/extension/data_loader/file_data_loader.cpp +++ b/extension/data_loader/file_data_loader.cpp @@ -26,7 +26,7 @@ // Some platforms (e.g. Xtensa) do not support pread() that we use to read the // file at different offsets simultaneously from multiple threads not affecting // each other. We list them below and use a workaround for them. 
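When ET_HAVE_PREAD is 0, reads are emulated with lseek() + read(). Unlike pread(), that pair advances the shared file offset, so threads loading segments concurrently must serialize access to the descriptor. A hedged sketch of the emulation, not the exact code in this file:

#include <sys/types.h>
#include <unistd.h>

#include <cstddef>

// Emulate pread(): seek to the requested offset, then read. Callers must
// guard the fd with a lock if they share it across threads, since the file
// offset is mutated.
ssize_t pread_fallback(int fd, void* buf, size_t nbytes, off_t offset) {
  if (::lseek(fd, offset, SEEK_SET) == static_cast<off_t>(-1)) {
    return -1;
  }
  return ::read(fd, buf, nbytes);
}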
-#if defined(__xtensa__) +#if defined(__xtensa__) || defined(__hexagon__) #define ET_HAVE_PREAD 0 #endif // defined(__xtensa__) @@ -42,7 +42,27 @@ namespace executorch { namespace extension { namespace { +inline void* et_aligned_alloc(size_t size, std::align_val_t alignment) { + return ::operator new(size, alignment); +} + +inline void et_aligned_free(void* ptr, std::align_val_t alignment) { + return ::operator delete(ptr, alignment); +} +/** + * FreeableBuffer::FreeFn-compatible callback. + * + * `data` is the original buffer pointer. + * `context` is the original alignment. + * + * `size` is unused. + */ +void FreeSegment(void* context, void* data, ET_UNUSED size_t size) { + et_aligned_free( + data, + static_cast(reinterpret_cast(context))); +} /** * Returns true if the value is an integer power of 2. */ @@ -54,7 +74,7 @@ static bool is_power_of_2(size_t value) { FileDataLoader::~FileDataLoader() { // file_name_ can be nullptr if this instance was moved from, but freeing a // null pointer is safe. - std::free(const_cast(file_name_)); + et_aligned_free(const_cast(file_name_), alignment_); // fd_ can be -1 if this instance was moved from, but closing a negative fd is // safe (though it will return an error). if (fd_ == -1) { @@ -99,44 +119,21 @@ Result FileDataLoader::from( return Error::AccessFailed; } size_t file_size = st.st_size; - // Copy the filename so we can print better debug messages if reads fail. - const char* file_name_copy = ::strdup(file_name); + size_t file_name_len = ::strlen(file_name); + char* file_name_copy = + (char*)et_aligned_alloc(file_name_len, std::align_val_t(alignment)); + if (file_name_copy == nullptr) { ET_LOG(Error, "strdup(%s) failed", file_name); ::close(fd); return Error::MemoryAllocationFailed; } + ::strcpy(file_name_copy, file_name); return FileDataLoader(fd, file_size, alignment, file_name_copy); } -namespace { - -inline void* et_aligned_alloc(size_t size, std::align_val_t alignment) { - return ::operator new(size, alignment); -} - -inline void et_aligned_free(void* ptr, std::align_val_t alignment) { - return ::operator delete(ptr, alignment); -} - -/** - * FreeableBuffer::FreeFn-compatible callback. - * - * `data` is the original buffer pointer. - * `context` is the original alignment. - * - * `size` is unused. - */ -void FreeSegment(void* context, void* data, ET_UNUSED size_t size) { - et_aligned_free( - data, - static_cast(reinterpret_cast(context))); -} - -} // namespace - Result FileDataLoader::load( size_t offset, size_t size, diff --git a/third-party/CMakeLists.txt b/third-party/CMakeLists.txt index 767ac367e19..d51059363c8 100644 --- a/third-party/CMakeLists.txt +++ b/third-party/CMakeLists.txt @@ -6,7 +6,10 @@ set(CMAKE_POLICY_VERSION_MINIMUM 3.5) add_subdirectory(json) -add_subdirectory(gflags) +# [workaround]: mkdir was not supported in hexagon +if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon) + add_subdirectory(gflags) +endif() if(EXECUTORCH_BUILD_PYBIND) add_subdirectory(pybind11)
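One detail worth keeping in mind for the file_data_loader change above: strlen() does not count the trailing NUL, while strcpy() writes it, so an aligned replacement for strdup() needs strlen(s) + 1 bytes. A minimal sketch of such a helper, assuming a C++17 toolchain (the nothrow aligned operator new overload is standard there):

#include <cstddef>
#include <cstring>
#include <new>

// Aligned strdup sketch: the + 1 reserves room for the NUL terminator that
// ::strcpy appends; returns nullptr on allocation failure.
inline char* aligned_strdup(const char* s, std::align_val_t alignment) {
  const std::size_t len = ::strlen(s);
  char* copy = static_cast<char*>(
      ::operator new(len + 1, alignment, std::nothrow));
  if (copy != nullptr) {
    ::strcpy(copy, s);
  }
  return copy;
}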