diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index f0e4c7959c0..91404fb164f 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -50,9 +50,15 @@ size_t vk_datatype_size(vkgraph::VkDataType dtype) {
 WebGPUGraph::WebGPUGraph() = default;
 
 WebGPUGraph::~WebGPUGraph() {
-  for (auto& t : tensors_) {
-    if (t.buffer) {
-      wgpuBufferRelease(t.buffer);
+  for (size_t i = 0; i < tensors_.size(); i++) {
+    if (tensors_[i].buffer &&
+        (i >= tensor_mem_obj_ids_.size() || tensor_mem_obj_ids_[i] < 0)) {
+      wgpuBufferRelease(tensors_[i].buffer);
+    }
+  }
+  for (auto& buf : shared_buffers_) {
+    if (buf) {
+      wgpuBufferRelease(buf);
     }
   }
   for (auto& buf : output_staging_buffers_) {
@@ -68,6 +74,21 @@ WebGPUGraph::~WebGPUGraph() {
       wgpuBindGroupRelease(d.bind_group);
     }
   }
+  for (auto& [_, shader] : shader_cache_) {
+    if (shader) {
+      wgpuShaderModuleRelease(shader);
+    }
+  }
+  for (auto& [_, pipeline] : pipeline_cache_) {
+    if (pipeline) {
+      wgpuComputePipelineRelease(pipeline);
+    }
+  }
+  for (auto& [_, bgl] : bgl_cache_) {
+    if (bgl) {
+      wgpuBindGroupLayoutRelease(bgl);
+    }
+  }
 }
 
 void WebGPUGraph::build(
@@ -94,6 +115,7 @@ void WebGPUGraph::build(
   const int num_vals = values ? values->size() : 0;
   value_types_.resize(num_vals, ValueType::Null);
   tensors_.resize(num_vals);
+  tensor_mem_obj_ids_.resize(num_vals, -1);
   ints_.resize(num_vals, 0);
   doubles_.resize(num_vals, 0.0);
   bools_.resize(num_vals, false);
@@ -121,27 +143,40 @@ void WebGPUGraph::build(
         }
         tensor.nbytes = numel * vk_datatype_size(vk_tensor->datatype());
 
-        // Create GPU buffer
-        WGPUBufferDescriptor buf_desc = {};
-        buf_desc.size = tensor.nbytes > 0 ? tensor.nbytes : 4;
-        buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
-            WGPUBufferUsage_CopySrc;
-        buf_desc.mappedAtCreation = false;
-        tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
-
-        // Upload constant data if this tensor has a constant_id
         int constant_id = vk_tensor->constant_id();
-        if (constant_id >= 0 && constant_data) {
-          const auto* constants = graph->constants();
-          if (constants && constant_id < static_cast<int>(constants->size())) {
-            const auto* vk_bytes = constants->Get(constant_id);
-            // Only upload from embedded bytes (not named data map)
-            if (vk_bytes->offset() != UINT64_MAX) {
-              const uint8_t* src = constant_data + vk_bytes->offset();
-              wgpuQueueWriteBuffer(
-                  queue_, tensor.buffer, 0, src, tensor.nbytes);
+        int mem_obj_id = vk_tensor->mem_obj_id();
+
+        // Constants always get dedicated buffers regardless of mem_obj_id
+        if (constant_id >= 0 || mem_obj_id < 0) {
+          tensor_mem_obj_ids_[i] = -1;
+          WGPUBufferDescriptor buf_desc = {};
+          buf_desc.size = std::max(tensor.nbytes, size_t(4));
+          buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
+              WGPUBufferUsage_CopySrc;
+          buf_desc.mappedAtCreation = false;
+          tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
+
+          if (constant_id >= 0 && constant_data && tensor.nbytes > 0) {
+            const auto* constants = graph->constants();
+            if (constants &&
+                constant_id < static_cast<int>(constants->size())) {
+              const auto* vk_bytes = constants->Get(constant_id);
+              if (vk_bytes->offset() != UINT64_MAX) {
+                const uint8_t* src = constant_data + vk_bytes->offset();
+                wgpuQueueWriteBuffer(
+                    queue_, tensor.buffer, 0, src, tensor.nbytes);
+              }
             }
           }
+        } else {
+          // Shared buffer: track required size, defer allocation to pass 2
+          tensor_mem_obj_ids_[i] = mem_obj_id;
+          size_t id = static_cast<size_t>(mem_obj_id);
+          if (id >= shared_buffer_sizes_.size()) {
+            shared_buffer_sizes_.resize(id + 1, 0);
+          }
+          shared_buffer_sizes_[id] =
+              std::max(shared_buffer_sizes_[id], tensor.nbytes);
         }
         break;
       }
@@ -166,6 +201,25 @@ void WebGPUGraph::build(
     }
   }
 
+  // Allocate shared buffers and assign to tensors
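+  // Each shared buffer is sized to the largest tensor that aliases it, so
+  // every tensor carrying that mem_obj_id fits within the one allocation.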
+  shared_buffers_.resize(shared_buffer_sizes_.size(), nullptr);
+  for (size_t id = 0; id < shared_buffer_sizes_.size(); id++) {
+    WGPUBufferDescriptor buf_desc = {};
+    buf_desc.size = std::max(shared_buffer_sizes_[id], size_t(4));
+    buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
+        WGPUBufferUsage_CopySrc;
+    buf_desc.mappedAtCreation = false;
+    shared_buffers_[id] = wgpuDeviceCreateBuffer(device_, &buf_desc);
+  }
+  for (int i = 0; i < num_vals; i++) {
+    int mid = tensor_mem_obj_ids_[i];
+    if (mid >= 0) {
+      tensors_[i].buffer = shared_buffers_[mid];
+    }
+  }
+
   // Phase 2: Record input and output IDs
   const auto* fb_input_ids = graph->input_ids();
   if (fb_input_ids) {
@@ -181,7 +235,7 @@ void WebGPUGraph::build(
 
       // Create staging buffer for output readback
       WGPUBufferDescriptor staging_desc = {};
-      staging_desc.size = tensors_[oid].nbytes > 0 ? tensors_[oid].nbytes : 4;
+      staging_desc.size = std::max(tensors_[oid].nbytes, size_t(4));
       staging_desc.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst;
       staging_desc.mappedAtCreation = false;
       output_staging_buffers_.push_back(
@@ -189,6 +243,14 @@ void WebGPUGraph::build(
     }
   }
 
+  for (size_t i = 0; i < output_ids_.size(); i++) {
+    int oid = output_ids_[i];
+    output_copies_.push_back(
+        {tensors_[oid].buffer,
+         output_staging_buffers_[i],
+         tensors_[oid].nbytes});
+  }
+
   // Phase 3: Build operator dispatch chain
   const auto* chain = graph->chain();
   if (chain) {
@@ -213,9 +275,73 @@ void WebGPUGraph::build(
   }
 }
 
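+// The get_or_create_* helpers cache shader modules, compute pipelines, and
+// bind group layouts under a caller-supplied string key, so dispatches that
+// share a key reuse one GPU object instead of recreating it.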
+WGPUShaderModule WebGPUGraph::get_or_create_shader(
+    const std::string& key,
+    const char* wgsl_source) {
+  auto it = shader_cache_.find(key);
+  if (it != shader_cache_.end()) {
+    return it->second;
+  }
+
+  WGPUShaderSourceWGSL wgsl_desc = {};
+  wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl_desc.code = {wgsl_source, WGPU_STRLEN};
+
+  WGPUShaderModuleDescriptor shader_desc = {};
+  shader_desc.nextInChain = &wgsl_desc.chain;
+  WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device_, &shader_desc);
+
+  shader_cache_[key] = shader;
+  return shader;
+}
+
+WGPUComputePipeline WebGPUGraph::get_or_create_pipeline(
+    const std::string& key,
+    WGPUShaderModule shader,
+    WGPUPipelineLayout layout) {
+  auto it = pipeline_cache_.find(key);
+  if (it != pipeline_cache_.end()) {
+    return it->second;
+  }
+
+  WGPUComputePipelineDescriptor pipeline_desc = {};
+  pipeline_desc.layout = layout;
+  pipeline_desc.compute.module = shader;
+  pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN};
+  WGPUComputePipeline pipeline =
+      wgpuDeviceCreateComputePipeline(device_, &pipeline_desc);
+
+  pipeline_cache_[key] = pipeline;
+  return pipeline;
+}
+
+WGPUBindGroupLayout WebGPUGraph::get_or_create_bgl(
+    const std::string& key,
+    const WGPUBindGroupLayoutEntry* entries,
+    uint32_t count) {
+  auto it = bgl_cache_.find(key);
+  if (it != bgl_cache_.end()) {
+    return it->second;
+  }
+
+  WGPUBindGroupLayoutDescriptor bgl_desc = {};
+  bgl_desc.entryCount = count;
+  bgl_desc.entries = entries;
+  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device_, &bgl_desc);
+
+  bgl_cache_[key] = bgl;
+  return bgl;
+}
+
 void WebGPUGraph::copy_inputs(
     const std::vector<std::pair<const void*, size_t>>& inputs) {
   for (size_t i = 0; i < inputs.size() && i < input_ids_.size(); i++) {
+    if (inputs[i].second == 0) {
+      continue;
+    }
     int tid = input_ids_[i];
     const auto& tensor = tensors_[tid];
     wgpuQueueWriteBuffer(
@@ -224,43 +350,92 @@ void WebGPUGraph::copy_inputs(
 }
 
 void WebGPUGraph::execute() {
-  WGPUCommandEncoderDescriptor enc_desc = {};
-  WGPUCommandEncoder encoder =
-      wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
-
-  WGPUComputePassDescriptor pass_desc = {};
-  WGPUComputePassEncoder pass =
-      wgpuCommandEncoderBeginComputePass(encoder, &pass_desc);
-
-  for (const auto& dispatch : dispatches_) {
-    wgpuComputePassEncoderSetPipeline(pass, dispatch.pipeline);
-    wgpuComputePassEncoderSetBindGroup(
-        pass, 0, dispatch.bind_group, 0, nullptr);
-    wgpuComputePassEncoderDispatchWorkgroups(
-        pass, dispatch.workgroup_count_x, 1, 1);
-  }
+  const size_t n = dispatches_.size();
+  const size_t chunk = execute_config_.chunk_size;
+
+  if (chunk == 0 || n <= chunk) {
+    WGPUCommandEncoderDescriptor enc_desc = {};
+    WGPUCommandEncoder encoder =
+        wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
+
+    WGPUComputePassDescriptor pass_desc = {};
+    WGPUComputePassEncoder pass =
+        wgpuCommandEncoderBeginComputePass(encoder, &pass_desc);
+
+    for (const auto& dispatch : dispatches_) {
+      wgpuComputePassEncoderSetPipeline(pass, dispatch.pipeline);
+      wgpuComputePassEncoderSetBindGroup(
+          pass, 0, dispatch.bind_group, 0, nullptr);
+      wgpuComputePassEncoderDispatchWorkgroups(
+          pass, dispatch.workgroup_count_x, 1, 1);
+    }
 
-  wgpuComputePassEncoderEnd(pass);
-  wgpuComputePassEncoderRelease(pass);
+    wgpuComputePassEncoderEnd(pass);
+    wgpuComputePassEncoderRelease(pass);
 
-  // Copy outputs to staging buffers
-  for (size_t i = 0; i < output_ids_.size(); i++) {
-    int oid = output_ids_[i];
-    wgpuCommandEncoderCopyBufferToBuffer(
-        encoder,
-        tensors_[oid].buffer,
-        0,
-        output_staging_buffers_[i],
-        0,
-        tensors_[oid].nbytes);
+    for (const auto& copy : output_copies_) {
+      wgpuCommandEncoderCopyBufferToBuffer(
+          encoder, copy.src_buffer, 0, copy.staging_buffer, 0, copy.nbytes);
+    }
+
+    WGPUCommandBufferDescriptor cmd_desc = {};
+    WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc);
+    wgpuQueueSubmit(queue_, 1, &cmd);
+
+    wgpuCommandBufferRelease(cmd);
+    wgpuCommandEncoderRelease(encoder);
+    return;
   }
 
-  WGPUCommandBufferDescriptor cmd_desc = {};
-  WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc);
-  wgpuQueueSubmit(queue_, 1, &cmd);
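+  // Chunked path: encode and submit at most `chunk` dispatches per command
+  // buffer; an optional smaller first chunk (initial_chunk_size) lets the
+  // first batch of work reach the GPU sooner.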
+  const size_t first_chunk = execute_config_.initial_chunk_size > 0
+      ? execute_config_.initial_chunk_size
+      : chunk;
+
+  size_t start = 0;
+  size_t current_chunk = first_chunk;
 
-  wgpuCommandBufferRelease(cmd);
-  wgpuCommandEncoderRelease(encoder);
+  while (start < n) {
+    size_t end = std::min(start + current_chunk, n);
+
+    WGPUCommandEncoderDescriptor enc_desc = {};
+    WGPUCommandEncoder encoder =
+        wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
+
+    WGPUComputePassDescriptor pass_desc = {};
+    WGPUComputePassEncoder pass =
+        wgpuCommandEncoderBeginComputePass(encoder, &pass_desc);
+
+    for (size_t i = start; i < end; i++) {
+      wgpuComputePassEncoderSetPipeline(pass, dispatches_[i].pipeline);
+      wgpuComputePassEncoderSetBindGroup(
+          pass, 0, dispatches_[i].bind_group, 0, nullptr);
+      wgpuComputePassEncoderDispatchWorkgroups(
+          pass, dispatches_[i].workgroup_count_x, 1, 1);
+    }
+
+    wgpuComputePassEncoderEnd(pass);
+    wgpuComputePassEncoderRelease(pass);
+
+    if (end == n) {
+      for (const auto& copy : output_copies_) {
+        wgpuCommandEncoderCopyBufferToBuffer(
+            encoder, copy.src_buffer, 0, copy.staging_buffer, 0, copy.nbytes);
+      }
+    }
+
+    WGPUCommandBufferDescriptor cmd_desc = {};
+    WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc);
+    wgpuQueueSubmit(queue_, 1, &cmd);
+
+    wgpuCommandBufferRelease(cmd);
+    wgpuCommandEncoderRelease(encoder);
+
+    start = end;
+    current_chunk = chunk;
+  }
 }
 
 namespace {
@@ -283,24 +458,37 @@ void buffer_map_callback(
 }
 
 } // namespace
 
 void WebGPUGraph::copy_outputs(std::vector<std::pair<void*, size_t>>& outputs) {
-  for (size_t i = 0; i < outputs.size() && i < output_staging_buffers_.size();
-       i++) {
-    MapCallbackData cb_data;
+  const size_t count = std::min(outputs.size(), output_staging_buffers_.size());
+
+  std::vector<MapCallbackData> cb_data(count);
+
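+  // Issue every map request up front, then wait once: the single blocking
+  // poll below lets all pending callbacks fire before results are copied out.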
+  for (size_t i = 0; i < count; i++) {
+    if (outputs[i].second == 0) {
+      cb_data[i].done = true;
+      cb_data[i].status = WGPUMapAsyncStatus_Success;
+      continue;
+    }
     WGPUBufferMapCallbackInfo cb_info = {};
     cb_info.mode = WGPUCallbackMode_AllowSpontaneous;
     cb_info.callback = buffer_map_callback;
-    cb_info.userdata1 = &cb_data;
+    cb_info.userdata1 = &cb_data[i];
     wgpuBufferMapAsync(
         output_staging_buffers_[i],
         WGPUMapMode_Read,
         0,
         outputs[i].second,
         cb_info);
+  }
 
-    // Poll until the map callback fires.
-    wgpuDevicePoll(device_, true, nullptr);
+  wgpuDevicePoll(device_, true, nullptr);
 
-    if (cb_data.status == WGPUMapAsyncStatus_Success) {
+  for (size_t i = 0; i < count; i++) {
+    if (outputs[i].second == 0) {
+      continue;
+    }
+    if (cb_data[i].status == WGPUMapAsyncStatus_Success) {
       const void* mapped = wgpuBufferGetConstMappedRange(
           output_staging_buffers_[i], 0, outputs[i].second);
       std::memcpy(outputs[i].first, mapped, outputs[i].second);
@@ -315,15 +503,28 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const {
   WebGPUMemoryStats stats;
   for (size_t i = 0; i < value_types_.size(); i++) {
     if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) {
-      stats.tensor_buffer_bytes += tensors_[i].nbytes;
       stats.num_tensors++;
+      // Shared tensors are tracked via shared_buffer_sizes_
+      bool is_shared =
+          i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0;
+      if (!is_shared) {
+        stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes;
+      }
     }
   }
+  for (size_t s : shared_buffer_sizes_) {
+    stats.shared_buffer_bytes += s;
+  }
+  stats.num_shared_objects = static_cast<int>(shared_buffers_.size());
+  stats.tensor_buffer_bytes =
+      stats.shared_buffer_bytes + stats.unshared_tensor_buffer_bytes;
   for (size_t i = 0; i < output_ids_.size(); i++) {
     stats.staging_buffer_bytes += tensors_[output_ids_[i]].nbytes;
   }
   stats.uniform_buffer_bytes = uniform_buffer_bytes_;
   stats.num_dispatches = static_cast<int>(dispatches_.size());
+  stats.num_cached_pipelines = static_cast<int>(pipeline_cache_.size());
+  stats.num_cached_shaders = static_cast<int>(shader_cache_.size());
 
   return stats;
 }
diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
index 2d6996e9219..3aa96917a4e 100644
--- a/backends/webgpu/runtime/WebGPUGraph.h
+++ b/backends/webgpu/runtime/WebGPUGraph.h
@@ -12,6 +12,7 @@
 #include <cstddef>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 namespace executorch {
 
@@ -30,12 +31,31 @@ struct WebGPUDispatch {
   uint32_t workgroup_count_x = 1;
 };
 
+struct OutputCopy {
+  WGPUBuffer src_buffer = nullptr;
+  WGPUBuffer staging_buffer = nullptr;
+  size_t nbytes = 0;
+};
+
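+// Controls how execute() splits dispatches across queue submissions:
+// chunk_size == 0 submits every dispatch in a single pass, and a non-zero
+// initial_chunk_size sizes only the first submission.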
+struct ExecuteConfig {
+  size_t chunk_size = 0;
+  size_t initial_chunk_size = 0;
+};
+
 struct WebGPUMemoryStats {
   size_t tensor_buffer_bytes = 0;
+  size_t shared_buffer_bytes = 0;
+  int num_shared_objects = 0;
+  size_t unshared_tensor_buffer_bytes = 0;
   size_t staging_buffer_bytes = 0;
   size_t uniform_buffer_bytes = 0;
   int num_tensors = 0;
   int num_dispatches = 0;
+  int num_cached_pipelines = 0;
+  int num_cached_shaders = 0;
 
   size_t total_bytes() const {
     return tensor_buffer_bytes + staging_buffer_bytes + uniform_buffer_bytes;
@@ -99,6 +119,20 @@ class WebGPUGraph {
     uniform_buffer_bytes_ += bytes;
   }
 
+  WGPUShaderModule get_or_create_shader(
+      const std::string& key,
+      const char* wgsl_source);
+
+  WGPUComputePipeline get_or_create_pipeline(
+      const std::string& key,
+      WGPUShaderModule shader,
+      WGPUPipelineLayout layout);
+
+  WGPUBindGroupLayout get_or_create_bgl(
+      const std::string& key,
+      const WGPUBindGroupLayoutEntry* entries,
+      uint32_t count);
+
   void set_instance(WGPUInstance instance) {
     instance_ = instance;
   }
@@ -134,11 +168,26 @@ class WebGPUGraph {
   std::vector<int> input_ids_;
   std::vector<int> output_ids_;
 
+  // Memory aliasing: tensors with the same mem_obj_id share a WGPUBuffer.
+  std::vector<int> tensor_mem_obj_ids_;
+  std::vector<WGPUBuffer> shared_buffers_;
+  std::vector<size_t> shared_buffer_sizes_;
+
   // Staging buffers for reading back outputs (MapRead | CopyDst).
   std::vector<WGPUBuffer> output_staging_buffers_;
 
+  // Pre-computed output copy descriptors for execute().
+  std::vector<OutputCopy> output_copies_;
+
   std::vector<WebGPUDispatch> dispatches_;
 
+  ExecuteConfig execute_config_;
+
+  // Caches for reusing GPU objects across dispatches.
+  std::unordered_map<std::string, WGPUShaderModule> shader_cache_;
+  std::unordered_map<std::string, WGPUComputePipeline> pipeline_cache_;
+  std::unordered_map<std::string, WGPUBindGroupLayout> bgl_cache_;
+
   size_t uniform_buffer_bytes_ = 0;
 };
diff --git a/backends/webgpu/test/ops/add/test_add.py b/backends/webgpu/test/ops/add/test_add.py
index f4b33ced76d..e8da644a1f9 100644
--- a/backends/webgpu/test/ops/add/test_add.py
+++ b/backends/webgpu/test/ops/add/test_add.py
@@ -31,6 +31,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         z = x + y
         z = z + x
         z = z + y
+        z = z + x
+        z = z + y
         return z
 
 
@@ -97,5 +99,18 @@ def export_add_model(output_path: str) -> None:
     print(f"Exported {output_path}")
 
 
+def export_chained_add_model(output_path: str) -> None:
+    """Export a chained add model (z=x+y; z=z+x; z=z+y; z=z+x; z=z+y) to .pte for memory aliasing testing."""
+    model = AddChainedModule()
+    example_inputs = (torch.randn(1024, 1024), torch.randn(1024, 1024))
+    ep = torch.export.export(model, example_inputs)
+    et_program = to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+    with open(output_path, "wb") as f:
+        f.write(et_program.buffer)
+    print(f"Exported {output_path}")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh
index 684926cb181..a42b2304ee7 100755
--- a/backends/webgpu/test/test_build_webgpu.sh
+++ b/backends/webgpu/test/test_build_webgpu.sh
@@ -22,12 +22,14 @@ $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v
 
 # ── Step 2: Export .pte model ─────────────────────────────────────────────────
-echo "=== Step 2: Export test model ==="
+echo "=== Step 2: Export test models ==="
 PTE_MODEL="/tmp/webgpu_add_test.pte"
+PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
 cd "${EXECUTORCH_ROOT}"
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.add.test_add import export_add_model
+from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
 export_add_model('${PTE_MODEL}')
+export_chained_add_model('${PTE_CHAINED_MODEL}')
 "
 
 # ── Step 3: Native build + test (wgpu-native) ────────────────────────────────
 
@@ -60,6 +62,7 @@ cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j${NPROC}
 echo "=== Step 4: Run native test ==="
 WEBGPU_TEST_MODEL="${PTE_MODEL}" \
+WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \
 "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_native_test"
 
 echo "=== Done ==="
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index c60695e11c9..d3005debf37 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -75,6 +75,64 @@ static bool test_single_add(const std::string& model_path) {
   return true;
 }
 
+static bool test_chained_add(const std::string& model_path) {
+  printf("\n--- Test: chained add (1024x1024, 5 ops) ---\n");
+
+  Module module(model_path);
+  auto err = module.load_forward();
+  if (err != Error::Ok) {
+    printf("FAIL: could not load forward method (error %d)\n", (int)err);
+    return false;
+  }
+  printf("Model loaded: %s\n", model_path.c_str());
+
+  constexpr int dim = 1024;
+  constexpr int size = dim * dim;
+
+  std::vector<float> x_data(size);
+  std::vector<float> y_data(size);
+  for (int i = 0; i < size; i++) {
+    x_data[i] = static_cast<float>(i % 100) * 0.01f;
+    y_data[i] = static_cast<float>(i % 50) * 0.02f;
+  }
+
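+  // Copies are handed to make_tensor_ptr below so x_data / y_data stay
+  // valid for computing the expected values.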
+  auto x = make_tensor_ptr({dim, dim}, std::vector<float>(x_data));
+  auto y = make_tensor_ptr({dim, dim}, std::vector<float>(y_data));
+
+  auto result = module.forward({EValue(x), EValue(y)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+
+  const auto& outputs = result.get();
+  if (outputs.empty() || !outputs[0].isTensor()) {
+    printf("FAIL: no tensor output\n");
+    return false;
+  }
+
+  // z=x+y; z=z+x=2x+y; z=z+y=2x+2y; z=z+x=3x+2y; z=z+y=3x+3y
+  const auto& out_tensor = outputs[0].toTensor();
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  float max_error = 0.0f;
+  for (int i = 0; i < size; i++) {
+    float expected = 3.0f * x_data[i] + 3.0f * y_data[i];
+    float error = std::abs(out_data[i] - expected);
+    max_error = std::max(max_error, error);
+  }
+
+  printf("Max error: %e (checked %d elements)\n", max_error, size);
+  if (max_error > 1e-3f) {
+    printf("FAIL: max error exceeds tolerance 1e-3\n");
+    return false;
+  }
+  printf("PASS: chained add test\n");
+  return true;
+}
+
 int main(int argc, char** argv) {
   std::string model_path = "webgpu_add_test.pte";
   if (argc > 1) {
@@ -84,6 +142,11 @@ int main(int argc, char** argv) {
     model_path = env;
   }
 
+  std::string chained_model_path;
+  if (const char* env = std::getenv("WEBGPU_TEST_CHAINED_MODEL")) {
+    chained_model_path = env;
+  }
+
   WebGPUContext ctx;
   try {
     ctx = create_webgpu_context();
@@ -97,6 +160,10 @@ int main(int argc, char** argv) {
 
   bool ok = test_single_add(model_path);
 
+  if (!chained_model_path.empty()) {
+    ok = test_chained_add(chained_model_path) && ok;
+  }
+
   set_default_webgpu_context(nullptr);
   destroy_webgpu_context(ctx);