diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index f0e4c7959c0..91404fb164f 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -50,9 +50,15 @@ size_t vk_datatype_size(vkgraph::VkDataType dtype) {
 WebGPUGraph::WebGPUGraph() = default;
 
 WebGPUGraph::~WebGPUGraph() {
-  for (auto& t : tensors_) {
-    if (t.buffer) {
-      wgpuBufferRelease(t.buffer);
+  for (size_t i = 0; i < tensors_.size(); i++) {
+    if (tensors_[i].buffer &&
+        (i >= tensor_mem_obj_ids_.size() || tensor_mem_obj_ids_[i] < 0)) {
+      wgpuBufferRelease(tensors_[i].buffer);
+    }
+  }
+  for (auto& buf : shared_buffers_) {
+    if (buf) {
+      wgpuBufferRelease(buf);
     }
   }
   for (auto& buf : output_staging_buffers_) {
@@ -68,6 +74,21 @@ WebGPUGraph::~WebGPUGraph() {
       wgpuBindGroupRelease(d.bind_group);
     }
   }
+  for (auto& [_, shader] : shader_cache_) {
+    if (shader) {
+      wgpuShaderModuleRelease(shader);
+    }
+  }
+  for (auto& [_, pipeline] : pipeline_cache_) {
+    if (pipeline) {
+      wgpuComputePipelineRelease(pipeline);
+    }
+  }
+  for (auto& [_, bgl] : bgl_cache_) {
+    if (bgl) {
+      wgpuBindGroupLayoutRelease(bgl);
+    }
+  }
 }
 
 void WebGPUGraph::build(
@@ -94,6 +115,7 @@ void WebGPUGraph::build(
   const int num_vals = values ? values->size() : 0;
   value_types_.resize(num_vals, ValueType::Null);
   tensors_.resize(num_vals);
+  tensor_mem_obj_ids_.resize(num_vals, -1);
   ints_.resize(num_vals, 0);
   doubles_.resize(num_vals, 0.0);
   bools_.resize(num_vals, false);
@@ -121,27 +143,40 @@ void WebGPUGraph::build(
         }
         tensor.nbytes = numel * vk_datatype_size(vk_tensor->datatype());
 
-        // Create GPU buffer
-        WGPUBufferDescriptor buf_desc = {};
-        buf_desc.size = tensor.nbytes > 0 ? tensor.nbytes : 4;
-        buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
-            WGPUBufferUsage_CopySrc;
-        buf_desc.mappedAtCreation = false;
-        tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
-
-        // Upload constant data if this tensor has a constant_id
         int constant_id = vk_tensor->constant_id();
-        if (constant_id >= 0 && constant_data) {
-          const auto* constants = graph->constants();
-          if (constants && constant_id < static_cast<int>(constants->size())) {
-            const auto* vk_bytes = constants->Get(constant_id);
-            // Only upload from embedded bytes (not named data map)
-            if (vk_bytes->offset() != UINT64_MAX) {
-              const uint8_t* src = constant_data + vk_bytes->offset();
-              wgpuQueueWriteBuffer(
-                  queue_, tensor.buffer, 0, src, tensor.nbytes);
+        int mem_obj_id = vk_tensor->mem_obj_id();
+
+        // Constants always get dedicated buffers regardless of mem_obj_id
+        if (constant_id >= 0 || mem_obj_id < 0) {
+          tensor_mem_obj_ids_[i] = -1;
+          WGPUBufferDescriptor buf_desc = {};
+          buf_desc.size = std::max(tensor.nbytes, size_t(4));
+          buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
+              WGPUBufferUsage_CopySrc;
+          buf_desc.mappedAtCreation = false;
+          tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
+
+          if (constant_id >= 0 && constant_data && tensor.nbytes > 0) {
+            const auto* constants = graph->constants();
+            if (constants &&
+                constant_id < static_cast<int>(constants->size())) {
+              const auto* vk_bytes = constants->Get(constant_id);
+              if (vk_bytes->offset() != UINT64_MAX) {
+                const uint8_t* src = constant_data + vk_bytes->offset();
+                wgpuQueueWriteBuffer(
+                    queue_, tensor.buffer, 0, src, tensor.nbytes);
+              }
             }
           }
+        } else {
+          // Shared buffer: track required size, defer allocation to pass 2
+          tensor_mem_obj_ids_[i] = mem_obj_id;
+          size_t id = static_cast<size_t>(mem_obj_id);
+          if (id >= shared_buffer_sizes_.size()) {
+            shared_buffer_sizes_.resize(id + 1, 0);
+          }
+          shared_buffer_sizes_[id] =
+              std::max(shared_buffer_sizes_[id], tensor.nbytes);
         }
         break;
       }
@@ -166,6 +201,25 @@ void WebGPUGraph::build(
     }
   }
 
+  // Allocate shared buffers and assign to tensors
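+  // Each shared buffer is sized to the largest tensor that aliases it, so
+  // every tensor carrying that mem_obj_id fits within the one allocation.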
+  shared_buffers_.resize(shared_buffer_sizes_.size(), nullptr);
+  for (size_t id = 0; id < shared_buffer_sizes_.size(); id++) {
+    WGPUBufferDescriptor buf_desc = {};
+    buf_desc.size = std::max(shared_buffer_sizes_[id], size_t(4));
+    buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
+        WGPUBufferUsage_CopySrc;
+    buf_desc.mappedAtCreation = false;
+    shared_buffers_[id] = wgpuDeviceCreateBuffer(device_, &buf_desc);
+  }
+  for (int i = 0; i < num_vals; i++) {
+    int mid = tensor_mem_obj_ids_[i];
+    if (mid >= 0) {
+      tensors_[i].buffer = shared_buffers_[mid];
+    }
+  }
+
   // Phase 2: Record input and output IDs
   const auto* fb_input_ids = graph->input_ids();
   if (fb_input_ids) {
@@ -181,7 +235,7 @@ void WebGPUGraph::build(
 
       // Create staging buffer for output readback
       WGPUBufferDescriptor staging_desc = {};
-      staging_desc.size = tensors_[oid].nbytes > 0 ? tensors_[oid].nbytes : 4;
+      staging_desc.size = std::max(tensors_[oid].nbytes, size_t(4));
       staging_desc.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst;
       staging_desc.mappedAtCreation = false;
       output_staging_buffers_.push_back(
@@ -189,6 +243,14 @@ void WebGPUGraph::build(
     }
   }
 
+  for (size_t i = 0; i < output_ids_.size(); i++) {
+    int oid = output_ids_[i];
+    output_copies_.push_back(
+        {tensors_[oid].buffer,
+         output_staging_buffers_[i],
+         tensors_[oid].nbytes});
+  }
+
   // Phase 3: Build operator dispatch chain
   const auto* chain = graph->chain();
   if (chain) {
@@ -213,9 +275,73 @@ void WebGPUGraph::build(
   }
 }
 
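+// The get_or_create_* helpers cache shader modules, compute pipelines, and
+// bind group layouts under a caller-supplied string key, so dispatches that
+// share a key reuse one GPU object instead of recreating it.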
+WGPUShaderModule WebGPUGraph::get_or_create_shader(
+    const std::string& key,
+    const char* wgsl_source) {
+  auto it = shader_cache_.find(key);
+  if (it != shader_cache_.end()) {
+    return it->second;
+  }
+
+  WGPUShaderSourceWGSL wgsl_desc = {};
+  wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl_desc.code = {wgsl_source, WGPU_STRLEN};
+
+  WGPUShaderModuleDescriptor shader_desc = {};
+  shader_desc.nextInChain = &wgsl_desc.chain;
+  WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device_, &shader_desc);
+
+  shader_cache_[key] = shader;
+  return shader;
+}
+
+WGPUComputePipeline WebGPUGraph::get_or_create_pipeline(
+    const std::string& key,
+    WGPUShaderModule shader,
+    WGPUPipelineLayout layout) {
+  auto it = pipeline_cache_.find(key);
+  if (it != pipeline_cache_.end()) {
+    return it->second;
+  }
+
+  WGPUComputePipelineDescriptor pipeline_desc = {};
+  pipeline_desc.layout = layout;
+  pipeline_desc.compute.module = shader;
+  pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN};
+  WGPUComputePipeline pipeline =
+      wgpuDeviceCreateComputePipeline(device_, &pipeline_desc);
+
+  pipeline_cache_[key] = pipeline;
+  return pipeline;
+}
+
+WGPUBindGroupLayout WebGPUGraph::get_or_create_bgl(
+    const std::string& key,
+    const WGPUBindGroupLayoutEntry* entries,
+    uint32_t count) {
+  auto it = bgl_cache_.find(key);
+  if (it != bgl_cache_.end()) {
+    return it->second;
+  }
+
+  WGPUBindGroupLayoutDescriptor bgl_desc = {};
+  bgl_desc.entryCount = count;
+  bgl_desc.entries = entries;
+  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device_, &bgl_desc);
+
+  bgl_cache_[key] = bgl;
+  return bgl;
+}
+
 void WebGPUGraph::copy_inputs(
     const std::vector<std::pair<const void*, size_t>>& inputs) {
   for (size_t i = 0; i < inputs.size() && i < input_ids_.size(); i++) {
+    if (inputs[i].second == 0) {
+      continue;
+    }
     int tid = input_ids_[i];
     const auto& tensor = tensors_[tid];
     wgpuQueueWriteBuffer(
@@ -224,43 +350,92 @@ void WebGPUGraph::copy_inputs(
 }
 
 void WebGPUGraph::execute() {
-  WGPUCommandEncoderDescriptor enc_desc = {};
-  WGPUCommandEncoder encoder =
-      wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
-
-  WGPUComputePassDescriptor pass_desc = {};
-  WGPUComputePassEncoder pass =
-      wgpuCommandEncoderBeginComputePass(encoder, &pass_desc);
-
-  for (const auto& dispatch : dispatches_) {
-    wgpuComputePassEncoderSetPipeline(pass, dispatch.pipeline);
-    wgpuComputePassEncoderSetBindGroup(
-        pass, 0, dispatch.bind_group, 0, nullptr);
-    wgpuComputePassEncoderDispatchWorkgroups(
-        pass, dispatch.workgroup_count_x, 1, 1);
-  }
+  const size_t n = dispatches_.size();
+  const size_t chunk = execute_config_.chunk_size;
+
+  if (chunk == 0 || n <= chunk) {
+    WGPUCommandEncoderDescriptor enc_desc = {};
+    WGPUCommandEncoder encoder =
+        wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
+
+    WGPUComputePassDescriptor pass_desc = {};
+    WGPUComputePassEncoder pass =
+        wgpuCommandEncoderBeginComputePass(encoder, &pass_desc);
+
+    for (const auto& dispatch : dispatches_) {
+      wgpuComputePassEncoderSetPipeline(pass, dispatch.pipeline);
+      wgpuComputePassEncoderSetBindGroup(
+          pass, 0, dispatch.bind_group, 0, nullptr);
+      wgpuComputePassEncoderDispatchWorkgroups(
+          pass, dispatch.workgroup_count_x, 1, 1);
+    }
 
-  wgpuComputePassEncoderEnd(pass);
-  wgpuComputePassEncoderRelease(pass);
+    wgpuComputePassEncoderEnd(pass);
+    wgpuComputePassEncoderRelease(pass);
 
-  // Copy outputs to staging buffers
-  for (size_t i = 0; i < output_ids_.size(); i++) {
-    int oid = output_ids_[i];
-    wgpuCommandEncoderCopyBufferToBuffer(
-        encoder,
-        tensors_[oid].buffer,
-        0,
-        output_staging_buffers_[i],
-        0,
-        tensors_[oid].nbytes);
+    for (const auto& copy : output_copies_) {
+      wgpuCommandEncoderCopyBufferToBuffer(
+          encoder, copy.src_buffer, 0, copy.staging_buffer, 0, copy.nbytes);
+    }
+
+    WGPUCommandBufferDescriptor cmd_desc = {};
+    WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc);
+    wgpuQueueSubmit(queue_, 1, &cmd);
+
+    wgpuCommandBufferRelease(cmd);
+    wgpuCommandEncoderRelease(encoder);
+    return;
   }
 
-  WGPUCommandBufferDescriptor cmd_desc = {};
-  WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc);
-  wgpuQueueSubmit(queue_, 1, &cmd);
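+  // Chunked path: encode and submit at most `chunk` dispatches per command
+  // buffer; an optional smaller first chunk (initial_chunk_size) lets the
+  // first batch of work reach the GPU sooner.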
+  const size_t first_chunk = execute_config_.initial_chunk_size > 0
+      ? execute_config_.initial_chunk_size
+      : chunk;
+
+  size_t start = 0;
+  size_t current_chunk = first_chunk;
 
-  wgpuCommandBufferRelease(cmd);
-  wgpuCommandEncoderRelease(encoder);
+  while (start < n) {
+    size_t end = std::min(start + current_chunk, n);
+
+    WGPUCommandEncoderDescriptor enc_desc = {};
+    WGPUCommandEncoder encoder =
+        wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
+
+    WGPUComputePassDescriptor pass_desc = {};
+    WGPUComputePassEncoder pass =
+        wgpuCommandEncoderBeginComputePass(encoder, &pass_desc);
+
+    for (size_t i = start; i < end; i++) {
+      wgpuComputePassEncoderSetPipeline(pass, dispatches_[i].pipeline);
+      wgpuComputePassEncoderSetBindGroup(
+          pass, 0, dispatches_[i].bind_group, 0, nullptr);
+      wgpuComputePassEncoderDispatchWorkgroups(
+          pass, dispatches_[i].workgroup_count_x, 1, 1);
+    }
+
+    wgpuComputePassEncoderEnd(pass);
+    wgpuComputePassEncoderRelease(pass);
+
+    if (end == n) {
+      for (const auto& copy : output_copies_) {
+        wgpuCommandEncoderCopyBufferToBuffer(
+            encoder, copy.src_buffer, 0, copy.staging_buffer, 0, copy.nbytes);
+      }
+    }
+
+    WGPUCommandBufferDescriptor cmd_desc = {};
+    WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc);
+    wgpuQueueSubmit(queue_, 1, &cmd);
+
+    wgpuCommandBufferRelease(cmd);
+    wgpuCommandEncoderRelease(encoder);
+
+    start = end;
+    current_chunk = chunk;
+  }
 }
 
 namespace {
@@ -283,24 +458,37 @@ void buffer_map_callback(
 }
 
 } // namespace
 
 void WebGPUGraph::copy_outputs(std::vector<std::pair<void*, size_t>>& outputs) {
-  for (size_t i = 0; i < outputs.size() && i < output_staging_buffers_.size();
-       i++) {
-    MapCallbackData cb_data;
+  const size_t count = std::min(outputs.size(), output_staging_buffers_.size());
+
+  std::vector<MapCallbackData> cb_data(count);
+
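+  // Issue every map request up front, then wait once: the single blocking
+  // poll below lets all pending callbacks fire before results are copied out.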
+  for (size_t i = 0; i < count; i++) {
+    if (outputs[i].second == 0) {
+      cb_data[i].done = true;
+      cb_data[i].status = WGPUMapAsyncStatus_Success;
+      continue;
+    }
     WGPUBufferMapCallbackInfo cb_info = {};
     cb_info.mode = WGPUCallbackMode_AllowSpontaneous;
     cb_info.callback = buffer_map_callback;
-    cb_info.userdata1 = &cb_data;
+    cb_info.userdata1 = &cb_data[i];
     wgpuBufferMapAsync(
         output_staging_buffers_[i],
         WGPUMapMode_Read,
         0,
         outputs[i].second,
         cb_info);
+  }
 
-    // Poll until the map callback fires.
-    wgpuDevicePoll(device_, true, nullptr);
+  wgpuDevicePoll(device_, true, nullptr);
 
-    if (cb_data.status == WGPUMapAsyncStatus_Success) {
+  for (size_t i = 0; i < count; i++) {
+    if (outputs[i].second == 0) {
+      continue;
+    }
+    if (cb_data[i].status == WGPUMapAsyncStatus_Success) {
       const void* mapped = wgpuBufferGetConstMappedRange(
           output_staging_buffers_[i], 0, outputs[i].second);
       std::memcpy(outputs[i].first, mapped, outputs[i].second);
@@ -315,15 +503,28 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const {
   WebGPUMemoryStats stats;
   for (size_t i = 0; i < value_types_.size(); i++) {
     if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) {
-      stats.tensor_buffer_bytes += tensors_[i].nbytes;
       stats.num_tensors++;
+      // Shared tensors are tracked via shared_buffer_sizes_
+      bool is_shared =
+          i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0;
+      if (!is_shared) {
+        stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes;
+      }
     }
   }
+  for (size_t s : shared_buffer_sizes_) {
+    stats.shared_buffer_bytes += s;
+  }
+  stats.num_shared_objects = static_cast<int>(shared_buffers_.size());
+  stats.tensor_buffer_bytes =
+      stats.shared_buffer_bytes + stats.unshared_tensor_buffer_bytes;
   for (size_t i = 0; i < output_ids_.size(); i++) {
     stats.staging_buffer_bytes += tensors_[output_ids_[i]].nbytes;
   }
   stats.uniform_buffer_bytes = uniform_buffer_bytes_;
   stats.num_dispatches = static_cast<int>(dispatches_.size());
+  stats.num_cached_pipelines = static_cast<int>(pipeline_cache_.size());
+  stats.num_cached_shaders = static_cast<int>(shader_cache_.size());
 
   return stats;
 }
diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
index 2d6996e9219..3aa96917a4e 100644
--- a/backends/webgpu/runtime/WebGPUGraph.h
+++ b/backends/webgpu/runtime/WebGPUGraph.h
@@ -12,6 +12,7 @@
 #include <cstddef>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 namespace executorch {
 
@@ -30,12 +31,31 @@ struct WebGPUDispatch {
   uint32_t workgroup_count_x = 1;
 };
 
+struct OutputCopy {
+  WGPUBuffer src_buffer = nullptr;
+  WGPUBuffer staging_buffer = nullptr;
+  size_t nbytes = 0;
+};
+
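+// Controls how execute() splits dispatches across queue submissions:
+// chunk_size == 0 submits every dispatch in a single pass, and a non-zero
+// initial_chunk_size sizes only the first submission.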
+struct ExecuteConfig {
+  size_t chunk_size = 0;
+  size_t initial_chunk_size = 0;
+};
+
 struct WebGPUMemoryStats {
   size_t tensor_buffer_bytes = 0;
+  size_t shared_buffer_bytes = 0;
+  int num_shared_objects = 0;
+  size_t unshared_tensor_buffer_bytes = 0;
   size_t staging_buffer_bytes = 0;
   size_t uniform_buffer_bytes = 0;
   int num_tensors = 0;
   int num_dispatches = 0;
+  int num_cached_pipelines = 0;
+  int num_cached_shaders = 0;
 
   size_t total_bytes() const {
     return tensor_buffer_bytes + staging_buffer_bytes + uniform_buffer_bytes;
@@ -99,6 +119,20 @@ class WebGPUGraph {
     uniform_buffer_bytes_ += bytes;
   }
 
+  WGPUShaderModule get_or_create_shader(
+      const std::string& key,
+      const char* wgsl_source);
+
+  WGPUComputePipeline get_or_create_pipeline(
+      const std::string& key,
+      WGPUShaderModule shader,
+      WGPUPipelineLayout layout);
+
+  WGPUBindGroupLayout get_or_create_bgl(
+      const std::string& key,
+      const WGPUBindGroupLayoutEntry* entries,
+      uint32_t count);
+
   void set_instance(WGPUInstance instance) {
     instance_ = instance;
   }
@@ -134,11 +168,26 @@ class WebGPUGraph {
   std::vector<int> input_ids_;
   std::vector<int> output_ids_;
 
+  // Memory aliasing: tensors with the same mem_obj_id share a WGPUBuffer.
+  std::vector<int> tensor_mem_obj_ids_;
+  std::vector<WGPUBuffer> shared_buffers_;
+  std::vector<size_t> shared_buffer_sizes_;
+
   // Staging buffers for reading back outputs (MapRead | CopyDst).
   std::vector<WGPUBuffer> output_staging_buffers_;
 
+  // Pre-computed output copy descriptors for execute().
+  std::vector<OutputCopy> output_copies_;
+
   std::vector<WebGPUDispatch> dispatches_;
 
+  ExecuteConfig execute_config_;
+
+  // Caches for reusing GPU objects across dispatches.
+  std::unordered_map<std::string, WGPUShaderModule> shader_cache_;
+  std::unordered_map<std::string, WGPUComputePipeline> pipeline_cache_;
+  std::unordered_map<std::string, WGPUBindGroupLayout> bgl_cache_;
+
   size_t uniform_buffer_bytes_ = 0;
 };
diff --git a/backends/webgpu/test/ops/add/test_add.py b/backends/webgpu/test/ops/add/test_add.py
index f4b33ced76d..e8da644a1f9 100644
--- a/backends/webgpu/test/ops/add/test_add.py
+++ b/backends/webgpu/test/ops/add/test_add.py
@@ -31,6 +31,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         z = x + y
         z = z + x
         z = z + y
+        z = z + x
+        z = z + y
         return z
 
 
@@ -97,5 +99,18 @@ def export_add_model(output_path: str) -> None:
     print(f"Exported {output_path}")
 
 
+def export_chained_add_model(output_path: str) -> None:
+    """Export a chained add model (z=x+y; z=z+x; z=z+y; z=z+x; z=z+y) to .pte for memory aliasing testing."""
+    model = AddChainedModule()
+    example_inputs = (torch.randn(1024, 1024), torch.randn(1024, 1024))
+    ep = torch.export.export(model, example_inputs)
+    et_program = to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+    with open(output_path, "wb") as f:
+        f.write(et_program.buffer)
+    print(f"Exported {output_path}")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh
index 684926cb181..a42b2304ee7 100755
--- a/backends/webgpu/test/test_build_webgpu.sh
+++ b/backends/webgpu/test/test_build_webgpu.sh
@@ -22,12 +22,14 @@ $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v
 
 # ── Step 2: Export .pte model ─────────────────────────────────────────────────
-echo "=== Step 2: Export test model ==="
+echo "=== Step 2: Export test models ==="
 PTE_MODEL="/tmp/webgpu_add_test.pte"
+PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
 cd "${EXECUTORCH_ROOT}"
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.add.test_add import export_add_model
+from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
 export_add_model('${PTE_MODEL}')
+export_chained_add_model('${PTE_CHAINED_MODEL}')
 "
 
 # ── Step 3: Native build + test (wgpu-native) ────────────────────────────────
 
@@ -60,6 +62,7 @@ cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j${NPROC}
 echo "=== Step 4: Run native test ==="
 WEBGPU_TEST_MODEL="${PTE_MODEL}" \
+WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \
 "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_native_test"
 
 echo "=== Done ==="
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index c60695e11c9..d3005debf37 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -75,6 +75,64 @@ static bool test_single_add(const std::string& model_path) {
   return true;
 }
 
+static bool test_chained_add(const std::string& model_path) {
+  printf("\n--- Test: chained add (1024x1024, 5 ops) ---\n");
+
+  Module module(model_path);
+  auto err = module.load_forward();
+  if (err != Error::Ok) {
+    printf("FAIL: could not load forward method (error %d)\n", (int)err);
+    return false;
+  }
+  printf("Model loaded: %s\n", model_path.c_str());
+
+  constexpr int dim = 1024;
+  constexpr int size = dim * dim;
+
+  std::vector<float> x_data(size);
+  std::vector<float> y_data(size);
+  for (int i = 0; i < size; i++) {
+    x_data[i] = static_cast<float>(i % 100) * 0.01f;
+    y_data[i] = static_cast<float>(i % 50) * 0.02f;
+  }
+
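+  // Copies are handed to make_tensor_ptr below so x_data / y_data stay
+  // valid for computing the expected values.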
+  auto x = make_tensor_ptr({dim, dim}, std::vector<float>(x_data));
+  auto y = make_tensor_ptr({dim, dim}, std::vector<float>(y_data));
+
+  auto result = module.forward({EValue(x), EValue(y)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+
+  const auto& outputs = result.get();
+  if (outputs.empty() || !outputs[0].isTensor()) {
+    printf("FAIL: no tensor output\n");
+    return false;
+  }
+
+  // z=x+y; z=z+x=2x+y; z=z+y=2x+2y; z=z+x=3x+2y; z=z+y=3x+3y
+  const auto& out_tensor = outputs[0].toTensor();
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  float max_error = 0.0f;
+  for (int i = 0; i < size; i++) {
+    float expected = 3.0f * x_data[i] + 3.0f * y_data[i];
+    float error = std::abs(out_data[i] - expected);
+    max_error = std::max(max_error, error);
+  }
+
+  printf("Max error: %e (checked %d elements)\n", max_error, size);
+  if (max_error > 1e-3f) {
+    printf("FAIL: max error exceeds tolerance 1e-3\n");
+    return false;
+  }
+  printf("PASS: chained add test\n");
+  return true;
+}
+
 int main(int argc, char** argv) {
   std::string model_path = "webgpu_add_test.pte";
   if (argc > 1) {
@@ -84,6 +142,11 @@ int main(int argc, char** argv) {
     model_path = env;
   }
 
+  std::string chained_model_path;
+  if (const char* env = std::getenv("WEBGPU_TEST_CHAINED_MODEL")) {
+    chained_model_path = env;
+  }
+
   WebGPUContext ctx;
   try {
     ctx = create_webgpu_context();
@@ -97,6 +160,10 @@ int main(int argc, char** argv) {
 
   bool ok = test_single_add(model_path);
 
+  if (!chained_model_path.empty()) {
+    ok = test_chained_add(chained_model_path) && ok;
+  }
+
   set_default_webgpu_context(nullptr);
   destroy_webgpu_context(ctx);