Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
163f614
[CPU/CUDA EP] Add DeformConv op support (#27393)
ShirasawaSama Mar 23, 2026
3bb032e
Fix non-ASCII Unicode model path crash across session and provider co…
sagarbhure-msft Mar 23, 2026
45b5900
[CUDA] RoiAlign for opset versions 16 and 22 (#27646)
tianleiwu Mar 23, 2026
fde4e03
[CUDA] Extend Pad support through opset 25 with wrap mode (#27774)
Copilot Mar 23, 2026
36c962f
Fix QNN SDK version propagation in Linux ort-qnn wheel build (#27800)
derdeljan-msft Mar 23, 2026
c171919
Fix NeonFp16DequantB8Bit reference to match kernel fp16 precision (#2…
jambayk Mar 23, 2026
0c3e5fc
fix webnn/where compliance tests for webgpu (#27776)
guschmue Mar 23, 2026
16b556d
fix webnn test case for webgpu ep: 'transpose float32 1D constant ten…
guschmue Mar 24, 2026
37b863c
Extend DQ→MatMulNBits fusion to support Gemm + per-tensor/per-channel…
jambayk Mar 24, 2026
883b461
Bump rollup from 4.35.0 to 4.59.0 in /js/web/test/e2e/exports/testcas…
dependabot[bot] Mar 24, 2026
793153b
Bump flatted from 3.3.3 to 3.4.2 in /js/react_native/e2e (#27785)
dependabot[bot] Mar 24, 2026
faad20f
Bump flatted from 3.3.3 to 3.4.2 in /js (#27799)
dependabot[bot] Mar 24, 2026
38a2625
[MLAS] Add fused Silu and Gelu kernels for AVX512 (#27690)
hariharans29 Mar 24, 2026
99c5dd8
Make WebGPU EP compatible with EP API (#26907)
fs-eire Mar 24, 2026
142ecca
[WebGPU] Einsum fixes for 5D tensors (#27779)
xenova Mar 24, 2026
2f66878
Fix WebGPU device destroyed on session release, breaking session recr…
nico-martin Mar 24, 2026
1b982dd
[CPU] Handle ONNX domain Gelu and HardSigmoid activations in the NCHW…
hariharans29 Mar 24, 2026
6af93e8
Merge remote-tracking branch 'origin/master' into sync_msft_25032026
AIFrameworksIntegration Mar 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions .github/workflows/windows_webgpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,86 @@ jobs:
working-directory: ${{ github.workspace }}\csharp
continue-on-error: true

webgpu_plugin_build_x64_RelWithDebInfo:
runs-on: [
"self-hosted",
"1ES.Pool=onnxruntime-github-Win2022-GPU-A10",
"JobId=webgpu_plugin_build_x64_RelWithDebInfo-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
]
timeout-minutes: 300
env:
OnnxRuntimeBuildDirectory: ${{ github.workspace }}
setVcvars: true
ALLOW_RELEASED_ONNX_OPSET_ONLY: "0"
DocUpdateNeeded: false
NVIDIA_TF32_OVERRIDE: "0"
ONNXRUNTIME_TEST_GPU_DEVICE_ID: "0"
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
submodules: none

- name: Setup Python 3.12
uses: actions/setup-python@v6
with:
python-version: "3.12"
architecture: x64

- name: Locate vcvarsall and Setup Env
uses: ./.github/actions/locate-vcvarsall-and-setup-env
with:
architecture: x64

- name: Install python modules
run: python -m pip install -r tools\ci_build\github\windows\python\requirements.txt
shell: cmd
working-directory: ${{ github.workspace }}

- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: "20.x"

- uses: actions/cache@v5
id: onnx-node-tests-cache
with:
path: ${{ github.workspace }}/js/test/
key: onnxnodetests-${{ hashFiles('js/scripts/prepare-onnx-node-tests.ts') }}

- name: Build and Test
shell: pwsh
run: |
python.exe ${{ github.workspace }}\tools\ci_build\build.py `
--config RelWithDebInfo `
--build_dir ${{ github.workspace }} `
--skip_submodule_sync `
--parallel `
--use_binskim_compliant_compile_flags `
--cmake_generator "Visual Studio 17 2022" `
--enable_onnx_tests `
--use_webgpu shared_lib `
--wgsl_template static `
--use_vcpkg --use_vcpkg_ms_internal_asset_cache `
--cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_DAWN_BACKEND_D3D12=1 onnxruntime_ENABLE_DAWN_BACKEND_VULKAN=1 `
--disable_rtti `
--enable_lto

if ($lastExitCode -ne 0) {
exit $lastExitCode
}

- name: Publish artifacts
uses: actions/upload-artifact@v4
with:
name: webgpu-plugin-binaries
path: |
${{ github.workspace }}/RelWithDebInfo/RelWithDebInfo/onnxruntime_providers_webgpu.dll
${{ github.workspace }}/RelWithDebInfo/RelWithDebInfo/onnxruntime_providers_webgpu.pdb
${{ github.workspace }}/RelWithDebInfo/RelWithDebInfo/dxcompiler.dll
${{ github.workspace }}/RelWithDebInfo/RelWithDebInfo/dxil.dll

webgpu_external_dawn_build_x64_RelWithDebInfo:
runs-on: [
"self-hosted",
Expand Down
14 changes: 13 additions & 1 deletion cmake/onnxruntime_mlas.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ onnxruntime_add_static_library(onnxruntime_mlas
${MLAS_SRC_DIR}/eltwise.h
${MLAS_SRC_DIR}/eltwise.cpp
${MLAS_SRC_DIR}/erf.cpp
${MLAS_SRC_DIR}/silu.cpp
${MLAS_SRC_DIR}/gelu.cpp
${MLAS_SRC_DIR}/compute.cpp
${MLAS_SRC_DIR}/dequantize.cpp
${MLAS_SRC_DIR}/quantize.cpp
Expand Down Expand Up @@ -201,6 +203,14 @@ function(setup_mlas_source_for_windows)
)
set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "/arch:AVX2")

set(mlas_platform_srcs_avx512
${MLAS_SRC_DIR}/intrinsics/avx512/gelu_avx512f.cpp
${MLAS_SRC_DIR}/intrinsics/avx512/silu_avx512f.cpp
${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
)

set_source_files_properties(${mlas_platform_srcs_avx512} PROPERTIES COMPILE_FLAGS "/arch:AVX512")

target_sources(onnxruntime_mlas PRIVATE
${MLAS_SRC_DIR}/dgemm.cpp
${mlas_platform_srcs_avx}
Expand All @@ -212,7 +222,7 @@ function(setup_mlas_source_for_windows)
${MLAS_SRC_DIR}/qgemm_kernel_avx2.cpp
${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
${MLAS_SRC_DIR}/qgemm_kernel_sse41.cpp
${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
${mlas_platform_srcs_avx512}
${MLAS_SRC_DIR}/sqnbitgemm_lut_kernel_avx2.h
${MLAS_SRC_DIR}/sqnbitgemm_lut_kernel_avx2.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
Expand Down Expand Up @@ -764,6 +774,8 @@ endif()
${MLAS_SRC_DIR}/x86_64/SoftmaxKernelAvx512F.S
${MLAS_SRC_DIR}/x86_64/SpoolKernelAvx512F.S
${MLAS_SRC_DIR}/x86_64/TransKernelAvx512F.S
${MLAS_SRC_DIR}/intrinsics/avx512/gelu_avx512f.cpp
${MLAS_SRC_DIR}/intrinsics/avx512/silu_avx512f.cpp
${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
)
set_source_files_properties(${mlas_platform_srcs_avx512f} PROPERTIES COMPILE_FLAGS "-mavx512f")
Expand Down
8 changes: 7 additions & 1 deletion cmake/onnxruntime_providers_webgpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
endif()
source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_providers_webgpu_cc_srcs})

onnxruntime_add_shared_library(onnxruntime_providers_webgpu ${onnxruntime_providers_webgpu_cc_srcs})
onnxruntime_add_shared_library_module(onnxruntime_providers_webgpu ${onnxruntime_providers_webgpu_cc_srcs})
onnxruntime_add_include_to_target(onnxruntime_providers_webgpu
${REPO_ROOT}/include/onnxruntime/core/session
onnxruntime_common
Expand Down Expand Up @@ -119,6 +119,12 @@
if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
message(FATAL_ERROR "WebGPU EP shared library build is not supported on Emscripten. Please use static library build.")
endif()

# Configure precompiled headers for shared library build
# PCH ensures ep/adapters.h is included first and improves compilation speed
target_precompile_headers(onnxruntime_providers_webgpu PRIVATE
"${REPO_ROOT}/include/onnxruntime/ep/adapters.h"
)
endif()

set_target_properties(onnxruntime_providers_webgpu PROPERTIES CXX_STANDARD_REQUIRED ON)
Expand Down
13 changes: 13 additions & 0 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1042,6 +1042,18 @@ function(onnxruntime_apply_test_target_workarounds target)
endif()
endfunction()

# Set environment variables for plugin EP tests when run via CTest.
#
# When the WebGPU EP is built as a plugin (onnxruntime_USE_WEBGPU together
# with onnxruntime_USE_EP_API_ADAPTERS), the given test target is told how to
# find and select the plugin EP library via the
# ORT_UNIT_TEST_MAIN_DYNAMIC_PLUGIN_EP_CONFIG_JSON environment variable,
# which carries a JSON blob with the registration name, the library file
# name, and the EP name to select.
function(onnxruntime_set_plugin_ep_test_environment target)
if(onnxruntime_USE_WEBGPU AND onnxruntime_USE_EP_API_ADAPTERS)
# $<TARGET_FILE_NAME:...> resolves at generation time to the built shared
# library's file name, including the platform-specific prefix/suffix.
set(ORT_PLUGIN_EP_JSON_CONFIG "{\"ep_library_registration_name\": \"WebGPU_PluginEP\", \"ep_library_path\": \"$<TARGET_FILE_NAME:onnxruntime_providers_webgpu>\", \"selected_ep_name\": \"WebGpuExecutionProvider\"}")
set_tests_properties(${target} PROPERTIES
ENVIRONMENT "ORT_UNIT_TEST_MAIN_DYNAMIC_PLUGIN_EP_CONFIG_JSON=${ORT_PLUGIN_EP_JSON_CONFIG}"
)
# TODO: add for other plugin EPs if needed
# elseif()
endif()
endfunction()

function(onnxruntime_apply_emscripten_test_link_settings target)
if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
set_target_properties(${target} PROPERTIES LINK_DEPENDS ${TEST_SRC_DIR}/wasm/onnxruntime_test_adapter.js)
Expand Down Expand Up @@ -1250,6 +1262,7 @@ block()
)

onnxruntime_apply_test_target_workarounds(onnxruntime_provider_test)
onnxruntime_set_plugin_ep_test_environment(onnxruntime_provider_test)

# Expose QNN SDK headers to unit tests via an interface target
if(onnxruntime_USE_QNN)
Expand Down
15 changes: 13 additions & 2 deletions docs/OperatorKernels.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ Do not modify directly.*
|||[11, 13]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)<br/> **T2** = tensor(int32), tensor(int64)|
|DFT|*in* input:**T1**<br> *in* dft_length:**T2**<br> *in* axis:**tensor(int64)**<br> *out* output:**T1**<br><br>or<br><br>*in* input:**T1**<br> *in* dft_length:**T2**<br> *out* output:**T1**|20+|**T1** = tensor(double), tensor(float)<br/> **T2** = tensor(int32), tensor(int64)|
|||[17, 19]|**T1** = tensor(double), tensor(float)<br/> **T2** = tensor(int32), tensor(int64)|
|DeformConv|*in* X:**T**<br> *in* W:**T**<br> *in* offset:**T**<br> *in* B:**T**<br> *in* mask:**T**<br> *out* Y:**T**|22+|**T** = tensor(double), tensor(float)|
|||[19, 21]|**T** = tensor(double), tensor(float)|
|DepthToSpace|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(uint8)|
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(uint8)|
|||[1, 10]|**T** = tensor(double), tensor(float)|
Expand Down Expand Up @@ -697,6 +699,8 @@ Do not modify directly.*
|Crop|*in* input:**T**<br> *out* output:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
|CumSum|*in* x:**T**<br> *in* axis:**T2**<br> *out* y:**T**|14+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)<br/> **T2** = tensor(int32), tensor(int64)|
|||[11, 13]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)<br/> **T2** = tensor(int32), tensor(int64)|
|DeformConv|*in* X:**T**<br> *in* W:**T**<br> *in* offset:**T**<br> *in* B:**T**<br> *in* mask:**T**<br> *out* Y:**T**|22+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|||[19, 21]|**T** = tensor(double), tensor(float), tensor(float16)|
|DepthToSpace|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|||[1, 10]|**T** = tensor(double), tensor(float), tensor(float16)|
Expand Down Expand Up @@ -843,7 +847,12 @@ Do not modify directly.*
|PRelu|*in* X:**T**<br> *in* slope:**T**<br> *out* Y:**T**|16+|**T** = tensor(double), tensor(float), tensor(float16)|
|||[9, 15]|**T** = tensor(double), tensor(float), tensor(float16)|
|||[7, 8]|**T** = tensor(double), tensor(float), tensor(float16)|
|Pad|*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *in* axes:**Tind**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *out* output:**T**|18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|Pad|*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *in* axes:**Tind**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *out* output:**T**|25+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|||24|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|||23|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|||[21, 22]|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|||[19, 20]|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|||18|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|||[13, 17]|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16)|
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|||[2, 10]|**T** = tensor(double), tensor(float), tensor(float16)|
Expand Down Expand Up @@ -902,7 +911,9 @@ Do not modify directly.*
|||[11, 12]|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
|||10|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
|ReverseSequence|*in* input:**T**<br> *in* sequence_lens:**tensor(int64)**<br> *out* Y:**T**|10+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|RoiAlign|*in* X:**T1**<br> *in* rois:**T1**<br> *in* batch_indices:**T2**<br> *out* Y:**T1**|10+|**T1** = tensor(double), tensor(float)<br/> **T2** = tensor(int64)|
|RoiAlign|*in* X:**T1**<br> *in* rois:**T1**<br> *in* batch_indices:**T2**<br> *out* Y:**T1**|22+|**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)<br/> **T2** = tensor(int64)|
|||[16, 21]|**T1** = tensor(double), tensor(float), tensor(float16)<br/> **T2** = tensor(int64)|
|||[10, 15]|**T1** = tensor(double), tensor(float)<br/> **T2** = tensor(int64)|
|RotaryEmbedding|*in* X:**T**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *in* position_ids:**M**<br> *out* Y:**T**|23+|**M** = tensor(int64)<br/> **T** = tensor(bfloat16), tensor(float), tensor(float16)|
|Round|*in* X:**T**<br> *out* Y:**T**|11+|**T** = tensor(double), tensor(float), tensor(float16)|
|ScaledTanh|*in* input:**T**<br> *out* output:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,13 @@ static const char* const kOrtSessionOptionsMlasDisableKleidiAi = "mlas.disable_k
// If not provided, default is 4.
static const char* const kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel = "session.qdq_matmulnbits_accuracy_level";

// Block size used when converting per-tensor or per-axis DQ + MatMul to MatMulNBits.
// Only applies to DQ nodes without an existing block_size attribute (i.e., per-tensor or per-axis quantization).
// Positive value: explicit block_size (must be power-of-2 and >= 16, e.g., 16, 32, 64, 128).
// "0" or not provided: use default block_size of 32.
// "-1": heuristic - largest power-of-2 <= min(K, 256) that minimizes padding.
static const char* const kOrtSessionOptionsQDQMatMulNBitsBlockSize = "session.qdq_matmulnbits_block_size";

// Enable the DQ->MatMulNBits fusion graph transformer.
// "0": disabled (default). "1": enabled.
// This is typically set automatically by InferenceSession when the NvTensorRTRTX EP is registered.
Expand Down
37 changes: 33 additions & 4 deletions include/onnxruntime/ep/adapter/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,50 @@ namespace adapter {
/// </summary>
class Allocator : public OrtAllocator {
public:
/**
* Create from an existing AllocatorPtr.
*/
explicit Allocator(const OrtMemoryInfo* memory_info, AllocatorPtr impl)
: OrtAllocator{}, memory_info_(memory_info), impl_(impl) {
: Allocator{memory_info} {
ORT_ENFORCE(impl != nullptr, "Allocator implementation cannot be null.");
impl_ = impl;
}

using AllocatorFactory = AllocatorPtr (*)(const OrtMemoryInfo& memory_info);

/**
* Create from an AllocatorFactory, which will be called lazily when the first allocation is made.
*/
explicit Allocator(const OrtMemoryInfo* memory_info, AllocatorFactory get_allocator_impl)
: Allocator{memory_info} {
get_allocator_impl_ = get_allocator_impl;
}

private:
explicit Allocator(const OrtMemoryInfo* memory_info)
: OrtAllocator{}, memory_info_(memory_info) {
version = ORT_API_VERSION;
Alloc = AllocImpl;
Free = FreeImpl;
Info = InfoImpl;
}
AllocatorPtr GetImpl() {
if (!impl_) {
std::call_once(init_flag_, [this]() {
impl_ = get_allocator_impl_(*memory_info_);
});
}
return impl_;
}

private:
static void* ORT_API_CALL AllocImpl(OrtAllocator* this_ptr, size_t size) noexcept {
auto* allocator = static_cast<Allocator*>(this_ptr);
return allocator->impl_->Alloc(size);
return allocator->GetImpl()->Alloc(size);
}

static void ORT_API_CALL FreeImpl(OrtAllocator* this_ptr, void* p) noexcept {
auto* allocator = static_cast<Allocator*>(this_ptr);
allocator->impl_->Free(p);
allocator->GetImpl()->Free(p);
}

static const OrtMemoryInfo* ORT_API_CALL InfoImpl(const OrtAllocator* this_ptr) noexcept {
Expand All @@ -44,6 +71,8 @@ class Allocator : public OrtAllocator {

const OrtMemoryInfo* memory_info_;
AllocatorPtr impl_;
AllocatorFactory get_allocator_impl_;
std::once_flag init_flag_;
};

} // namespace adapter
Expand Down
1 change: 1 addition & 0 deletions include/onnxruntime/ep/adapter/ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class Ep : public OrtEp {
profiler_{impl_->GetProfiler()},
temp_space_cpu_allocator_{temp_space_cpu_allocator},
temp_space_allocator_{temp_space_allocator} {
ort_version_supported = ORT_API_VERSION;
}

public:
Expand Down
2 changes: 1 addition & 1 deletion include/onnxruntime/ep/adapter/op_kernel_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#include "tensor_helper.h"

namespace onnxruntime {
struct DataTransferManager;
class DataTransferManager;
struct IExecutionProvider;
} // namespace onnxruntime

Expand Down
21 changes: 21 additions & 0 deletions include/onnxruntime/ep/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,24 @@
OrtStatus* _status = (status_expr); \
Ort::Status _ignored{_status}; \
} while (false)

// Helper macros to convert exceptions to OrtStatus* return values.
//
// Intended for functions that report errors through a returned OrtStatus*.
// The END macro closes the try block opened by BEGIN and translates, in
// order of specificity:
//   - Ort::Exception: converted via Ort::Status's Ort::Exception constructor,
//   - std::exception: wrapped with code ORT_EP_FAIL and the exception's
//     what() message,
//   - anything else:  wrapped with code ORT_EP_FAIL and "Unknown exception".
// status.release() hands ownership of the OrtStatus to the caller, who is
// responsible for releasing it.
//
// Usage:
//   EXCEPTION_TO_RETURNED_STATUS_BEGIN
//   ... code that may throw ...
//   EXCEPTION_TO_RETURNED_STATUS_END
#define EXCEPTION_TO_RETURNED_STATUS_BEGIN try {
#define EXCEPTION_TO_RETURNED_STATUS_END \
} \
catch (const Ort::Exception& ex) { \
Ort::Status status(ex); \
return status.release(); \
} \
catch (const std::exception& ex) { \
Ort::Status status(ex.what(), ORT_EP_FAIL); \
return status.release(); \
} \
catch (...) { \
Ort::Status status("Unknown exception", ORT_EP_FAIL); \
return status.release(); \
}
16 changes: 7 additions & 9 deletions js/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions js/react_native/e2e/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading