From f0bd99c8daed15ae2e83b849b702270667fd130e Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Fri, 13 Feb 2026 15:27:24 -0800 Subject: [PATCH 1/3] Upgrade Dev containers for CICD to latest Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .github/workflows/example_tests.yml | 12 ++++++------ .github/workflows/gpu_tests.yml | 6 +++--- .github/workflows/unit_tests.yml | 10 +++++----- .../getting_started/_installation_for_Linux.rst | 8 ++++++-- tox.ini | 16 +++++++++------- 5 files changed, 29 insertions(+), 23 deletions(-) diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml index c1dab5dab..feaf0c21b 100644 --- a/.github/workflows/example_tests.yml +++ b/.github/workflows/example_tests.yml @@ -70,7 +70,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }} + docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:26.01-py3' }} example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-l4-latest-1 @@ -87,7 +87,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }} + docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:26.01-py3' }} example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-h100-latest-2 @@ -103,7 +103,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4" + docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-h100-latest-1 @@ -117,7 +117,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4" + docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-h100-latest-2 @@ -133,7 +133,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt:25.08-py3" + docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3" example: ${{ matrix.example }} pip_install_extras: "[all,dev-test]" runner: linux-amd64-gpu-l4-latest-1 @@ -147,7 +147,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt:25.08-py3" + docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3" example: ${{ matrix.example }} pip_install_extras: "[all,dev-test]" runner: linux-amd64-gpu-l4-latest-1 diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 3e55682cd..713913681 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -63,14 +63,14 @@ jobs: fail-fast: false matrix: include: - - example: py312-cuda12-gpu + - example: py312-cuda13-gpu timeout: 90 - - example: py312-cuda12-gpu-megatron + - example: py312-cuda13-gpu-megatron timeout: 120 runs-on: linux-amd64-gpu-l4-latest-1 timeout-minutes: ${{ matrix.timeout }} container: &gpu_container - image: nvcr.io/nvidia/pytorch:25.06-py3 + image: nvcr.io/nvidia/pytorch:26.01-py3 env: GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py 
PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 6f7fad3a7..bb8ebc2d5 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -37,7 +37,7 @@ jobs: - uses: actions/checkout@v6 - uses: ./.github/actions/ubuntu-setup - name: Run unit tests - run: pip install tox && COV_ARGS="--cov" tox -e py312-torch29-tf_latest-unit + run: pip install tox && COV_ARGS="--cov" tox -e py312-torch210-tf_latest-unit - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v5 with: @@ -55,7 +55,7 @@ jobs: with: python-version: "3.12" - name: Run unit tests (without coverage) - run: pip install tox && tox -e py312-torch29-tf_latest-unit + run: pip install tox && tox -e py312-torch210-tf_latest-unit multi-py: if: github.event_name == 'pull_request' needs: [linux] @@ -70,7 +70,7 @@ jobs: with: python-version: "3.${{ matrix.py }}" - name: Run unit tests - run: pip install tox && tox -e py3${{ matrix.py }}-torch29-tf_latest-unit + run: pip install tox && tox -e py3${{ matrix.py }}-torch210-tf_latest-unit multi-torch: if: github.event_name == 'pull_request' needs: [linux] @@ -78,7 +78,7 @@ jobs: timeout-minutes: 30 strategy: matrix: - torch: [26, 27, 28] + torch: [26, 27, 28, 29] steps: - uses: actions/checkout@v6 - uses: ./.github/actions/ubuntu-setup @@ -96,7 +96,7 @@ jobs: - uses: actions/checkout@v6 - uses: ./.github/actions/ubuntu-setup - name: Run unit tests - run: pip install tox && tox -e py312-torch29-tf_${{ matrix.tf }}-unit + run: pip install tox && tox -e py312-torch210-tf_${{ matrix.tf }}-unit partial-install: if: github.event_name == 'pull_request' needs: [linux] diff --git a/docs/source/getting_started/_installation_for_Linux.rst b/docs/source/getting_started/_installation_for_Linux.rst index 0a82ecd1e..74276aa3b 100644 --- a/docs/source/getting_started/_installation_for_Linux.rst +++ b/docs/source/getting_started/_installation_for_Linux.rst @@ -14,11 +14,11 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system +-------------------------+-----------------------------+ | Python | >=3.10,<3.13 | +-------------------------+-----------------------------+ -| CUDA | >=12.0 | +| CUDA | 12.x, 13.x | +-------------------------+-----------------------------+ | PyTorch | >=2.6 | +-------------------------+-----------------------------+ -| TensorRT-LLM (Optional) | 1.2.0rc4 | +| TensorRT-LLM (Optional) | >=1.0 | +-------------------------+-----------------------------+ | ONNX Runtime (Optional) | 1.22 | +-------------------------+-----------------------------+ @@ -126,6 +126,10 @@ Additionally, we support installing dependencies for following 3rd-party package * - Huggingface (``transformers``, ``diffusers``, etc.) - ``[hf]`` +**CUDA specific dependencies** + +* By default, ``cupy-cuda12x`` is installed for INT4 ONNX quantization. If you have CUDA 13, you need to run ``pip uninstall -y cupy-cuda12x`` and ``pip install cupy-cuda13x`` after installing ``nvidia-modelopt[onnx]``. 
+ **Accelerated Quantization with Triton Kernels** ModelOpt includes optimized quantization kernels implemented with Triton language that accelerate quantization diff --git a/tox.ini b/tox.ini index ae296e5bd..f1e02836b 100644 --- a/tox.ini +++ b/tox.ini @@ -1,8 +1,9 @@ [tox] envlist= pre-commit-all - py312-torch28-tf_latest-unit - py312-cuda12-gpu + py312-torch210-tf_latest-unit + py312-cuda13-gpu + py312-cuda13-gpu-megatron skipsdist = True toxworkdir = /tmp/{env:USER}-modelopt-tox @@ -10,13 +11,14 @@ toxworkdir = /tmp/{env:USER}-modelopt-tox ############################ # CPU Unit test environments ############################ -[testenv:{py310,py311,py312}-torch{26,27,28,29}-tf_{min,latest}-unit] +[testenv:{py310,py311,py312}-torch{26,27,28,29,210}-tf_{min,latest}-unit] deps = # torch version auto-selected based on torchvision version torch26: torchvision~=0.21.0 torch27: torchvision~=0.22.0 torch28: torchvision~=0.23.0 torch29: torchvision~=0.24.0 + torch210: torchvision~=0.25.0 # Install megatron-core for special unit tests megatron-core @@ -36,8 +38,8 @@ commands = allowlist_externals = bash, rm deps = - # Make sure torch 2.9 is used - torchvision~=0.24.0 + # Make sure torch 2.10 is used + torchvision~=0.25.0 # ONNX unit tests heavily rely on torch / torchvision onnx: .[onnx,dev-test] @@ -57,7 +59,7 @@ commands = ########################################################### # GPU test environments (Should be used with --current-env) ########################################################### -[testenv:{py310,py311,py312}-cuda12-gpu] +[testenv:{py310,py311,py312}-cuda13-gpu] commands_pre = # Install deps here so that it gets installed even in --current-env pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git @@ -67,7 +69,7 @@ commands = # Coverage fails with "Can't combine line data with arc data" error so not using "--cov" python -m pytest tests/gpu -[testenv:{py310,py311,py312}-cuda12-gpu-megatron] +[testenv:{py310,py311,py312}-cuda13-gpu-megatron] commands_pre = # Install deps here so that it gets installed even in --current-env pip install -U megatron-core From dea2eeb63b9351122958b47ee70b5db3f106815d Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Fri, 13 Feb 2026 16:29:24 -0800 Subject: [PATCH 2/3] Fix failures Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .github/workflows/example_tests.yml | 8 ++-- .github/workflows/gpu_tests.yml | 8 ++-- .github/workflows/unit_tests.yml | 3 +- .../plugins/transformers_trainer.py | 48 +++++++++++++++++++ tests/_test_utils/import_helper.py | 33 ++++++++----- .../distill/plugins/test_distill_megatron.py | 4 -- .../export/test_unified_export_megatron.py | 3 -- .../test_vllm_fakequant_megatron_export.py | 3 -- .../test_megatron_gpt_dynamic_modules.py | 4 -- .../test_megatron_mamba_dynamic_modules.py | 2 +- .../torch/peft/plugins/test_megatron_peft.py | 9 +--- .../test_mcore_gpt_minitron_pruning.py | 4 -- .../test_mcore_mamba_minitron_pruning.py | 2 +- .../quantization/plugins/test_megatron.py | 4 -- .../test_speculative_megatron_modules.py | 4 -- tox.ini | 20 ++++---- 16 files changed, 93 insertions(+), 66 deletions(-) diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml index feaf0c21b..b8f5cfe4b 100644 --- a/.github/workflows/example_tests.yml +++ b/.github/workflows/example_tests.yml @@ -66,11 +66,11 @@ jobs: example: [llm_distill, llm_qat, llm_sparsity] include: - example: speculative_decoding - 
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3" + docker_image: "26.01" uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:26.01-py3' }} + docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-l4-latest-1 @@ -83,11 +83,11 @@ jobs: example: [llm_distill, llm_qat, llm_sparsity] include: - example: speculative_decoding - docker_image: "nvcr.io/nvidia/pytorch:26.01-py3" + docker_image: "26.01" uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:26.01-py3' }} + docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-h100-latest-2 diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 713913681..3c7ec0ed3 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -63,9 +63,9 @@ jobs: fail-fast: false matrix: include: - - example: py312-cuda13-gpu + - example: cuda13-gpu timeout: 90 - - example: py312-cuda13-gpu-megatron + - example: cuda13-gpu-megatron timeout: 120 runs-on: linux-amd64-gpu-l4-latest-1 timeout-minutes: ${{ matrix.timeout }} @@ -89,9 +89,9 @@ jobs: fail-fast: false matrix: include: - - example: py312-cuda12-gpu + - example: cuda13-gpu timeout: 90 - - example: py312-cuda12-gpu-megatron + - example: cuda13-gpu-megatron timeout: 120 runs-on: linux-amd64-gpu-h100-latest-2 timeout-minutes: ${{ matrix.timeout }} diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index bb8ebc2d5..252d4b719 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -55,7 +55,8 @@ jobs: with: python-version: "3.12" - name: Run unit tests (without coverage) - run: pip install tox && tox -e py312-torch210-tf_latest-unit + # Some issues with torch 2.10 on Windows, so using 2.9 for now + run: pip install tox && tox -e py312-torch29-tf_latest-unit multi-py: if: github.event_name == 'pull_request' needs: [linux] diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 5a10105a0..b92b240c0 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -15,6 +15,7 @@ """ModelOpt plugin for transformers Trainer.""" +import contextlib import gc import json import os @@ -100,6 +101,52 @@ class QuantizationArgumentsWithConfig(QuantizationArguments): ) +def _patch_fsdp2_post_backward(): + """Patch FSDP2 ``post_backward`` to handle mixed-precision gradient dtypes. + + FSDP2 with bf16 mixed precision upcasts bf16 parameters to fp32 for optimizer + precision, while gradients are reduced in bf16. In PyTorch >= 2.6, assigning a + bf16 gradient to a fp32 parameter raises a ``RuntimeError`` due to the + ``grad_dtype`` check, and the fused Adam optimizer also rejects mixed dtypes. + + This patch wraps ``FSDPParamGroup.post_backward`` to: + 1. Set ``grad_dtype=None`` on sharded params before reduction (allowing bf16 assignment). + 2. Cast gradients to match parameter dtype after reduction (so the optimizer sees matching dtypes). + + .. note:: + This is a workaround. 
The proper fix should come from PyTorch's FSDP2 + ``foreach_reduce`` (which should cast gradients to match the parameter dtype) + or from accelerate (which should set ``grad_dtype`` when it upcasts params). + Remove this once the upstream fix is available. + """ + try: + from torch.distributed.fsdp._fully_shard._fsdp_param_group import FSDPParamGroup + except ImportError: + return + + if hasattr(FSDPParamGroup, "_modelopt_original_post_backward"): + return # Already patched + + FSDPParamGroup._modelopt_original_post_backward = FSDPParamGroup.post_backward + + @torch.no_grad() + def _patched_post_backward(self): + # Allow bf16 gradients to be assigned to fp32 parameters + for fsdp_param in self.fsdp_params: + with contextlib.suppress(AttributeError): + fsdp_param.sharded_param.grad_dtype = None + + self._modelopt_original_post_backward() + + # Cast gradients to parameter dtype so the optimizer sees matching dtypes + for fsdp_param in self.fsdp_params: + sp = fsdp_param.sharded_param + if sp.grad is not None and sp.grad.dtype != sp.dtype: + sp.grad = sp.grad.to(sp.dtype) + + FSDPParamGroup.post_backward = _patched_post_backward + + def check_awq_smoothquant(quant_cfg): # TODO: Remove this once deepspeed for AWQ and SmoothQuant is added """Get the quantization type from the configuration.""" @@ -337,6 +384,7 @@ def _patch_accelerate_for_fsdp2_fix(self): is causing issues with quantized models since quantization modules adds buffers which are not sharded. This patch hides the buffers added by quantization modules from the original accelerate prepare. """ + _patch_fsdp2_post_backward() def _modelopt_prepare(self, *args, **kwargs): if not self.is_fsdp2: diff --git a/tests/_test_utils/import_helper.py b/tests/_test_utils/import_helper.py index 43f974935..c8c17f2f1 100644 --- a/tests/_test_utils/import_helper.py +++ b/tests/_test_utils/import_helper.py @@ -12,8 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import ctypes import importlib.metadata +import os import shutil import pytest @@ -28,6 +29,23 @@ def skip_if_no_tensorrt(): except (AssertionError, ImportError) as e: pytest.skip(f"{e}", allow_module_level=True) + # Also verify that ORT's TensorRT EP can actually load its native library. + # The tensorrt Python package may be installed, but ORT's provider shared library + # (libonnxruntime_providers_tensorrt.so) could fail to load due to CUDA version + # mismatches (e.g., ORT built for CUDA 12 running on a CUDA 13 system). 
+ try: + import onnxruntime + + ort_capi_dir = os.path.join(os.path.dirname(onnxruntime.__file__), "capi") + trt_provider_lib = os.path.join(ort_capi_dir, "libonnxruntime_providers_tensorrt.so") + if os.path.isfile(trt_provider_lib): + ctypes.CDLL(trt_provider_lib) + except OSError as e: + pytest.skip( + f"ORT TensorRT EP native library cannot be loaded: {e}", + allow_module_level=True, + ) + def skip_if_no_trtexec(): if not shutil.which("trtexec"): @@ -43,19 +61,12 @@ def skip_if_no_libcudnn(): pytest.skip(f"{e}!", allow_module_level=True) -def skip_if_no_megatron(apex_or_te_required: bool = False, mamba_required: bool = False): +def skip_if_no_megatron(*, te_required: bool = True, mamba_required: bool = False): try: import megatron # noqa: F401 except ImportError: pytest.skip("megatron not available", allow_module_level=True) - try: - import apex # noqa: F401 - - has_apex = True - except ImportError: - has_apex = False - try: import transformer_engine # noqa: F401 @@ -70,8 +81,8 @@ def skip_if_no_megatron(apex_or_te_required: bool = False, mamba_required: bool except ImportError: has_mamba = False - if apex_or_te_required and not has_apex and not has_te: - pytest.skip("Apex or TE required for Megatron test", allow_module_level=True) + if te_required and not has_te: + pytest.skip("TE required for Megatron test", allow_module_level=True) if mamba_required and not has_mamba: pytest.skip("Mamba required for Megatron test", allow_module_level=True) diff --git a/tests/gpu_megatron/torch/distill/plugins/test_distill_megatron.py b/tests/gpu_megatron/torch/distill/plugins/test_distill_megatron.py index 6e1833dd6..b3b35e792 100644 --- a/tests/gpu_megatron/torch/distill/plugins/test_distill_megatron.py +++ b/tests/gpu_megatron/torch/distill/plugins/test_distill_megatron.py @@ -16,10 +16,6 @@ from functools import partial import torch -from _test_utils.import_helper import skip_if_no_megatron - -skip_if_no_megatron(apex_or_te_required=True) - from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model from _test_utils.torch.megatron.utils import run_mcore_inference_with_dummy_input diff --git a/tests/gpu_megatron/torch/export/test_unified_export_megatron.py b/tests/gpu_megatron/torch/export/test_unified_export_megatron.py index 376efff2e..e931e6a95 100644 --- a/tests/gpu_megatron/torch/export/test_unified_export_megatron.py +++ b/tests/gpu_megatron/torch/export/test_unified_export_megatron.py @@ -21,14 +21,11 @@ import pytest import torch import transformers -from _test_utils.import_helper import skip_if_no_megatron from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model from _test_utils.torch.megatron.utils import get_forward from _test_utils.torch.transformers_models import create_tiny_llama_dir -skip_if_no_megatron(apex_or_te_required=True) - import modelopt.torch.quantization as mtq import modelopt.torch.speculative as mtsp from modelopt.torch.export import KV_CACHE_FP8, export_mcore_gpt_to_hf, import_mcore_gpt_from_hf diff --git a/tests/gpu_megatron/torch/export/test_vllm_fakequant_megatron_export.py b/tests/gpu_megatron/torch/export/test_vllm_fakequant_megatron_export.py index ea351db6a..8e4578d7b 100644 --- a/tests/gpu_megatron/torch/export/test_vllm_fakequant_megatron_export.py +++ b/tests/gpu_megatron/torch/export/test_vllm_fakequant_megatron_export.py @@ -18,15 +18,12 @@ import pytest import torch -from _test_utils.import_helper 
import skip_if_no_megatron from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model import modelopt.torch.quantization as mtq from modelopt.torch.export import export_mcore_gpt_to_hf_vllm_fq -skip_if_no_megatron(apex_or_te_required=True) - def _test_mcore_vllm_export(tmp_path, quant_cfg, rank, size): """Test megatron-core model export for vLLM with fake quantization.""" diff --git a/tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py b/tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py index 6771bb9a0..df1c6e240 100644 --- a/tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py +++ b/tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py @@ -17,10 +17,6 @@ import pytest import torch -from _test_utils.import_helper import skip_if_no_megatron - -skip_if_no_megatron(apex_or_te_required=True) - from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model from _test_utils.torch.megatron.utils import run_mcore_inference diff --git a/tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py b/tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py index aa499abf0..de743dc36 100644 --- a/tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py +++ b/tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py @@ -17,7 +17,7 @@ import torch from _test_utils.import_helper import skip_if_no_megatron -skip_if_no_megatron(apex_or_te_required=True, mamba_required=True) +skip_if_no_megatron(mamba_required=True) from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_mamba_hybrid_model diff --git a/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py b/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py index 34d22d2fd..1615321ae 100644 --- a/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py +++ b/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py @@ -19,22 +19,17 @@ import pytest import torch import torch.nn.init as init -from _test_utils.import_helper import skip_if_no_megatron from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model from _test_utils.torch.megatron.utils import initialize_for_megatron from megatron.core import dist_checkpointing +import modelopt.torch.peft as mtpeft +import modelopt.torch.quantization as mtq from modelopt.torch.opt.plugins.mcore_dist_checkpointing import ( restore_sharded_modelopt_state, save_sharded_modelopt_state, ) - -skip_if_no_megatron() - - -import modelopt.torch.peft as mtpeft -import modelopt.torch.quantization as mtq from modelopt.torch.peft.lora.layer import LoRAModule from modelopt.torch.utils.plugins import megatron_prefill diff --git a/tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py b/tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py index 92cffc572..b30aa11f0 100644 --- a/tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py +++ b/tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py @@ -17,10 +17,6 @@ import pytest import torch -from _test_utils.import_helper import skip_if_no_megatron - -skip_if_no_megatron(apex_or_te_required=True) - from _test_utils.torch.distributed.utils 
import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model from _test_utils.torch.megatron.utils import ( diff --git a/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py b/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py index 79fe7faf2..69b286c6b 100644 --- a/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py +++ b/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py @@ -22,7 +22,7 @@ import torch from _test_utils.import_helper import skip_if_no_megatron -skip_if_no_megatron(apex_or_te_required=True, mamba_required=True) +skip_if_no_megatron(mamba_required=True) from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_mamba_hybrid_model diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py index b107eca71..6fdb4b60d 100644 --- a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py +++ b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py @@ -18,7 +18,6 @@ import pytest import torch -from _test_utils.import_helper import skip_if_no_megatron from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import ( MegatronModel, @@ -41,9 +40,6 @@ data_tensor_context_parallel_test_helper, verify_kv_cache_amax_sync, ) - -skip_if_no_megatron() - from megatron.core.parallel_state import ( destroy_model_parallel, get_data_parallel_group, diff --git a/tests/gpu_megatron/torch/speculative/plugins/test_speculative_megatron_modules.py b/tests/gpu_megatron/torch/speculative/plugins/test_speculative_megatron_modules.py index 5a149b77f..0bb9658ff 100644 --- a/tests/gpu_megatron/torch/speculative/plugins/test_speculative_megatron_modules.py +++ b/tests/gpu_megatron/torch/speculative/plugins/test_speculative_megatron_modules.py @@ -16,10 +16,6 @@ import pytest import torch -from _test_utils.import_helper import skip_if_no_megatron - -skip_if_no_megatron(apex_or_te_required=True) - from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model diff --git a/tox.ini b/tox.ini index f1e02836b..079e9a2ad 100644 --- a/tox.ini +++ b/tox.ini @@ -2,8 +2,8 @@ envlist= pre-commit-all py312-torch210-tf_latest-unit - py312-cuda13-gpu - py312-cuda13-gpu-megatron + cuda13-gpu + cuda13-gpu-megatron skipsdist = True toxworkdir = /tmp/{env:USER}-modelopt-tox @@ -59,26 +59,24 @@ commands = ########################################################### # GPU test environments (Should be used with --current-env) ########################################################### -[testenv:{py310,py311,py312}-cuda13-gpu] +[testenv:cuda13-gpu] commands_pre = # Install deps here so that it gets installed even in --current-env - pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git - + pip install --no-build-isolation git+https://github.com/Dao-AILab/fast-hadamard-transform.git pip install -e .[all,dev-test] + + # Install cupy-cuda13x for INT4 ONNX quantization (default is cupy-cuda12x) + pip uninstall -y cupy-cuda12x + pip install cupy-cuda13x commands = # Coverage fails with "Can't combine line data with arc data" error so not using "--cov" python -m pytest tests/gpu -[testenv:{py310,py311,py312}-cuda13-gpu-megatron] +[testenv:cuda13-gpu-megatron] commands_pre = # Install deps here so that it 
gets installed even in --current-env pip install -U megatron-core - - # Skip triton because pytorch-triton is installed in the NGC PyTorch containers - pip install pip-mark-installed - pip-mark-installed triton pip install --no-build-isolation git+https://github.com/state-spaces/mamba.git - pip install -e .[all,dev-test] commands = # Coverage fails with "Can't combine line data with arc data" error so not using "--cov" From 4f5a65fadf1c45a0916931bf95fd4fc5655f3661 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Wed, 18 Feb 2026 04:04:50 -0800 Subject: [PATCH 3/3] Fix test_nvfp4_onnx_export.py Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- tests/_test_utils/import_helper.py | 2 +- tests/gpu/torch/quantization/test_nvfp4_onnx_export.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/_test_utils/import_helper.py b/tests/_test_utils/import_helper.py index c8c17f2f1..a1148480a 100644 --- a/tests/_test_utils/import_helper.py +++ b/tests/_test_utils/import_helper.py @@ -99,5 +99,5 @@ def skip_if_onnx_version_above_1_18(): if version.parse(installed_version) > version.parse(required_version): pytest.skip( - f"{package_name} version {installed_version} is less than required {required_version}" + f"{package_name} version {installed_version} is greater than required {required_version}" ) diff --git a/tests/gpu/torch/quantization/test_nvfp4_onnx_export.py b/tests/gpu/torch/quantization/test_nvfp4_onnx_export.py index c5a792209..8ffdf3829 100644 --- a/tests/gpu/torch/quantization/test_nvfp4_onnx_export.py +++ b/tests/gpu/torch/quantization/test_nvfp4_onnx_export.py @@ -98,6 +98,7 @@ def forward_loop(model, run_backward=False): output_names=["output"], export_params=True, opset_version=17, + dynamo=False, ) onnx_model = NVFP4QuantExporter.process_model(onnx.load(onnx_path))
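
For reference outside pytest, below is a minimal standalone sketch of the same native-library load check that the updated skip_if_no_tensorrt helper in tests/_test_utils/import_helper.py performs. The capi/ directory layout and the Linux-only libonnxruntime_providers_tensorrt.so filename mirror the helper above; treating them as fixed for other platforms or onnxruntime packagings is an assumption.

    import ctypes
    import os


    def tensorrt_ep_loadable() -> bool:
        """Return True if ORT's TensorRT EP shared library can actually be dlopen'ed.

        The onnxruntime (and tensorrt) Python packages may import fine while the
        provider library still fails to load, e.g. an ORT build targeting CUDA 12
        running on a CUDA 13 system.
        """
        try:
            import onnxruntime
        except ImportError:
            return False

        # Provider libraries ship next to the C API bindings in the capi/ directory.
        lib_path = os.path.join(
            os.path.dirname(onnxruntime.__file__),
            "capi",
            "libonnxruntime_providers_tensorrt.so",  # Linux name; other layouts are assumptions
        )
        if not os.path.isfile(lib_path):
            # This onnxruntime build does not ship the TensorRT EP at all.
            return False
        try:
            ctypes.CDLL(lib_path)
            return True
        except OSError:
            # Typically unresolved CUDA/TensorRT symbols or a runtime version mismatch.
            return False

A caller could gate TensorRT-EP-dependent work on tensorrt_ep_loadable() instead of only checking that the tensorrt package imports, which is exactly the failure mode the patched skip helper guards against in CI.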