From f0bd99c8daed15ae2e83b849b702270667fd130e Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Fri, 13 Feb 2026 15:27:24 -0800 Subject: [PATCH 1/3] Upgrade Dev containers for CICD to latest Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .github/workflows/example_tests.yml | 12 ++++++------ .github/workflows/gpu_tests.yml | 6 +++--- .github/workflows/unit_tests.yml | 10 +++++----- .../getting_started/_installation_for_Linux.rst | 8 ++++++-- tox.ini | 16 +++++++++------- 5 files changed, 29 insertions(+), 23 deletions(-) diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml index c1dab5dab..feaf0c21b 100644 --- a/.github/workflows/example_tests.yml +++ b/.github/workflows/example_tests.yml @@ -70,7 +70,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }} + docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:26.01-py3' }} example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-l4-latest-1 @@ -87,7 +87,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }} + docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:26.01-py3' }} example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-h100-latest-2 @@ -103,7 +103,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4" + docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-h100-latest-1 @@ -117,7 +117,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4" + docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-h100-latest-2 @@ -133,7 +133,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt:25.08-py3" + docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3" example: ${{ matrix.example }} pip_install_extras: "[all,dev-test]" runner: linux-amd64-gpu-l4-latest-1 @@ -147,7 +147,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt:25.08-py3" + docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3" example: ${{ matrix.example }} pip_install_extras: "[all,dev-test]" runner: linux-amd64-gpu-l4-latest-1 diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 3e55682cd..713913681 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -63,14 +63,14 @@ jobs: fail-fast: false matrix: include: - - example: py312-cuda12-gpu + - example: py312-cuda13-gpu timeout: 90 - - example: py312-cuda12-gpu-megatron + - example: py312-cuda13-gpu-megatron timeout: 120 runs-on: linux-amd64-gpu-l4-latest-1 timeout-minutes: ${{ matrix.timeout }} container: &gpu_container - image: nvcr.io/nvidia/pytorch:25.06-py3 + image: nvcr.io/nvidia/pytorch:26.01-py3 env: GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py 
PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 6f7fad3a7..bb8ebc2d5 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -37,7 +37,7 @@ jobs: - uses: actions/checkout@v6 - uses: ./.github/actions/ubuntu-setup - name: Run unit tests - run: pip install tox && COV_ARGS="--cov" tox -e py312-torch29-tf_latest-unit + run: pip install tox && COV_ARGS="--cov" tox -e py312-torch210-tf_latest-unit - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v5 with: @@ -55,7 +55,7 @@ jobs: with: python-version: "3.12" - name: Run unit tests (without coverage) - run: pip install tox && tox -e py312-torch29-tf_latest-unit + run: pip install tox && tox -e py312-torch210-tf_latest-unit multi-py: if: github.event_name == 'pull_request' needs: [linux] @@ -70,7 +70,7 @@ jobs: with: python-version: "3.${{ matrix.py }}" - name: Run unit tests - run: pip install tox && tox -e py3${{ matrix.py }}-torch29-tf_latest-unit + run: pip install tox && tox -e py3${{ matrix.py }}-torch210-tf_latest-unit multi-torch: if: github.event_name == 'pull_request' needs: [linux] @@ -78,7 +78,7 @@ jobs: timeout-minutes: 30 strategy: matrix: - torch: [26, 27, 28] + torch: [26, 27, 28, 29] steps: - uses: actions/checkout@v6 - uses: ./.github/actions/ubuntu-setup @@ -96,7 +96,7 @@ jobs: - uses: actions/checkout@v6 - uses: ./.github/actions/ubuntu-setup - name: Run unit tests - run: pip install tox && tox -e py312-torch29-tf_${{ matrix.tf }}-unit + run: pip install tox && tox -e py312-torch210-tf_${{ matrix.tf }}-unit partial-install: if: github.event_name == 'pull_request' needs: [linux] diff --git a/docs/source/getting_started/_installation_for_Linux.rst b/docs/source/getting_started/_installation_for_Linux.rst index 0a82ecd1e..74276aa3b 100644 --- a/docs/source/getting_started/_installation_for_Linux.rst +++ b/docs/source/getting_started/_installation_for_Linux.rst @@ -14,11 +14,11 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system +-------------------------+-----------------------------+ | Python | >=3.10,<3.13 | +-------------------------+-----------------------------+ -| CUDA | >=12.0 | +| CUDA | 12.x, 13.x | +-------------------------+-----------------------------+ | PyTorch | >=2.6 | +-------------------------+-----------------------------+ -| TensorRT-LLM (Optional) | 1.2.0rc4 | +| TensorRT-LLM (Optional) | >=1.0 | +-------------------------+-----------------------------+ | ONNX Runtime (Optional) | 1.22 | +-------------------------+-----------------------------+ @@ -126,6 +126,10 @@ Additionally, we support installing dependencies for following 3rd-party package * - Huggingface (``transformers``, ``diffusers``, etc.) - ``[hf]`` +**CUDA specific dependencies** + +* By default, ``cupy-cuda12x`` is installed for INT4 ONNX quantization. If you have CUDA 13, you need to run ``pip uninstall -y cupy-cuda12x`` and ``pip install cupy-cuda13x`` after installing ``nvidia-modelopt[onnx]``. 
+ **Accelerated Quantization with Triton Kernels** ModelOpt includes optimized quantization kernels implemented with Triton language that accelerate quantization diff --git a/tox.ini b/tox.ini index ae296e5bd..f1e02836b 100644 --- a/tox.ini +++ b/tox.ini @@ -1,8 +1,9 @@ [tox] envlist= pre-commit-all - py312-torch28-tf_latest-unit - py312-cuda12-gpu + py312-torch210-tf_latest-unit + py312-cuda13-gpu + py312-cuda13-gpu-megatron skipsdist = True toxworkdir = /tmp/{env:USER}-modelopt-tox @@ -10,13 +11,14 @@ toxworkdir = /tmp/{env:USER}-modelopt-tox ############################ # CPU Unit test environments ############################ -[testenv:{py310,py311,py312}-torch{26,27,28,29}-tf_{min,latest}-unit] +[testenv:{py310,py311,py312}-torch{26,27,28,29,210}-tf_{min,latest}-unit] deps = # torch version auto-selected based on torchvision version torch26: torchvision~=0.21.0 torch27: torchvision~=0.22.0 torch28: torchvision~=0.23.0 torch29: torchvision~=0.24.0 + torch210: torchvision~=0.25.0 # Install megatron-core for special unit tests megatron-core @@ -36,8 +38,8 @@ commands = allowlist_externals = bash, rm deps = - # Make sure torch 2.9 is used - torchvision~=0.24.0 + # Make sure torch 2.10 is used + torchvision~=0.25.0 # ONNX unit tests heavily rely on torch / torchvision onnx: .[onnx,dev-test] @@ -57,7 +59,7 @@ commands = ########################################################### # GPU test environments (Should be used with --current-env) ########################################################### -[testenv:{py310,py311,py312}-cuda12-gpu] +[testenv:{py310,py311,py312}-cuda13-gpu] commands_pre = # Install deps here so that it gets installed even in --current-env pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git @@ -67,7 +69,7 @@ commands = # Coverage fails with "Can't combine line data with arc data" error so not using "--cov" python -m pytest tests/gpu -[testenv:{py310,py311,py312}-cuda12-gpu-megatron] +[testenv:{py310,py311,py312}-cuda13-gpu-megatron] commands_pre = # Install deps here so that it gets installed even in --current-env pip install -U megatron-core From dea2eeb63b9351122958b47ee70b5db3f106815d Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Fri, 13 Feb 2026 16:29:24 -0800 Subject: [PATCH 2/3] Fix failures Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .github/workflows/example_tests.yml | 8 ++-- .github/workflows/gpu_tests.yml | 8 ++-- .github/workflows/unit_tests.yml | 3 +- .../plugins/transformers_trainer.py | 48 +++++++++++++++++++ tests/_test_utils/import_helper.py | 33 ++++++++----- .../distill/plugins/test_distill_megatron.py | 4 -- .../export/test_unified_export_megatron.py | 3 -- .../test_vllm_fakequant_megatron_export.py | 3 -- .../test_megatron_gpt_dynamic_modules.py | 4 -- .../test_megatron_mamba_dynamic_modules.py | 2 +- .../torch/peft/plugins/test_megatron_peft.py | 9 +--- .../test_mcore_gpt_minitron_pruning.py | 4 -- .../test_mcore_mamba_minitron_pruning.py | 2 +- .../quantization/plugins/test_megatron.py | 4 -- .../test_speculative_megatron_modules.py | 4 -- tox.ini | 20 ++++---- 16 files changed, 93 insertions(+), 66 deletions(-) diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml index feaf0c21b..b8f5cfe4b 100644 --- a/.github/workflows/example_tests.yml +++ b/.github/workflows/example_tests.yml @@ -66,11 +66,11 @@ jobs: example: [llm_distill, llm_qat, llm_sparsity] include: - example: speculative_decoding - 
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3" + docker_image: "26.01" uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:26.01-py3' }} + docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-l4-latest-1 @@ -83,11 +83,11 @@ jobs: example: [llm_distill, llm_qat, llm_sparsity] include: - example: speculative_decoding - docker_image: "nvcr.io/nvidia/pytorch:26.01-py3" + docker_image: "26.01" uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:26.01-py3' }} + docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-h100-latest-2 diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 713913681..3c7ec0ed3 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -63,9 +63,9 @@ jobs: fail-fast: false matrix: include: - - example: py312-cuda13-gpu + - example: cuda13-gpu timeout: 90 - - example: py312-cuda13-gpu-megatron + - example: cuda13-gpu-megatron timeout: 120 runs-on: linux-amd64-gpu-l4-latest-1 timeout-minutes: ${{ matrix.timeout }} @@ -89,9 +89,9 @@ jobs: fail-fast: false matrix: include: - - example: py312-cuda12-gpu + - example: cuda13-gpu timeout: 90 - - example: py312-cuda12-gpu-megatron + - example: cuda13-gpu-megatron timeout: 120 runs-on: linux-amd64-gpu-h100-latest-2 timeout-minutes: ${{ matrix.timeout }} diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index bb8ebc2d5..252d4b719 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -55,7 +55,8 @@ jobs: with: python-version: "3.12" - name: Run unit tests (without coverage) - run: pip install tox && tox -e py312-torch210-tf_latest-unit + # Some issues with torch 2.10 on Windows, so using 2.9 for now + run: pip install tox && tox -e py312-torch29-tf_latest-unit multi-py: if: github.event_name == 'pull_request' needs: [linux] diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 5a10105a0..b92b240c0 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -15,6 +15,7 @@ """ModelOpt plugin for transformers Trainer.""" +import contextlib import gc import json import os @@ -100,6 +101,52 @@ class QuantizationArgumentsWithConfig(QuantizationArguments): ) +def _patch_fsdp2_post_backward(): + """Patch FSDP2 ``post_backward`` to handle mixed-precision gradient dtypes. + + FSDP2 with bf16 mixed precision upcasts bf16 parameters to fp32 for optimizer + precision, while gradients are reduced in bf16. In PyTorch >= 2.6, assigning a + bf16 gradient to a fp32 parameter raises a ``RuntimeError`` due to the + ``grad_dtype`` check, and the fused Adam optimizer also rejects mixed dtypes. + + This patch wraps ``FSDPParamGroup.post_backward`` to: + 1. Set ``grad_dtype=None`` on sharded params before reduction (allowing bf16 assignment). + 2. Cast gradients to match parameter dtype after reduction (so the optimizer sees matching dtypes). + + .. note:: + This is a workaround. 
The proper fix should come from PyTorch's FSDP2 + ``foreach_reduce`` (which should cast gradients to match the parameter dtype) + or from accelerate (which should set ``grad_dtype`` when it upcasts params). + Remove this once the upstream fix is available. + """ + try: + from torch.distributed.fsdp._fully_shard._fsdp_param_group import FSDPParamGroup + except ImportError: + return + + if hasattr(FSDPParamGroup, "_modelopt_original_post_backward"): + return # Already patched + + FSDPParamGroup._modelopt_original_post_backward = FSDPParamGroup.post_backward + + @torch.no_grad() + def _patched_post_backward(self): + # Allow bf16 gradients to be assigned to fp32 parameters + for fsdp_param in self.fsdp_params: + with contextlib.suppress(AttributeError): + fsdp_param.sharded_param.grad_dtype = None + + self._modelopt_original_post_backward() + + # Cast gradients to parameter dtype so the optimizer sees matching dtypes + for fsdp_param in self.fsdp_params: + sp = fsdp_param.sharded_param + if sp.grad is not None and sp.grad.dtype != sp.dtype: + sp.grad = sp.grad.to(sp.dtype) + + FSDPParamGroup.post_backward = _patched_post_backward + + def check_awq_smoothquant(quant_cfg): # TODO: Remove this once deepspeed for AWQ and SmoothQuant is added """Get the quantization type from the configuration.""" @@ -337,6 +384,7 @@ def _patch_accelerate_for_fsdp2_fix(self): is causing issues with quantized models since quantization modules adds buffers which are not sharded. This patch hides the buffers added by quantization modules from the original accelerate prepare. """ + _patch_fsdp2_post_backward() def _modelopt_prepare(self, *args, **kwargs): if not self.is_fsdp2: diff --git a/tests/_test_utils/import_helper.py b/tests/_test_utils/import_helper.py index 43f974935..c8c17f2f1 100644 --- a/tests/_test_utils/import_helper.py +++ b/tests/_test_utils/import_helper.py @@ -12,8 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import ctypes import importlib.metadata +import os import shutil import pytest @@ -28,6 +29,23 @@ def skip_if_no_tensorrt(): except (AssertionError, ImportError) as e: pytest.skip(f"{e}", allow_module_level=True) + # Also verify that ORT's TensorRT EP can actually load its native library. + # The tensorrt Python package may be installed, but ORT's provider shared library + # (libonnxruntime_providers_tensorrt.so) could fail to load due to CUDA version + # mismatches (e.g., ORT built for CUDA 12 running on a CUDA 13 system). 
+ try: + import onnxruntime + + ort_capi_dir = os.path.join(os.path.dirname(onnxruntime.__file__), "capi") + trt_provider_lib = os.path.join(ort_capi_dir, "libonnxruntime_providers_tensorrt.so") + if os.path.isfile(trt_provider_lib): + ctypes.CDLL(trt_provider_lib) + except OSError as e: + pytest.skip( + f"ORT TensorRT EP native library cannot be loaded: {e}", + allow_module_level=True, + ) + def skip_if_no_trtexec(): if not shutil.which("trtexec"): @@ -43,19 +61,12 @@ def skip_if_no_libcudnn(): pytest.skip(f"{e}!", allow_module_level=True) -def skip_if_no_megatron(apex_or_te_required: bool = False, mamba_required: bool = False): +def skip_if_no_megatron(*, te_required: bool = True, mamba_required: bool = False): try: import megatron # noqa: F401 except ImportError: pytest.skip("megatron not available", allow_module_level=True) - try: - import apex # noqa: F401 - - has_apex = True - except ImportError: - has_apex = False - try: import transformer_engine # noqa: F401 @@ -70,8 +81,8 @@ def skip_if_no_megatron(apex_or_te_required: bool = False, mamba_required: bool except ImportError: has_mamba = False - if apex_or_te_required and not has_apex and not has_te: - pytest.skip("Apex or TE required for Megatron test", allow_module_level=True) + if te_required and not has_te: + pytest.skip("TE required for Megatron test", allow_module_level=True) if mamba_required and not has_mamba: pytest.skip("Mamba required for Megatron test", allow_module_level=True) diff --git a/tests/gpu_megatron/torch/distill/plugins/test_distill_megatron.py b/tests/gpu_megatron/torch/distill/plugins/test_distill_megatron.py index 6e1833dd6..b3b35e792 100644 --- a/tests/gpu_megatron/torch/distill/plugins/test_distill_megatron.py +++ b/tests/gpu_megatron/torch/distill/plugins/test_distill_megatron.py @@ -16,10 +16,6 @@ from functools import partial import torch -from _test_utils.import_helper import skip_if_no_megatron - -skip_if_no_megatron(apex_or_te_required=True) - from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model from _test_utils.torch.megatron.utils import run_mcore_inference_with_dummy_input diff --git a/tests/gpu_megatron/torch/export/test_unified_export_megatron.py b/tests/gpu_megatron/torch/export/test_unified_export_megatron.py index 376efff2e..e931e6a95 100644 --- a/tests/gpu_megatron/torch/export/test_unified_export_megatron.py +++ b/tests/gpu_megatron/torch/export/test_unified_export_megatron.py @@ -21,14 +21,11 @@ import pytest import torch import transformers -from _test_utils.import_helper import skip_if_no_megatron from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model from _test_utils.torch.megatron.utils import get_forward from _test_utils.torch.transformers_models import create_tiny_llama_dir -skip_if_no_megatron(apex_or_te_required=True) - import modelopt.torch.quantization as mtq import modelopt.torch.speculative as mtsp from modelopt.torch.export import KV_CACHE_FP8, export_mcore_gpt_to_hf, import_mcore_gpt_from_hf diff --git a/tests/gpu_megatron/torch/export/test_vllm_fakequant_megatron_export.py b/tests/gpu_megatron/torch/export/test_vllm_fakequant_megatron_export.py index ea351db6a..8e4578d7b 100644 --- a/tests/gpu_megatron/torch/export/test_vllm_fakequant_megatron_export.py +++ b/tests/gpu_megatron/torch/export/test_vllm_fakequant_megatron_export.py @@ -18,15 +18,12 @@ import pytest import torch -from _test_utils.import_helper 
import skip_if_no_megatron from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model import modelopt.torch.quantization as mtq from modelopt.torch.export import export_mcore_gpt_to_hf_vllm_fq -skip_if_no_megatron(apex_or_te_required=True) - def _test_mcore_vllm_export(tmp_path, quant_cfg, rank, size): """Test megatron-core model export for vLLM with fake quantization.""" diff --git a/tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py b/tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py index 6771bb9a0..df1c6e240 100644 --- a/tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py +++ b/tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py @@ -17,10 +17,6 @@ import pytest import torch -from _test_utils.import_helper import skip_if_no_megatron - -skip_if_no_megatron(apex_or_te_required=True) - from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model from _test_utils.torch.megatron.utils import run_mcore_inference diff --git a/tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py b/tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py index aa499abf0..de743dc36 100644 --- a/tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py +++ b/tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py @@ -17,7 +17,7 @@ import torch from _test_utils.import_helper import skip_if_no_megatron -skip_if_no_megatron(apex_or_te_required=True, mamba_required=True) +skip_if_no_megatron(mamba_required=True) from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_mamba_hybrid_model diff --git a/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py b/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py index 34d22d2fd..1615321ae 100644 --- a/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py +++ b/tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py @@ -19,22 +19,17 @@ import pytest import torch import torch.nn.init as init -from _test_utils.import_helper import skip_if_no_megatron from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model from _test_utils.torch.megatron.utils import initialize_for_megatron from megatron.core import dist_checkpointing +import modelopt.torch.peft as mtpeft +import modelopt.torch.quantization as mtq from modelopt.torch.opt.plugins.mcore_dist_checkpointing import ( restore_sharded_modelopt_state, save_sharded_modelopt_state, ) - -skip_if_no_megatron() - - -import modelopt.torch.peft as mtpeft -import modelopt.torch.quantization as mtq from modelopt.torch.peft.lora.layer import LoRAModule from modelopt.torch.utils.plugins import megatron_prefill diff --git a/tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py b/tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py index 92cffc572..b30aa11f0 100644 --- a/tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py +++ b/tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py @@ -17,10 +17,6 @@ import pytest import torch -from _test_utils.import_helper import skip_if_no_megatron - -skip_if_no_megatron(apex_or_te_required=True) - from _test_utils.torch.distributed.utils 
import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model from _test_utils.torch.megatron.utils import ( diff --git a/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py b/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py index 79fe7faf2..69b286c6b 100644 --- a/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py +++ b/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py @@ -22,7 +22,7 @@ import torch from _test_utils.import_helper import skip_if_no_megatron -skip_if_no_megatron(apex_or_te_required=True, mamba_required=True) +skip_if_no_megatron(mamba_required=True) from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_mamba_hybrid_model diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py index b107eca71..6fdb4b60d 100644 --- a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py +++ b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py @@ -18,7 +18,6 @@ import pytest import torch -from _test_utils.import_helper import skip_if_no_megatron from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import ( MegatronModel, @@ -41,9 +40,6 @@ data_tensor_context_parallel_test_helper, verify_kv_cache_amax_sync, ) - -skip_if_no_megatron() - from megatron.core.parallel_state import ( destroy_model_parallel, get_data_parallel_group, diff --git a/tests/gpu_megatron/torch/speculative/plugins/test_speculative_megatron_modules.py b/tests/gpu_megatron/torch/speculative/plugins/test_speculative_megatron_modules.py index 5a149b77f..0bb9658ff 100644 --- a/tests/gpu_megatron/torch/speculative/plugins/test_speculative_megatron_modules.py +++ b/tests/gpu_megatron/torch/speculative/plugins/test_speculative_megatron_modules.py @@ -16,10 +16,6 @@ import pytest import torch -from _test_utils.import_helper import skip_if_no_megatron - -skip_if_no_megatron(apex_or_te_required=True) - from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.megatron.models import get_mcore_gpt_model diff --git a/tox.ini b/tox.ini index f1e02836b..079e9a2ad 100644 --- a/tox.ini +++ b/tox.ini @@ -2,8 +2,8 @@ envlist= pre-commit-all py312-torch210-tf_latest-unit - py312-cuda13-gpu - py312-cuda13-gpu-megatron + cuda13-gpu + cuda13-gpu-megatron skipsdist = True toxworkdir = /tmp/{env:USER}-modelopt-tox @@ -59,26 +59,24 @@ commands = ########################################################### # GPU test environments (Should be used with --current-env) ########################################################### -[testenv:{py310,py311,py312}-cuda13-gpu] +[testenv:cuda13-gpu] commands_pre = # Install deps here so that it gets installed even in --current-env - pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git - + pip install --no-build-isolation git+https://github.com/Dao-AILab/fast-hadamard-transform.git pip install -e .[all,dev-test] + + # Install cupy-cuda13x for INT4 ONNX quantization (default is cupy-cuda12x) + pip uninstall -y cupy-cuda12x + pip install cupy-cuda13x commands = # Coverage fails with "Can't combine line data with arc data" error so not using "--cov" python -m pytest tests/gpu -[testenv:{py310,py311,py312}-cuda13-gpu-megatron] +[testenv:cuda13-gpu-megatron] commands_pre = # Install deps here so that it 
gets installed even in --current-env pip install -U megatron-core - - # Skip triton because pytorch-triton is installed in the NGC PyTorch containers - pip install pip-mark-installed - pip-mark-installed triton pip install --no-build-isolation git+https://github.com/state-spaces/mamba.git - pip install -e .[all,dev-test] commands = # Coverage fails with "Can't combine line data with arc data" error so not using "--cov" From 4f5a65fadf1c45a0916931bf95fd4fc5655f3661 Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Wed, 18 Feb 2026 04:04:50 -0800 Subject: [PATCH 3/3] Fix test_nvfp4_onnx_export.py Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- tests/_test_utils/import_helper.py | 2 +- tests/gpu/torch/quantization/test_nvfp4_onnx_export.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/_test_utils/import_helper.py b/tests/_test_utils/import_helper.py index c8c17f2f1..a1148480a 100644 --- a/tests/_test_utils/import_helper.py +++ b/tests/_test_utils/import_helper.py @@ -99,5 +99,5 @@ def skip_if_onnx_version_above_1_18(): if version.parse(installed_version) > version.parse(required_version): pytest.skip( - f"{package_name} version {installed_version} is less than required {required_version}" + f"{package_name} version {installed_version} is greater than required {required_version}" ) diff --git a/tests/gpu/torch/quantization/test_nvfp4_onnx_export.py b/tests/gpu/torch/quantization/test_nvfp4_onnx_export.py index c5a792209..8ffdf3829 100644 --- a/tests/gpu/torch/quantization/test_nvfp4_onnx_export.py +++ b/tests/gpu/torch/quantization/test_nvfp4_onnx_export.py @@ -98,6 +98,7 @@ def forward_loop(model, run_backward=False): output_names=["output"], export_params=True, opset_version=17, + dynamo=False, ) onnx_model = NVFP4QuantExporter.process_model(onnx.load(onnx_path))
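
For reference outside pytest, below is a minimal standalone sketch of the same native-library load check that the updated skip_if_no_tensorrt helper in tests/_test_utils/import_helper.py performs. The capi/ directory layout and the Linux-only libonnxruntime_providers_tensorrt.so filename mirror the helper above; treating them as fixed for other platforms or onnxruntime packagings is an assumption.

    import ctypes
    import os


    def tensorrt_ep_loadable() -> bool:
        """Return True if ORT's TensorRT EP shared library can actually be dlopen'ed.

        The onnxruntime (and tensorrt) Python packages may import fine while the
        provider library still fails to load, e.g. an ORT build targeting CUDA 12
        running on a CUDA 13 system.
        """
        try:
            import onnxruntime
        except ImportError:
            return False

        # Provider libraries ship next to the C API bindings in the capi/ directory.
        lib_path = os.path.join(
            os.path.dirname(onnxruntime.__file__),
            "capi",
            "libonnxruntime_providers_tensorrt.so",  # Linux name; other layouts are assumptions
        )
        if not os.path.isfile(lib_path):
            # This onnxruntime build does not ship the TensorRT EP at all.
            return False
        try:
            ctypes.CDLL(lib_path)
            return True
        except OSError:
            # Typically unresolved CUDA/TensorRT symbols or a runtime version mismatch.
            return False

A caller could gate TensorRT-EP-dependent work on tensorrt_ep_loadable() instead of only checking that the tensorrt package imports, which is exactly the failure mode the patched skip helper guards against in CI.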