16 changes: 8 additions & 8 deletions .github/workflows/example_tests.yml
@@ -66,11 +66,11 @@ jobs:
example: [llm_distill, llm_qat, llm_sparsity]
include:
- example: speculative_decoding
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
docker_image: "26.01"
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-l4-latest-1
@@ -83,11 +83,11 @@ jobs:
example: [llm_distill, llm_qat, llm_sparsity]
include:
- example: speculative_decoding
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
docker_image: "26.01"
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-h100-latest-2
@@ -103,7 +103,7 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4"
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-h100-latest-1
@@ -117,7 +117,7 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4"
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-h100-latest-2
@@ -133,7 +133,7 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt:25.08-py3"
docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3"
example: ${{ matrix.example }}
pip_install_extras: "[all,dev-test]"
runner: linux-amd64-gpu-l4-latest-1
@@ -147,7 +147,7 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt:25.08-py3"
docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3"
example: ${{ matrix.example }}
pip_install_extras: "[all,dev-test]"
runner: linux-amd64-gpu-l4-latest-1
10 changes: 5 additions & 5 deletions .github/workflows/gpu_tests.yml
@@ -63,14 +63,14 @@ jobs:
fail-fast: false
matrix:
include:
- example: py312-cuda12-gpu
- example: cuda13-gpu
timeout: 90
- example: py312-cuda12-gpu-megatron
- example: cuda13-gpu-megatron
timeout: 120
runs-on: linux-amd64-gpu-l4-latest-1
timeout-minutes: ${{ matrix.timeout }}
container: &gpu_container
image: nvcr.io/nvidia/pytorch:25.06-py3
image: nvcr.io/nvidia/pytorch:26.01-py3
env:
GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
@@ -89,9 +89,9 @@ jobs:
fail-fast: false
matrix:
include:
- example: py312-cuda12-gpu
- example: cuda13-gpu
timeout: 90
- example: py312-cuda12-gpu-megatron
- example: cuda13-gpu-megatron
timeout: 120
runs-on: linux-amd64-gpu-h100-latest-2
timeout-minutes: ${{ matrix.timeout }}
9 changes: 5 additions & 4 deletions .github/workflows/unit_tests.yml
@@ -37,7 +37,7 @@ jobs:
- uses: actions/checkout@v6
- uses: ./.github/actions/ubuntu-setup
- name: Run unit tests
run: pip install tox && COV_ARGS="--cov" tox -e py312-torch29-tf_latest-unit
run: pip install tox && COV_ARGS="--cov" tox -e py312-torch210-tf_latest-unit
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v5
with:
@@ -55,6 +55,7 @@ jobs:
with:
python-version: "3.12"
- name: Run unit tests (without coverage)
# Some issues with torch 2.10 on Windows, so using 2.9 for now
run: pip install tox && tox -e py312-torch29-tf_latest-unit
multi-py:
if: github.event_name == 'pull_request'
@@ -70,15 +71,15 @@
with:
python-version: "3.${{ matrix.py }}"
- name: Run unit tests
run: pip install tox && tox -e py3${{ matrix.py }}-torch29-tf_latest-unit
run: pip install tox && tox -e py3${{ matrix.py }}-torch210-tf_latest-unit
multi-torch:
if: github.event_name == 'pull_request'
needs: [linux]
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
matrix:
torch: [26, 27, 28]
torch: [26, 27, 28, 29]
steps:
- uses: actions/checkout@v6
- uses: ./.github/actions/ubuntu-setup
@@ -96,7 +97,7 @@ jobs:
- uses: actions/checkout@v6
- uses: ./.github/actions/ubuntu-setup
- name: Run unit tests
run: pip install tox && tox -e py312-torch29-tf_${{ matrix.tf }}-unit
run: pip install tox && tox -e py312-torch210-tf_${{ matrix.tf }}-unit
partial-install:
if: github.event_name == 'pull_request'
needs: [linux]
8 changes: 6 additions & 2 deletions docs/source/getting_started/_installation_for_Linux.rst
@@ -14,11 +14,11 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system
+-------------------------+-----------------------------+
| Python | >=3.10,<3.13 |
+-------------------------+-----------------------------+
| CUDA | >=12.0 |
| CUDA | 12.x, 13.x |
+-------------------------+-----------------------------+
| PyTorch | >=2.6 |
+-------------------------+-----------------------------+
| TensorRT-LLM (Optional) | 1.2.0rc4 |
| TensorRT-LLM (Optional) | >=1.0 |
+-------------------------+-----------------------------+
| ONNX Runtime (Optional) | 1.22 |
+-------------------------+-----------------------------+
@@ -126,6 +126,10 @@ Additionally, we support installing dependencies for following 3rd-party package
* - Huggingface (``transformers``, ``diffusers``, etc.)
- ``[hf]``

**CUDA specific dependencies**

* By default, ``cupy-cuda12x`` is installed for INT4 ONNX quantization. If you have CUDA 13, you need to run ``pip uninstall -y cupy-cuda12x`` and ``pip install cupy-cuda13x`` after installing ``nvidia-modelopt[onnx]``.

**Accelerated Quantization with Triton Kernels**

ModelOpt includes optimized quantization kernels implemented with Triton language that accelerate quantization
48 changes: 48 additions & 0 deletions modelopt/torch/quantization/plugins/transformers_trainer.py
Collaborator (Author):
@realAsma please help review this cursor-generated fix for passing the llm_qat test on PyTorch 2.10 (pytorch:26.01-py3 container).

Failing test without this fix: https://github.com/NVIDIA/Model-Optimizer/actions/runs/22007571666/job/63595325923?pr=891

[rank0]: RuntimeError: attempting to assign a gradient with dtype 'c10::BFloat16' to a tensor with grad_dtype 'Float'. The gradient must match the tensor's grad_dtype (defaults to the tensor's dtype). You can set the tensor's grad_dtype attribute with a specific dtype, or None to allow any dtype. Set grad_dtype with caution. Diverging the dtypes of a tensor and its gradient may break downstream systems that assume they match.
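
A minimal sketch of the failing assignment and of the workaround the patch below applies, assuming a PyTorch build that enforces the grad_dtype check (e.g. the 2.10 build in the 26.01 container); the tensor here is a toy stand-in for the FSDP2 sharded parameter:

import torch

# fp32 "master" parameter, as produced when FSDP2 mixed precision upcasts bf16 params
p = torch.nn.Parameter(torch.zeros(4, dtype=torch.float32))

try:
    # bf16 gradient coming out of the bf16 reduce-scatter
    p.grad = torch.zeros(4, dtype=torch.bfloat16)
except RuntimeError as e:
    print(e)  # "attempting to assign a gradient with dtype 'c10::BFloat16' ..."

# Workaround mirrored by _patch_fsdp2_post_backward:
p.grad_dtype = None                            # relax the check so the bf16 grad can be assigned
p.grad = torch.zeros(4, dtype=torch.bfloat16)  # assignment now succeeds
p.grad = p.grad.to(p.dtype)                    # cast back so the fused optimizer sees matching dtypes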

Contributor:
This error does not seem to be related to Model Optimizer. We should probably raise this with PyTorch or Huggingface. Can we add a note that this is a temporary fix and should be removed in the future?

Collaborator (Author):
Thanks Asma for the review. Can you please raise an issue with torch / transformers with the required details? I will add the issue link here for tracking.

@@ -15,6 +15,7 @@

"""ModelOpt plugin for transformers Trainer."""

import contextlib
import gc
import json
import os
@@ -100,6 +101,52 @@ class QuantizationArgumentsWithConfig(QuantizationArguments):
)


def _patch_fsdp2_post_backward():
"""Patch FSDP2 ``post_backward`` to handle mixed-precision gradient dtypes.

FSDP2 with bf16 mixed precision upcasts bf16 parameters to fp32 for optimizer
precision, while gradients are reduced in bf16. In PyTorch >= 2.6, assigning a
bf16 gradient to a fp32 parameter raises a ``RuntimeError`` due to the
``grad_dtype`` check, and the fused Adam optimizer also rejects mixed dtypes.

This patch wraps ``FSDPParamGroup.post_backward`` to:
1. Set ``grad_dtype=None`` on sharded params before reduction (allowing bf16 assignment).
2. Cast gradients to match parameter dtype after reduction (so the optimizer sees matching dtypes).

.. note::
This is a workaround. The proper fix should come from PyTorch's FSDP2
``foreach_reduce`` (which should cast gradients to match the parameter dtype)
or from accelerate (which should set ``grad_dtype`` when it upcasts params).
Remove this once the upstream fix is available.
"""
try:
from torch.distributed.fsdp._fully_shard._fsdp_param_group import FSDPParamGroup
except ImportError:
return

if hasattr(FSDPParamGroup, "_modelopt_original_post_backward"):
return # Already patched

FSDPParamGroup._modelopt_original_post_backward = FSDPParamGroup.post_backward

@torch.no_grad()
def _patched_post_backward(self):
# Allow bf16 gradients to be assigned to fp32 parameters
for fsdp_param in self.fsdp_params:
with contextlib.suppress(AttributeError):
fsdp_param.sharded_param.grad_dtype = None

self._modelopt_original_post_backward()

# Cast gradients to parameter dtype so the optimizer sees matching dtypes
for fsdp_param in self.fsdp_params:
sp = fsdp_param.sharded_param
if sp.grad is not None and sp.grad.dtype != sp.dtype:
sp.grad = sp.grad.to(sp.dtype)

FSDPParamGroup.post_backward = _patched_post_backward


def check_awq_smoothquant(quant_cfg):
# TODO: Remove this once deepspeed for AWQ and SmoothQuant is added
"""Get the quantization type from the configuration."""
@@ -337,6 +384,7 @@ def _patch_accelerate_for_fsdp2_fix(self):
is causing issues with quantized models since quantization modules adds buffers which are not sharded.
This patch hides the buffers added by quantization modules from the original accelerate prepare.
"""
_patch_fsdp2_post_backward()

def _modelopt_prepare(self, *args, **kwargs):
if not self.is_fsdp2:
35 changes: 23 additions & 12 deletions tests/_test_utils/import_helper.py
@@ -12,8 +12,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ctypes
import importlib.metadata
import os
import shutil

import pytest
@@ -28,6 +29,23 @@ def skip_if_no_tensorrt():
except (AssertionError, ImportError) as e:
pytest.skip(f"{e}", allow_module_level=True)

# Also verify that ORT's TensorRT EP can actually load its native library.
# The tensorrt Python package may be installed, but ORT's provider shared library
# (libonnxruntime_providers_tensorrt.so) could fail to load due to CUDA version
# mismatches (e.g., ORT built for CUDA 12 running on a CUDA 13 system).
try:
import onnxruntime

ort_capi_dir = os.path.join(os.path.dirname(onnxruntime.__file__), "capi")
trt_provider_lib = os.path.join(ort_capi_dir, "libonnxruntime_providers_tensorrt.so")
if os.path.isfile(trt_provider_lib):
ctypes.CDLL(trt_provider_lib)
except OSError as e:
pytest.skip(
f"ORT TensorRT EP native library cannot be loaded: {e}",
allow_module_level=True,
)


def skip_if_no_trtexec():
if not shutil.which("trtexec"):
@@ -43,19 +61,12 @@ def skip_if_no_libcudnn():
pytest.skip(f"{e}!", allow_module_level=True)


def skip_if_no_megatron(apex_or_te_required: bool = False, mamba_required: bool = False):
def skip_if_no_megatron(*, te_required: bool = True, mamba_required: bool = False):
try:
import megatron # noqa: F401
except ImportError:
pytest.skip("megatron not available", allow_module_level=True)

try:
import apex # noqa: F401

has_apex = True
except ImportError:
has_apex = False

try:
import transformer_engine # noqa: F401

@@ -70,8 +81,8 @@ def skip_if_no_megatron(apex_or_te_required: bool = False, mamba_required: bool
except ImportError:
has_mamba = False

if apex_or_te_required and not has_apex and not has_te:
pytest.skip("Apex or TE required for Megatron test", allow_module_level=True)
if te_required and not has_te:
pytest.skip("TE required for Megatron test", allow_module_level=True)

if mamba_required and not has_mamba:
pytest.skip("Mamba required for Megatron test", allow_module_level=True)
@@ -88,5 +99,5 @@ def skip_if_onnx_version_above_1_18():

if version.parse(installed_version) > version.parse(required_version):
pytest.skip(
f"{package_name} version {installed_version} is less than required {required_version}"
f"{package_name} version {installed_version} is greater than required {required_version}"
)
1 change: 1 addition & 0 deletions tests/gpu/torch/quantization/test_nvfp4_onnx_export.py
@@ -98,6 +98,7 @@ def forward_loop(model, run_backward=False):
output_names=["output"],
export_params=True,
opset_version=17,
dynamo=False,
)

onnx_model = NVFP4QuantExporter.process_model(onnx.load(onnx_path))
@@ -16,10 +16,6 @@
from functools import partial

import torch
from _test_utils.import_helper import skip_if_no_megatron

skip_if_no_megatron(apex_or_te_required=True)

from _test_utils.torch.distributed.utils import spawn_multiprocess_job
from _test_utils.torch.megatron.models import get_mcore_gpt_model
from _test_utils.torch.megatron.utils import run_mcore_inference_with_dummy_input
@@ -21,14 +21,11 @@
import pytest
import torch
import transformers
from _test_utils.import_helper import skip_if_no_megatron
from _test_utils.torch.distributed.utils import spawn_multiprocess_job
from _test_utils.torch.megatron.models import get_mcore_gpt_model
from _test_utils.torch.megatron.utils import get_forward
from _test_utils.torch.transformers_models import create_tiny_llama_dir

skip_if_no_megatron(apex_or_te_required=True)

import modelopt.torch.quantization as mtq
import modelopt.torch.speculative as mtsp
from modelopt.torch.export import KV_CACHE_FP8, export_mcore_gpt_to_hf, import_mcore_gpt_from_hf
@@ -18,15 +18,12 @@

import pytest
import torch
from _test_utils.import_helper import skip_if_no_megatron
from _test_utils.torch.distributed.utils import spawn_multiprocess_job
from _test_utils.torch.megatron.models import get_mcore_gpt_model

import modelopt.torch.quantization as mtq
from modelopt.torch.export import export_mcore_gpt_to_hf_vllm_fq

skip_if_no_megatron(apex_or_te_required=True)


def _test_mcore_vllm_export(tmp_path, quant_cfg, rank, size):
"""Test megatron-core model export for vLLM with fake quantization."""
@@ -17,10 +17,6 @@

import pytest
import torch
from _test_utils.import_helper import skip_if_no_megatron

skip_if_no_megatron(apex_or_te_required=True)

from _test_utils.torch.distributed.utils import spawn_multiprocess_job
from _test_utils.torch.megatron.models import get_mcore_gpt_model
from _test_utils.torch.megatron.utils import run_mcore_inference
@@ -17,7 +17,7 @@
import torch
from _test_utils.import_helper import skip_if_no_megatron

skip_if_no_megatron(apex_or_te_required=True, mamba_required=True)
skip_if_no_megatron(mamba_required=True)

from _test_utils.torch.distributed.utils import spawn_multiprocess_job
from _test_utils.torch.megatron.models import get_mcore_mamba_hybrid_model