Add support for Qwen3Omni30B thinking model #856
Open

ajrasane wants to merge 7 commits into main from ajrasane/qwen3omni_final
Changes from all commits (7 commits):
f0519b1 Add support for Qwen3Omni30B thinking model (ajrasane)
dfedafa Optimize calibration for text data (ajrasane)
e287c0b Refactor model specific code to example_utils (ajrasane)
1221e4f Update hf configs for vLLM deployment (ajrasane)
1b4440b Create a script to run vllm inference (ajrasane)
2a0ff6f Add an option to supply host as an argument (ajrasane)
8740db3 Add video dataset utils (ajrasane)
@@ -16,6 +16,7 @@
import copy
import glob
import inspect
import json
import os
import shutil
import sys
@@ -42,10 +43,135 @@
    snapshot_download = None

import modelopt.torch.quantization as mtq
from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor
from modelopt.torch.export.model_utils import MODEL_NAME_TO_TYPE
from modelopt.torch.utils.dataset_utils import get_dataset_dataloader
from modelopt.torch.utils.image_processor import (
    BaseImageProcessor,
    MllamaImageProcessor,
    Qwen3OmniImageProcessor,
)
from modelopt.torch.utils.video_dataset_utils import (
    Qwen3OmniVideoProcessor,
    get_supported_video_datasets,
    get_video_dataset_dataloader,
)
from modelopt.torch.utils.vlm_dataset_utils import (
    get_supported_vlm_datasets,
    get_vlm_dataset_dataloader,
)

SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]

# Files needed for tokenizer/processor that vLLM loads from model path
TOKENIZER_FILES = [
    "vocab.json",
    "merges.txt",
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "preprocessor_config.json",
    "chat_template.json",
]
def get_model_type_from_config(model_path: str) -> str | None:
    """Get model type from the config.json file.

    Args:
        model_path: Path to the model directory or HuggingFace model ID.

    Returns:
        Model type string (e.g., 'qwen3omni', 'llama', 'gpt') or None if not found.
    """
    config_path = os.path.join(model_path, "config.json")
    if not os.path.exists(config_path):
        return None

    with open(config_path) as f:
        config = json.load(f)

    # Check architectures field first
    architectures = config.get("architectures", [])
    for arch in architectures:
        for key, model_type in MODEL_NAME_TO_TYPE.items():
            if key.lower() in arch.lower():
                return model_type

    # Fallback to model_type field
    model_type_field = config.get("model_type", "")
    for key, model_type in MODEL_NAME_TO_TYPE.items():
        if key.lower() in model_type_field.lower():
            return model_type

    return None
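For context, a quick sketch of how this lookup behaves; the config values and the assumption that MODEL_NAME_TO_TYPE contains a "qwen3omni" key are illustrative, not taken from this PR:

```python
# Illustrative only. Assuming MODEL_NAME_TO_TYPE maps "qwen3omni" -> "qwen3omni",
# a config.json containing
#   {"architectures": ["Qwen3OmniMoeForConditionalGeneration"], "model_type": "qwen3_omni_moe"}
# matches on the architectures pass (case-insensitive substring) and returns "qwen3omni".
model_type = get_model_type_from_config("/path/to/exported/model")
print(model_type or "unknown (caller falls back to other detection)")
```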
def get_sampling_params_from_config(model_path: str) -> dict:
    """Extract sampling params from generation_config.json if present."""
    gen_config_path = Path(model_path) / "generation_config.json"
    if not gen_config_path.exists():
        return {}

    gen_config = json.loads(gen_config_path.read_text())

    params = {k: gen_config[k] for k in ("temperature", "top_p", "top_k") if k in gen_config}

    for key in ("max_new_tokens", "max_length"):
        if key in gen_config:
            params["max_tokens"] = gen_config[key]
            break

    return params
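The returned dict maps directly onto vLLM sampling options. A minimal sketch of how the new vLLM inference script might consume it (the path and config values are placeholders; SamplingParams is the standard vLLM class):

```python
from vllm import SamplingParams

# Placeholder path; with a generation_config.json like
#   {"temperature": 0.6, "top_p": 0.95, "top_k": 20, "max_new_tokens": 16384}
# this yields SamplingParams(temperature=0.6, top_p=0.95, top_k=20, max_tokens=16384).
sampling = SamplingParams(**get_sampling_params_from_config("exports/qwen3omni-30b-thinking-fp4"))
```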
def get_quantization_format(model_path: str) -> str | None:
    """Get quantization format from the model config.

    Args:
        model_path: Path to the model directory.

    Returns:
        vLLM quantization string ('modelopt', 'modelopt_fp4') or None if not quantized.
    """
    hf_quant_config_path = os.path.join(model_path, "hf_quant_config.json")
    if os.path.exists(hf_quant_config_path):
        with open(hf_quant_config_path) as f:
            quant_config = json.load(f)
        quant_algo = quant_config.get("quantization", {}).get("quant_algo", "")
        if "NVFP4" in quant_algo:
            return "modelopt_fp4"

    return None
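For reference, a sketch of the hf_quant_config.json shape this function reads and how its result might feed vLLM; field values and paths are illustrative, while LLM(quantization=...) is the standard vLLM argument:

```python
from vllm import LLM

# Illustrative hf_quant_config.json written by the ModelOpt export step:
#   {"quantization": {"quant_algo": "NVFP4", "exclude_modules": ["lm_head"]}}
# -> get_quantization_format(...) returns "modelopt_fp4".
model_path = "exports/qwen3omni-30b-thinking-fp4"  # placeholder
llm = LLM(model=model_path, quantization=get_quantization_format(model_path), trust_remote_code=True)
```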
def ensure_tokenizer_files(model_path: str, source_model_id: str) -> None:
    """Copy tokenizer files from HF model to local quantized model dir if missing."""
    if not os.path.isdir(model_path):
        return  # Not a local path, nothing to do

    # Check if tokenizer files are missing
    missing_files = [f for f in TOKENIZER_FILES if not os.path.exists(os.path.join(model_path, f))]
    if not missing_files:
        return

    if snapshot_download is None:
        print("Warning: huggingface_hub not installed, cannot download tokenizer files")
        return

    print(f"Copying missing tokenizer files from {source_model_id}...")
    # Download only tokenizer files from HF
    cache_dir = snapshot_download(
        source_model_id,
        allow_patterns=TOKENIZER_FILES,
    )

    for fname in TOKENIZER_FILES:
        src = os.path.join(cache_dir, fname)
        dst = os.path.join(model_path, fname)
        if os.path.exists(src) and not os.path.exists(dst):
            shutil.copy2(src, dst)
            print(f"  Copied {fname}")
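A sketch of how the deployment script might use this before starting vLLM on a locally exported checkpoint; the directory and source model ID below are placeholders:

```python
# Placeholder paths/IDs for illustration only.
quantized_dir = "exports/qwen3omni-30b-thinking-fp4"
ensure_tokenizer_files(quantized_dir, "Qwen/Qwen3-Omni-30B-A3B-Thinking")
# After this, vLLM can load tokenizer/processor assets directly from quantized_dir.
```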
def run_nemotron_vl_preview(
    full_model, tokenizer, input_ids, pyt_ckpt_path, stage_name, allow_fallback=False
@@ -240,9 +366,33 @@ def build_quant_cfg(
        quant_cfg["quant_cfg"]["*self_attn.q*"] = {"enable": False}
        quant_cfg["quant_cfg"]["*self_attn.kv*"] = {"enable": False}

    if model_type == "qwen3omni":
        print(
            "Disabling quantization for conv layers, audio tower and visual encoder in Qwen3Omni model"
        )
        quant_cfg["quant_cfg"]["*conv*"] = {"enable": False}
        quant_cfg["quant_cfg"]["*audio_tower*"] = {"enable": False}
        quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}

    return quant_cfg
def get_generation_kwargs(model_type: str) -> dict[str, Any]:
    """Get model-specific generation kwargs for calibration.

    Args:
        model_type: The model type string.

    Returns:
        Dictionary of generation kwargs for the model.
    """
    generation_kwargs = {}
    if model_type == "qwen3omni":
        generation_kwargs["return_audio"] = False
        generation_kwargs["thinker_max_new_tokens"] = 1
    return generation_kwargs
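Putting build_quant_cfg and get_generation_kwargs together, calibration looks roughly like the sketch below; the starting config and calib_dataloader are assumptions, while mtq.quantize and NVFP4_DEFAULT_CFG are the standard ModelOpt API:

```python
import modelopt.torch.quantization as mtq

# Sketch only: arguments elided; the config could start from mtq.NVFP4_DEFAULT_CFG
# and then get the Qwen3Omni exclusions added above.
quant_cfg = build_quant_cfg(...)
gen_kwargs = get_generation_kwargs("qwen3omni")  # {"return_audio": False, "thinker_max_new_tokens": 1}

def forward_loop(model):
    # calib_dataloader is assumed to come from get_qwen3omni_dataloader (defined later in this diff).
    for batch in calib_dataloader:
        # One thinker token per sample and no audio decoding keeps calibration cheap.
        model.generate(**batch, **gen_kwargs)

model = mtq.quantize(model, quant_cfg, forward_loop)
```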
def is_speculative(hf_config):
    """Check if the model architecture is a speculative model."""
    return hf_config.architectures and any(
@@ -284,7 +434,7 @@ def get_processor(
    if attn_implementation is not None:
        model_kwargs["attn_implementation"] = attn_implementation

    if model_type == "whisper":
    if model_type in ("whisper", "mllama", "qwen3omni"):
        processor = AutoProcessor.from_pretrained(
            ckpt_path,
            padding_side="left",

@@ -296,20 +446,11 @@ def get_processor(
            f"Pad token for {ckpt_path} cannot be set!"
        )

        if model_type == "mllama":
            return MllamaImageProcessor(processor, device)
        elif model_type == "qwen3omni":
            return Qwen3OmniImageProcessor(processor, device)
        return processor
    elif model_type == "mllama":
        processor = AutoProcessor.from_pretrained(
            ckpt_path,
            padding_side="left",
            **model_kwargs,
        )
        if processor.tokenizer.pad_token is None:
            processor.tokenizer.pad_token = processor.tokenizer.eos_token
        assert processor.tokenizer.pad_token is not None, (
            f"Pad token for {ckpt_path} cannot be set!"
        )

        return MllamaImageProcessor(processor, device)

    return None
@@ -622,3 +763,131 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod
        print(f"Successfully copied {len(copied_files)} custom model files to {export_path}")
    else:
        print("No custom model files found to copy")
def patch_config_for_unified_export(model_type: str, export_path: str) -> None:

(Collaborator review comment on this line: @Edwardf0t1 could you review this part?)
| """Patch config files to add missing exclusion patterns for unified HF export. | ||
|
|
||
| This function adds missing exclusion patterns for modules that should not be quantized | ||
| (e.g., audio tower, visual encoder, lm_head) to both hf_quant_config.json and config.json. | ||
|
|
||
| Args: | ||
| export_path: Path to the exported model directory. | ||
| """ | ||
| if model_type == "qwen3omni": | ||
| missing_patterns = [ | ||
| "thinker.audio_tower*", | ||
| "thinker.visual*", | ||
| "thinker.lm_head", | ||
| ] | ||
|
|
||
| # (filename, path_to_exclude_list) | ||
| configs = [ | ||
| ("hf_quant_config.json", ["quantization", "exclude_modules"]), | ||
| ("config.json", ["quantization_config", "ignore"]), | ||
| ] | ||
|
|
||
| for filename, keys in configs: | ||
| filepath = os.path.join(export_path, filename) | ||
| if not os.path.exists(filepath): | ||
| continue | ||
| try: | ||
| with open(filepath) as f: | ||
| config = json.load(f) | ||
|
|
||
| # Navigate to nested key | ||
| target = config | ||
| for key in keys[:-1]: | ||
| target = target.get(key, {}) | ||
|
|
||
| exclude_list = target.get(keys[-1]) | ||
| if exclude_list is None: | ||
| continue | ||
|
|
||
| added = [p for p in missing_patterns if p not in exclude_list] | ||
| if added: | ||
| exclude_list.extend(added) | ||
| with open(filepath, "w") as f: | ||
| json.dump(config, f, indent=2) | ||
| print(f"Patched {filename} with exclusions: {added}") | ||
| except Exception as e: | ||
| print(f"Warning: Failed to patch {filename}: {e}") | ||
|
|
||
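For reference, a sketch of what the relevant section of hf_quant_config.json ends up looking like after patching a Qwen3Omni export; everything except the three appended patterns is illustrative:

```python
# Illustrative post-patch content (not a real export).
patched_hf_quant_config = {
    "quantization": {
        "quant_algo": "NVFP4",
        "exclude_modules": [
            "lm_head",                 # whatever the export step already listed
            "thinker.audio_tower*",    # appended by patch_config_for_unified_export
            "thinker.visual*",
            "thinker.lm_head",
        ],
    }
}
# config.json gets the same three patterns appended under quantization_config["ignore"].
```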
def get_qwen3omni_dataloader(
    dataset_name: str | list[str] | None,
    processor: Qwen3OmniImageProcessor | None,
    tokenizer,
    batch_size: int,
    num_samples: int | list[int],
    device: torch.device,
    model_dtype: torch.dtype,
    include_labels: bool = False,
):
    """Create a calibration dataloader for Qwen3Omni models.

    Handles video, VLM, and text-only dataset configurations.

    Args:
        dataset_name: Name of the dataset(s) to use for calibration.
        processor: The Qwen3OmniImageProcessor for multimodal inputs.
        tokenizer: The tokenizer for text-only fallback.
        batch_size: Batch size for the dataloader.
        num_samples: Number of samples to use (int or list for multi-dataset).
        device: Target device for tensors.
        model_dtype: Model dtype for proper tensor conversion.
        include_labels: Whether to include labels (for gradient-based auto_quantize).

    Returns:
        DataLoader for calibration.
    """
    if dataset_name is None:
        dataset_name = ["cnn_dailymail", "nemotron-post-training-dataset-v2"]
        num_samples = [512, 512]

    if processor is not None:
        if dataset_name in get_supported_video_datasets():
            assert isinstance(dataset_name, str)
            video_processor = Qwen3OmniVideoProcessor(
                processor.tokenizer if hasattr(processor, "tokenizer") else processor,
                device=device,
                dtype=model_dtype,
                use_audio_in_video=True,
            )
            calib_dataloader = get_video_dataset_dataloader(
                dataset_name=dataset_name,
                processor=video_processor,
                batch_size=batch_size,
                num_samples=num_samples if isinstance(num_samples, int) else num_samples[0],
            )
        elif dataset_name in get_supported_vlm_datasets():
            assert isinstance(dataset_name, str)
            assert isinstance(processor, Qwen3OmniImageProcessor), (
                "The Qwen3OmniImageProcessor must be set."
            )
            # Set the dtype for proper tensor conversion in collate_function
            processor.dtype = model_dtype
            calib_dataloader = get_vlm_dataset_dataloader(
                dataset_name=dataset_name,
                processor=processor,
                batch_size=batch_size,
                num_samples=num_samples if isinstance(num_samples, int) else num_samples[0],
            )
        else:
            raise ValueError(
                f"Dataset '{dataset_name}' not supported for Qwen3Omni with processor. "
                f"Supported video datasets: {get_supported_video_datasets()}, "
                f"Supported VLM datasets: {get_supported_vlm_datasets()}"
            )
    else:
        # Text-only fallback
        calib_dataloader = get_dataset_dataloader(
            dataset_name=dataset_name if isinstance(dataset_name, list) else [dataset_name],
            tokenizer=tokenizer,
            batch_size=batch_size,
            num_samples=num_samples if isinstance(num_samples, list) else [num_samples],
            device=device,
            include_labels=include_labels,
        )

    return calib_dataloader
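A sketch of how the quantization script might build the calibration loader; the dataset name and sample count are placeholders for values accepted by get_supported_vlm_datasets() / get_supported_video_datasets():

```python
# Illustrative call; "scienceqa" stands in for any supported VLM dataset name.
calib_dataloader = get_qwen3omni_dataloader(
    dataset_name="scienceqa",
    processor=processor,          # Qwen3OmniImageProcessor returned by get_processor(...)
    tokenizer=tokenizer,
    batch_size=1,
    num_samples=64,
    device=model.device,
    model_dtype=model.dtype,
)
# The loader then drives the forward_loop passed to mtq.quantize (see the sketch above).
```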
(Review comment: why do we need this?)