From 2727a61b0fefa81950edb48e05f16fa7987b80d0 Mon Sep 17 00:00:00 2001
From: Zhewen Li
Date: Wed, 11 Feb 2026 03:10:08 +0000
Subject: [PATCH] update

Signed-off-by: Zhewen Li
---
 examples/llm_ptq/.gitignore                |     1 +
 examples/llm_ptq/hf_ptq.py                 |    14 +-
 examples/llm_ptq/ptq.log                   | 57764 +++++++++++++++++++
 modelopt/torch/quantization/model_calib.py |     3 +
 modelopt/torch/quantization/utils.py       |    11 +
 modelopt/torch/utils/dataset_utils.py      |     9 +-
 6 files changed, 57799 insertions(+), 3 deletions(-)
 create mode 100644 examples/llm_ptq/ptq.log

diff --git a/examples/llm_ptq/.gitignore b/examples/llm_ptq/.gitignore
index 941b45e52..1a5ee38bd 100644
--- a/examples/llm_ptq/.gitignore
+++ b/examples/llm_ptq/.gitignore
@@ -1,3 +1,4 @@
 saved_models_*
 *.model
 model_config.yaml
+qwen3_omni_30b_nvfp4/
\ No newline at end of file
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 4f3a8af28..ea8eb0f82 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -766,7 +766,11 @@ def pre_quantize(
     elif model_type == "qwen3omni":
         # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences
         # Pass full batch with all multimodal inputs
-        result = full_model.generate(**calib_batch, max_new_tokens=100)
+        print("[DEBUG] pre_quantize: starting qwen3omni preview generation (max_new_tokens=100)...", flush=True)
+        result = full_model.generate(
+            **calib_batch, max_new_tokens=100, thinker_max_new_tokens=100
+        )
+        print("[DEBUG] pre_quantize: preview generation complete", flush=True)
         if isinstance(result, tuple):
             text_ids, _ = result
             generated_ids_before_ptq = (
@@ -827,7 +831,11 @@ def post_quantize(
     elif model_type == "qwen3omni":
         # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences
         # Pass full batch with all multimodal inputs
-        result = full_model.generate(**calib_batch, max_new_tokens=100)
+        # Note: thinker_max_new_tokens controls the thinker's generation limit (default 1024),
+        # which is separate from max_new_tokens. Cap it to avoid long waits.
+        result = full_model.generate(
+            **calib_batch, max_new_tokens=100, thinker_max_new_tokens=100
+        )
         if isinstance(result, tuple):
             text_ids, _ = result
             generated_ids_after_ptq = (
@@ -958,9 +966,11 @@ def quantize_main(
     # Detect if this is a Nemotron VL model using architecture-based detection
     is_nemotron_vl_model = is_nemotron_vl(full_model)

+    print("[DEBUG] quantize_main: calling pre_quantize...", flush=True)
     preview_input_ids, generated_ids_before_ptq, calib_batch = pre_quantize(
         args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model
     )
+    print("[DEBUG] quantize_main: pre_quantize done, proceeding to quantization", flush=True)

     if args.auto_quantize_bits:
         assert len(args.qformat.split(",")) > 1, (
diff --git a/examples/llm_ptq/ptq.log b/examples/llm_ptq/ptq.log
new file mode 100644
index 000000000..03cd26063
--- /dev/null
+++ b/examples/llm_ptq/ptq.log
@@ -0,0 +1,57764 @@
+Skipping import of cpp extensions due to incompatible torch version 2.10.0+cu130 for torchao version 0.14.1 Please see https://github.com/pytorch/ao/issues/2919 for more info
+Multiple distributions found for package modelopt. Picked distribution: nvidia-modelopt
+ModelOpt save/restore enabled for `transformers` library.
+ModelOpt save/restore enabled for `diffusers` library.
+ModelOpt save/restore enabled for `peft` library.
+Initializing model from Qwen/Qwen3-Omni-30B-A3B-Thinking
+Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section', 'interleaved', 'mrope_interleaved'}
+`torch_dtype` is deprecated! Use `dtype` instead!
+`torch_dtype` is deprecated! Use `dtype` instead!
+You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour
+Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section', 'interleaved', 'mrope_interleaved'}
+ Loading checkpoint shards: 0%| | 0/16 [00:00 to _QuantAttention for KV Cache quantization
+Registered to _QuantAttention for KV Cache quantization
+Registered to _QuantAttention for KV Cache quantization
+Registered to _QuantAttention for KV Cache quantization
+Inserted 57392 quantizers
+[DEBUG] max_calibrate: starting forward_loop
+ 0%| | 0/32 [00:00user\nLONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how he\'ll mark his landmark birthday are under wraps. His agent and publicist had no comment on his plans. "I\'ll definitely have some sort of party," he said in an interview. "Hopefully none of you will be reading about it." Radcliffe\'s earnings from the first five Potter films have been held in a trust fund which he has not been able to touch. Despite his growing fame and riches, the actor says he is keeping his feet firmly on the ground. "People are always looking to say \'kid star goes off the rails,\'" he told reporters last month. "But I try very hard not to go that way because it would be too easy for them." His latest outing as the boy wizard in "Harry Potter and the Order of the Phoenix" is breaking records on both sides of the Atlantic and he will reprise the role in the last two films. Watch I-Reporter give her review of Potter\'s latest » . There is life beyond Potter, however. The Londoner has filmed a TV movie called "My Boy Jack," about author Rudyard Kipling and his son, due for release later this year. He will also appear in "December Boys," an Australian film about four boys who escape an orphanage. Earlier this year, he made his stage debut playing a tortured teenager in Peter Shaffer\'s "Equus." Meanwhile, he is braced for even closer media scrutiny now that he\'s legally an adult: "I just think I\'m going to be more sort of fair game," he told Reuters. E-mail to a friend . Copyright 2007 Reuters. All rights reserved.This material may not be published, broadcast, rewritten, or redistributed.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n']
+--------
+example outputs before ptq: ["Hmm, the user has shared a news article about Daniel Radcliffe turning 18 and gaining access to his £20 million fortune. Let me analyze this carefully.\n\nFirst, I need to understand what the user is asking for. They've provided the full Reuters article but haven't specified a question. Since they're sharing this historical news piece (from 2007), they might be looking for either a summary, analysis, or perhaps context about Radcliffe's career trajectory. \n\nLooking"]
+--------
+example outputs after ptq: ["Hmm, the user has shared a news article about Daniel Radcliffe turning 18 and gaining access to his £20 million fortune. They haven't asked a specific question, so I need to figure out what they want. Maybe they're looking for a summary, analysis, or just sharing the article for discussion.\n\nFirst, I should read through the article carefully. It's from Reuters, dated 2007, so it's historical context. Key points: Radcliffe is turning "]
+Running optimization on Qwen3Omni thinker with fake_input shape: torch.Size([1, 2])
+[DEBUG] reduce_amax called 2150000 times, input shape=torch.Size([768, 128, 16]), device=cuda:0
+[DEBUG] reduce_amax called 2160000 times, input shape=torch.Size([2048, 48, 16]), device=cuda:1
+[DEBUG] reduce_amax called 2170000 times, input shape=torch.Size([768, 128, 16]), device=cuda:2
+[DEBUG] reduce_amax called 2180000 times, input shape=torch.Size([768, 128, 16]), device=cuda:3
+Saved ModelOpt state to qwen3_omni_30b_nvfp4/modelopt_state.pth
+Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section', 'interleaved', 'mrope_interleaved'}
+ Fetching 7 files: 0%| | 0/7 [00:00 None:
     infer_method = model.generate if use_generate else model.forward
     max_working_batch_size = None  # Initialize max working batch size as None
-    for _, data in enumerate(tqdm(dataloader)):
+    for idx, data in enumerate(tqdm(dataloader)):
+        print(f"[DEBUG] Starting calibration iteration {idx}", flush=True)
         # For generate(), add max_new_tokens to prevent indefinite generation during calibration
         if use_generate:
             data["max_new_tokens"] = 1
+            # For Qwen3-Omni Thinking models, the thinker's token limit is controlled by
+            # a separate `thinker_max_new_tokens` param (default 1024), not `max_new_tokens`.
+            # Cap it to avoid unbounded chain-of-thought generation during calibration.
+            if "qwen3omni" in model.__class__.__name__.lower():
+                data["thinker_max_new_tokens"] = 1

         # Process batch and update max working batch size
         max_working_batch_size = _process_batch(data, infer_method, max_working_batch_size)
+        print(f"[DEBUG] Finished calibration iteration {idx}", flush=True)


 def create_forward_loop(
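
The change this patch applies twice (in hf_ptq.py for the preview generations and in dataset_utils.py for the calibration loop) follows one pattern: Qwen3-Omni "Thinking" models route text generation through a thinker submodule with its own `thinker_max_new_tokens` budget (default 1024) that `max_new_tokens` does not cap, so both limits must be set or PTQ calibration can stall on long chain-of-thought output. Below is a minimal standalone sketch of that pattern; it is illustrative only and not part of the patch, and the helper name `capped_generate_kwargs` is hypothetical.

# Illustrative sketch only -- not part of the patch. The helper name is
# hypothetical; the class-name check and the two kwargs mirror the patch.
def capped_generate_kwargs(model, base_kwargs: dict, limit: int = 1) -> dict:
    """Return generate() kwargs with every token budget capped for calibration."""
    kwargs = dict(base_kwargs)
    # Standard Hugging Face limit on the top-level generate() call.
    kwargs["max_new_tokens"] = limit
    # Qwen3-Omni Thinking models generate text via a "thinker" submodule whose
    # budget is controlled separately (default 1024), so cap it as well.
    if "qwen3omni" in model.__class__.__name__.lower():
        kwargs["thinker_max_new_tokens"] = limit
    return kwargs

# Usage inside a calibration loop (dataloader yields dicts of model inputs):
#     for data in dataloader:
#         model.generate(**capped_generate_kwargs(model, data, limit=1))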