7 changes: 7 additions & 0 deletions examples/puzzletron/README.md
@@ -285,3 +285,10 @@ python -m nemo_export/convert_nemo_to_hf --input-ckpt-path path/to/nemo-model --
## Advanced Usage

Modify `llama-3_1-8B_pruneffn_memory.yaml` file for advanced compression scenarios.

## GPT-OSS-20B

Let's not put it at the same level as ## Advanced Usage, I would put it into a separate MD file (in the model descriptor dir) and link nicely in the main tutorial. Consult also with @LianaMikael how best to do it. We want the tutorial to have a great user experience.

Let's also check for English style/grammar. E.g. there should be no ',' after that, and there is likely a missing comma after 'In the prunning steps'.


With this release, the Puzzle algorithm supports only expert removal for GPT-OSS-20B. The model ships as a quantized checkpoint, i.e. the MoE expert matrices are stored in the mxfp4 format. In the pruning steps, Puzzle uses the decompressed model (back in bf16) to compute statistics and scores, so during the conversion to Puzzle format we decompress the model and store it in bf16. Once pruning is finished, i.e. the experts to remove have been identified, you may want to restore the mxfp4 format of the checkpoint. For that, an additional script takes the original and the pruned checkpoints and outputs the pruned checkpoint in mxfp4 format:
```bash
python gpt_oss_pack_mxfp4_vllm.py \
    --student-path /workspaces/any_model_gpt_oss_20b/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/solution_0/ \
    --original-path /workspaces/source_model_checkpoints/openai_gpt-oss-20b/ \
    --output-path /workspaces/any_model_gpt_oss_20b/mip/puzzle_solutions/stats_num_params_18014757184/solutions--checkpoints/mxfp4-ckpt/ \
    --deduce-experts \
    --num-layers 24
```

@kevalmorabia97 (Feb 13, 2026): We need to specify from which path to run this command. Alternatively, pls check if `python -m modelopt.torch.puzzletron.anymodel.models.gpt_oss_20b.gpt_oss_pruned_to_mxfp4 --student-path ...` works
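
For reference, the bf16 decompression performed during conversion to the Puzzle format relies on `convert_moe_packed_tensors` from `transformers.integrations.mxfp4`, which unpacks the paired `_blocks`/`_scales` expert tensors. A minimal sketch of that step, assuming a locally downloaded shard of the original checkpoint (the shard file name is illustrative):

```python
# Minimal sketch: dequantize one mxfp4-packed expert weight to bf16.
# The shard file name is illustrative; the tensor keys follow the gpt-oss-20b layout.
from safetensors.torch import load_file
from transformers.integrations.mxfp4 import convert_moe_packed_tensors

data = load_file("model-00000-of-00002.safetensors")  # one shard of the original gpt-oss-20b checkpoint

blocks = data["model.layers.0.mlp.experts.gate_up_proj_blocks"]
scales = data["model.layers.0.mlp.experts.gate_up_proj_scales"]

# Unpack the mxfp4 blocks/scales into a dense bf16 weight tensor; this is the form
# Puzzle stores in the decompressed checkpoint and uses for statistics and scoring.
gate_up_proj = convert_moe_packed_tensors(blocks, scales)
print(gate_up_proj.shape, gate_up_proj.dtype)
```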
@@ -0,0 +1,110 @@
defaults:
- pruning: ffn_pruning
- scoring: ../validate_solutions_defaults
- realize_model: ../validate_solutions_defaults
- bypass:
- override hydra/hydra_logging: disabled
- _self_

puzzle_dir: ???
descriptor: llama

seems wrong

teacher_dir: ${puzzle_dir}/ckpts/teacher/
replacement_library_path: ${puzzle_dir}/replacement_library.json
dataset_path: ??? # path to Nemotron-Post-Training-Dataset-v2

skip_realize_model: false

build_replacement_library:
add_ffn_no_ops: true
add_attention_no_ops: true

calc_subblock_stats:
batch_sizes: [64, 96, 128]
prefill_seq_len: 4096
generation_seq_len: 4096
num_active_tokens_override: # Optional override for sequence lengths
prefill_queue_size: 0
allocate_prefill_query: false
benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking
merge_with_existing_stats: false
subblock_stats_filename: "subblock_stats.json"
moe_stats_filename: "moe_stats.json"
runtime_stats:
backend: trt_torch

scoring:
descriptor: ${descriptor}
solutions_to_validate:
skip_existing_solutions: true

replacement_library_path: ${replacement_library_path}
solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json}
teacher_dir: ${to_path:${teacher_dir}}
output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation

eval_samples: 128
micro_batch_size: 1
seed: 42
shuffle_seed: 444
dataset_path: ${dataset_path}

mip:
single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}}
subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}}
output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions}
gathered_metrics_path:
puzzle_profile:

# puzzle_profile:
objective: metrics.cosine_embedding_loss_hidden_states
bigger_is_better: false

subblock_stats_args:
- batch_size: 96
weights_dtype: torch.bfloat16
activations_dtype: torch.bfloat16
kv_cache_dtype: torch.bfloat16

report_additional_costs:
- stats.memory_mib
- stats.num_params
- stats.num_kv_heads
- stats.has_attention
- stats.has_ffn
- stats.kv_cache_memory_mib
- stats.attention_memory_mib
- stats.ffn_memory_mib
- stats.ffn_num_params
- stats.attention_num_params

human_constraints:
target_memory: 45_000
num_params: 3_000_000_000

mip_constraints:
metric_overrides:
max_seconds_per_solution: 60

realize_model:
descriptor: ${descriptor}
teacher_dir: ${to_path:${teacher_dir}}
tokenizer_name: ${to_path:${teacher_dir}}
replacement_library_path: ${replacement_library_path}
save_models: true
solutions_path: # Filled dynamically

# Validate params
skip_validation: false # Set to true to skip validation of the realized model solutions
eval_samples: 128
micro_batch_size: 1
seed: 42
shuffle_seed: 444
dataset_path: ${dataset_path}

nccl_timeout_minutes: ${timedelta_minutes:10}

# This section redirects Hydra outputs
hydra:
run:
dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S}

@@ -0,0 +1,22 @@
defaults:
- gptoss-20b
- _self_

# Input Hugging Face model to compress
input_hf_model_path: /workspace/hf_models/openai/gpt-oss-20b

# Dataset path for pruning and NAS scoring
dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2

# Working directory for compression outputs
puzzle_dir: /workspace/puzzle_dir

# MIP memory constraint (in MiB)
mip:
human_constraints:
target_memory: 45_000 # ~44 GiB

# FFN intermediate sizes to search over (heterogeneous architecture)
# teacher_intermediate_size is 8192, so we use proportionally smaller values
pruning:
intermediate_size_list: [2048, 4096, 6144]

is it needed if we prune for num_of_experts?

@@ -0,0 +1,21 @@
defaults:
- pruning_defaults

eval_samples: 2500 #10
activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/expert_removal/${pruning.experiment_id}

pruning_mixin:
_target_: modelopt.torch.puzzletron.pruning.expert_removal_pruning_mixin.ExpertRemovalPruningMixIn
layer_descriptor:
_target_: modelopt.torch.puzzletron.anymodel.models.gpt_oss_20b.gpt_oss_20b_model_descriptor.GptOss20bExpertRemovalLayerDescriptor
target_name: "mlp.router"

hook_class: ${get_object:utils.activation_hooks.hooks.RankedChoiceVotingHook}
activation_hooks_kwargs: # Additional kwargs to pass to the hook init

num_experts_to_keep_list: [24, 16, 8] # num_experts in teacher is 32
mlp_init_mode: "ExpertRemoval"
mlp_init_config_yaml:
expert_scores_key: "expert_ranks"
layer_prefix_template: "model.layers.{layer_idx}.mlp.router"

@@ -0,0 +1,34 @@
defaults:
- /validate_model_defaults

model_name_or_path: ${teacher_dir}
experiment_id: ${pruning.eval_samples}samples_diverse_mini
activations_log_dir: ???
activation_hooks_kwargs: ???

descriptor: ${descriptor}

# Data:
eval_samples: 10_000
micro_batch_size: 1
dataset_path: ${dataset_path}
val_dataset_name: train

# Prune ckpts
pruned_ckpts_outpt_dir: ${puzzle_dir}/pruning/${pruning.experiment_id}

## FFN pruning
ffn_list:
mlp_init_mode: "Truncate" # PruneByActivationsLog

## KV-heads pruning
n_heads_in_group_list:
gqa_init_mode: "AverageKV"

## Hidden dimension pruning
hidden_size_list:
hidden_size_init_mode: "PruneByChannelRanking"
linear_init_mode: "FromTeacher"

mlp_init_config_yaml:
activations_log_dir: ${pruning.activations_log_dir}
@@ -0,0 +1,18 @@
model_dtype: torch.bfloat16 # dtype to cast the model for validate_model
autocast_dtype: torch.bfloat16 # dtype for torch.autocast for validate_model
block_size: 8192
bos_rate: 0.5
data_column: messages
val_dataset_name: valid
shuffle_seed: 81436
seed: 42
fim_rate: 0
fim_spm_rate: 0
source_datasets_to_discard:
varlen: false
write_results: false
calc_losses_on_cpu: false
activations_log_dir:
model_name_or_path:
load_dataset_fn: ${get_object:modelopt.torch.puzzletron.utils.data.dataloaders.load_from_disk_fn}

@@ -0,0 +1,11 @@
defaults:
- /validate_model_defaults
- _self_

solutions_to_validate:
skip_validation: false
save_models: false
bigger_is_better: false
sort_solutions_by:
calculate_full_score_ablations: false

17 changes: 15 additions & 2 deletions modelopt/torch/puzzletron/anymodel/converter/converter.py
@@ -27,6 +27,7 @@
from safetensors.torch import load_file, save_file
from tqdm import tqdm
from transformers import PretrainedConfig
from transformers.integrations.mxfp4 import convert_moe_packed_tensors

from modelopt.torch.puzzletron.anymodel.model_descriptor import ModelDescriptor
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig
@@ -61,8 +62,9 @@ def _get_weight_map(input_dir: Path) -> Dict[str, str]:
f"Neither {index_path} nor {single_file_path} found. Cannot determine model format."
)

@staticmethod
@classmethod
def convert_model_weights(
cls,
input_dir: Path, output_dir: Path, descriptor: ModelDescriptor, num_hidden_layers: int
):
"""Convert model weights to subblock format."""
@@ -95,7 +97,18 @@ def convert_model_weights(
data = load_file(os.path.join(input_dir, file))
for name in param_names:
if param_to_file[name] == file and name in data:
tensors[name] = data[name]
converted_name = cls.convert_weight_name(name)
# Convert MoE packed tensors when the checkpoint is mxfp4-quantized (gpt-oss-20b)
if getattr(cls, 'quantized', None) == 'mxfp4':
if name.endswith("_blocks"):
converted_name = converted_name.replace("_blocks", "")
tensors[converted_name] = convert_moe_packed_tensors(data[converted_name+"_blocks"], data[converted_name+"_scales"])
elif name.endswith("_scales"):
continue
else:
tensors[converted_name] = data[name]
else:
tensors[converted_name] = data[name]

# Save this subblock
print(f"\n✅ Group: {subblock} ({len(tensors)} layers)")
@@ -36,6 +36,7 @@ class GptOss20bConverter(Converter):
GPT-OSS-20B is a pure MoE model with 32 experts per layer and 4 active experts.
All layers use MoE FFN (no standard dense FFN layers).
"""
quantized = 'mxfp4'

@staticmethod
def create_block_configs_from_main_config(config: PretrainedConfig) -> List[BlockConfig]:
@@ -50,6 +50,13 @@ class GptOss20bModelDescriptor(ModelDescriptor):

_DECODER_LAYER_CLS: Type[nn.Module] = None

@classmethod
def create_dummy_block(cls, original_layer: GptOssDecoderLayer, block_index: int) -> nn.Module:
dummy_block = DummyBlock(block_index=block_index)
# Required by `GptOssModel.forward`.
dummy_block.attention_type = original_layer.attention_type
return dummy_block

@staticmethod
def decoder_layer_cls():
"""Get the decoder layer class for GPT-OSS models.
@@ -132,7 +139,7 @@ def build_ffn_predicates() -> Dict[str, re.Pattern]:
r"(post_attention_layernorm\.weight"
r"|mlp\.router\.weight"
r"|mlp\.router\.bias"
r"|mlp\.experts\.((\d+\.)?(gate_up_proj|down_proj)(\.(weight|bias|blocks|scales))?|gate_up_proj_(bias|blocks|scales)|down_proj_(bias|blocks|scales)))$"
r"|mlp\.experts\.(gate_up_proj|down_proj)(_(bias|blocks|scales))?)$"
)
for layer_idx in range(num_layers)
}
@@ -190,12 +197,15 @@ class GptOss20bExpertRemovalLayerDescriptor(ExpertRemovalLayerDescriptor):

target_name: str = "mlp"
moe_prefix_name: str = "model.layers.{layer_idx}.mlp"
expert_prefix_name: str = "experts.{expert_idx}"
expert_prefix_name: str = "experts"

# Router has both weight and bias
router_weights: List[str] = field(default_factory=lambda: ["router.weight"])
router_biases: List[str] = field(default_factory=lambda: ["router.bias"])

# Fused format: experts stored as single tensors
is_fused_experts: bool = True

# Fused format: single tensors containing all experts (test models)
fused_expert_weights: List[str] = field(
default_factory=lambda: [
@@ -212,5 +222,12 @@ class GptOss20bExpertRemovalLayerDescriptor(ExpertRemovalLayerDescriptor):
default_factory=lambda: ["gate_up_proj_bias", "down_proj_bias"]
)

# Fused format: experts stored as single tensors
is_fused_experts: bool = True
def get_modules_names_to_hook(self, model) -> List[Tuple[int, str]]:
target_class_name = "GptOssTopKRouter"

module_names_to_hook = []
for module_name, module in model.named_modules():
if module_name.endswith(self.target_name) and module.__class__.__name__ == target_class_name:
module_names_to_hook.append((self.block_idx_from_module_name(module_name), module_name))
return module_names_to_hook
