3 changes: 2 additions & 1 deletion CHANGELOG.rst

@@ -14,6 +14,7 @@ NVIDIA Model Optimizer Changelog (Linux)
 - Add support for parallel draft heads in Eagle speculative decoding.
 - Add support to enable custom emulated quantization backend. See :meth:`register_quant_backend <modelopt.torch.quantization.nn.modules.tensor_quantizer.register_quant_backend>`` for more details. See an example in ``tests/unit/torch/quantization/test_custom_backend.py``.
 - Add ``examples/llm_qad`` for QAD training with Megatron-LM.
+- Add support for ``params`` constraint based automatic neural architecture search in Minitron pruning (``mcore_minitron``) as an alternative to manual pruning using ``export_config``. See `examples/pruning/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning>`_ for more details on its usage.

 **Deprecations**

@@ -80,7 +81,7 @@ NVIDIA Model Optimizer Changelog (Linux)

 **Documentation**

-- Add general guidelines for Minitron pruning and distillation. See `examples/pruning/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning#pruning-guidelines>`_ for more details.
+- Add general guidelines for Minitron pruning and distillation. See `pruning guidelines <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning#pruning-guidelines>`_ for more details.
 - Added example for exporting QLoRA checkpoint for vLLM deployment. Refer to `examples/llm_qat/README.md <https://github.com/NVIDIA/Model-Optimizer/blob/79ef31bc7269ba4da0cfab446da5b64509cbfcef/examples/llm_qat/README.md#qlora-deployment>`_ for more details

 0.37 (2025-10-08)
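The new ``params``-constraint entry above lets the Minitron search target a parameter budget instead of a hand-written ``export_config``. A minimal sketch of the intended usage, assuming the ``mtp.prune`` entry point and a purely illustrative constraint value, dummy batch, and calibration loop (see ``examples/pruning/README.md`` for the authoritative API):

import modelopt.torch.prune as mtp

# Hypothetical sketch: prune a Megatron-Core GPT model to a parameter budget.
# The constraint value, dummy batch, and forward loop below are placeholders.
pruned_model, _ = mtp.prune(
    model,                                  # a Megatron-Core GPTModel
    mode="mcore_minitron",
    constraints={"params": 6e9},            # assumed format: target ~6B parameters
    dummy_input=dummy_batch,                # representative batch for the traced forward
    config={"forward_loop": forward_loop},  # calibration loop over a small dataset
)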
20 changes: 3 additions & 17 deletions modelopt/torch/nas/plugins/megatron.py

@@ -27,8 +27,6 @@
 from megatron.core.models.gpt import GPTModel
 from megatron.core.parallel_state import (
     get_data_parallel_group,
-    get_pipeline_model_parallel_group,
-    get_tensor_model_parallel_group,
     is_pipeline_first_stage,
     is_pipeline_last_stage,
 )
@@ -54,13 +52,8 @@
 from modelopt.torch.opt.searcher import ConstraintsDict
 from modelopt.torch.trace import Symbol
 from modelopt.torch.utils import distributed as dist
-from modelopt.torch.utils import (
-    get_module_device,
-    make_divisible,
-    param_num_from_forward,
-    print_rank_0,
-    random,
-)
+from modelopt.torch.utils import make_divisible, print_rank_0, random
+from modelopt.torch.utils.plugins import param_num_megatron

 from ..algorithms import (
     MODULE_TYPE_TO_CONSTRAINTS_FUNC,
@@ -1045,7 +1038,6 @@ def modify(
         *,
         hidden_size_divisor: int = 1,
         ffn_hidden_size_divisor: int = 1,
-        mamba_num_heads_divisor: int = 1,
         mamba_head_dim_divisor: int = 1,
         num_moe_experts_divisor: int = 1,
     ):
@@ -1054,7 +1046,6 @@
         Args:
             hidden_size_divisor: The divisor of the hidden_size.
             ffn_hidden_size_divisor: The divisor of the mlp ffn_hidden_size.
-            mamba_num_heads_divisor: The divisor of the mamba num_heads.
             mamba_head_dim_divisor: The divisor of the mamba head_dim.
             num_moe_experts_divisor: The divisor of the number of MoE experts.
         """
@@ -1065,7 +1056,6 @@
         for layer in self.decoder.layers:
             layer.modify(
                 ffn_hidden_size_divisor=ffn_hidden_size_divisor,
-                mamba_num_heads_divisor=mamba_num_heads_divisor,
                 mamba_head_dim_divisor=mamba_head_dim_divisor,
                 num_moe_experts_divisor=num_moe_experts_divisor,
             )
@@ -1142,11 +1132,7 @@ def constraint_eval_funcs(self) -> dict[str, ConstraintEvalFunc]:

     def _get_params(self, _: ConstraintsRes | None = None) -> float:
         """Get number of model parameters from forward pass."""
-        params = param_num_from_forward(self.model, args=self.dummy_input, unit=1.0)
-        reduced_params = torch.Tensor([params]).to(device=get_module_device(self.model))
-        torch.distributed.all_reduce(reduced_params, group=get_pipeline_model_parallel_group())
-        torch.distributed.all_reduce(reduced_params, group=get_tensor_model_parallel_group())
-        return reduced_params.item()
+        return param_num_megatron(self.model, from_forward=True, args=self.dummy_input)

     def _get_flops(self, _: ConstraintsRes | None = None) -> float:
         """Get inference FLOPs."""
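In ``_get_params``, the hand-rolled all-reduce over the pipeline- and tensor-parallel groups is replaced by the shared ``param_num_megatron`` utility. Functionally, the deleted lines amount to the following sketch (reconstructed from the removed code; the helper's actual implementation lives in ``modelopt.torch.utils.plugins``):

import torch
from megatron.core.parallel_state import (
    get_pipeline_model_parallel_group,
    get_tensor_model_parallel_group,
)
from modelopt.torch.utils import get_module_device, param_num_from_forward

def _params_from_forward_all_ranks(model, args) -> float:
    # Count parameters exercised by a forward pass on this rank only ...
    params = param_num_from_forward(model, args=args, unit=1.0)
    # ... then sum across pipeline- and tensor-parallel ranks so every rank
    # reports the full, unsharded parameter count.
    reduced = torch.Tensor([params]).to(device=get_module_device(model))
    torch.distributed.all_reduce(reduced, group=get_pipeline_model_parallel_group())
    torch.distributed.all_reduce(reduced, group=get_tensor_model_parallel_group())
    return reduced.item()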
3 changes: 2 additions & 1 deletion modelopt/torch/opt/searcher.py

@@ -35,7 +35,7 @@
 import torch.nn as nn

 from modelopt.torch.utils import distributed as dist
-from modelopt.torch.utils import no_stdout, run_forward_loop
+from modelopt.torch.utils import no_stdout, print_rank_0, run_forward_loop

 LimitsTuple = tuple[float, float]
 ConstraintsDict = dict[str, str | float | dict | None]
@@ -212,6 +212,7 @@ def construct_forward_loop(
             return None

         def forward_loop_with_silence_check(m: nn.Module) -> None:
+            print_rank_0("Running forward loop...")
             with no_stdout() if silent else nullcontext():
                 if data_loader is not None:
                     run_forward_loop(
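The new log line uses ``print_rank_0`` so the message appears once per job rather than once per process, even though the forward loop itself may run silenced under ``no_stdout``. Conceptually, the utility behaves like the following sketch (an illustration of the idea, not the actual modelopt implementation):

import torch.distributed as dist

def print_rank_0_sketch(*args, **kwargs):
    # Print only on the first rank (or when not running distributed),
    # so multi-GPU jobs do not repeat the same message on every process.
    if not dist.is_initialized() or dist.get_rank() == 0:
        print(*args, **kwargs)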
2 changes: 0 additions & 2 deletions modelopt/torch/prune/__init__.py

@@ -19,8 +19,6 @@
 simplifies the overall workflow to accommodate for the simpler nature of pruning algorithms.
 """

-# nas is a required - so let's check if it's available
-import modelopt.torch.nas
 from modelopt.torch.utils import import_plugin

 from . import fastnas, gradnas, plugins