3 changes: 2 additions & 1 deletion CHANGELOG.rst

@@ -14,6 +14,7 @@ NVIDIA Model Optimizer Changelog (Linux)
 - Add support for parallel draft heads in Eagle speculative decoding.
 - Add support to enable custom emulated quantization backend. See :meth:`register_quant_backend <modelopt.torch.quantization.nn.modules.tensor_quantizer.register_quant_backend>`` for more details. See an example in ``tests/unit/torch/quantization/test_custom_backend.py``.
 - Add ``examples/llm_qad`` for QAD training with Megatron-LM.
+- Add support for ``params`` constraint based automatic neural architecture search in Minitron pruning (``mcore_minitron``) as an alternative to manual pruning using ``export_config``. See `examples/pruning/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning>`_ for more details on its usage.

 **Deprecations**

@@ -80,7 +81,7 @@ NVIDIA Model Optimizer Changelog (Linux)

 **Documentation**

-- Add general guidelines for Minitron pruning and distillation. See `examples/pruning/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning#pruning-guidelines>`_ for more details.
+- Add general guidelines for Minitron pruning and distillation. See `pruning guidelines <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning#pruning-guidelines>`_ for more details.
 - Added example for exporting QLoRA checkpoint for vLLM deployment. Refer to `examples/llm_qat/README.md <https://github.com/NVIDIA/Model-Optimizer/blob/79ef31bc7269ba4da0cfab446da5b64509cbfcef/examples/llm_qat/README.md#qlora-deployment>`_ for more details

 0.37 (2025-10-08)
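The new ``params``-constraint entry above lets the Minitron search target a parameter budget instead of a hand-written ``export_config``. A minimal sketch of the intended usage, assuming the ``mtp.prune`` entry point and a purely illustrative constraint value, dummy batch, and calibration loop (see ``examples/pruning/README.md`` for the authoritative API):

import modelopt.torch.prune as mtp

# Hypothetical sketch: prune a Megatron-Core GPT model to a parameter budget.
# The constraint value, dummy batch, and forward loop below are placeholders.
pruned_model, _ = mtp.prune(
    model,                                  # a Megatron-Core GPTModel
    mode="mcore_minitron",
    constraints={"params": 6e9},            # assumed format: target ~6B parameters
    dummy_input=dummy_batch,                # representative batch for the traced forward
    config={"forward_loop": forward_loop},  # calibration loop over a small dataset
)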
20 changes: 3 additions & 17 deletions modelopt/torch/nas/plugins/megatron.py

@@ -27,8 +27,6 @@
 from megatron.core.models.gpt import GPTModel
 from megatron.core.parallel_state import (
     get_data_parallel_group,
-    get_pipeline_model_parallel_group,
-    get_tensor_model_parallel_group,
     is_pipeline_first_stage,
     is_pipeline_last_stage,
 )
@@ -54,13 +52,8 @@
 from modelopt.torch.opt.searcher import ConstraintsDict
 from modelopt.torch.trace import Symbol
 from modelopt.torch.utils import distributed as dist
-from modelopt.torch.utils import (
-    get_module_device,
-    make_divisible,
-    param_num_from_forward,
-    print_rank_0,
-    random,
-)
+from modelopt.torch.utils import make_divisible, print_rank_0, random
+from modelopt.torch.utils.plugins import param_num_megatron

 from ..algorithms import (
     MODULE_TYPE_TO_CONSTRAINTS_FUNC,
@@ -1045,7 +1038,6 @@ def modify(
         *,
         hidden_size_divisor: int = 1,
         ffn_hidden_size_divisor: int = 1,
-        mamba_num_heads_divisor: int = 1,
         mamba_head_dim_divisor: int = 1,
         num_moe_experts_divisor: int = 1,
     ):
@@ -1054,7 +1046,6 @@
         Args:
             hidden_size_divisor: The divisor of the hidden_size.
             ffn_hidden_size_divisor: The divisor of the mlp ffn_hidden_size.
-            mamba_num_heads_divisor: The divisor of the mamba num_heads.
             mamba_head_dim_divisor: The divisor of the mamba head_dim.
             num_moe_experts_divisor: The divisor of the number of MoE experts.
         """
@@ -1065,7 +1056,6 @@
         for layer in self.decoder.layers:
             layer.modify(
                 ffn_hidden_size_divisor=ffn_hidden_size_divisor,
-                mamba_num_heads_divisor=mamba_num_heads_divisor,
                 mamba_head_dim_divisor=mamba_head_dim_divisor,
                 num_moe_experts_divisor=num_moe_experts_divisor,
             )
@@ -1142,11 +1132,7 @@ def constraint_eval_funcs(self) -> dict[str, ConstraintEvalFunc]:

     def _get_params(self, _: ConstraintsRes | None = None) -> float:
         """Get number of model parameters from forward pass."""
-        params = param_num_from_forward(self.model, args=self.dummy_input, unit=1.0)
-        reduced_params = torch.Tensor([params]).to(device=get_module_device(self.model))
-        torch.distributed.all_reduce(reduced_params, group=get_pipeline_model_parallel_group())
-        torch.distributed.all_reduce(reduced_params, group=get_tensor_model_parallel_group())
-        return reduced_params.item()
+        return param_num_megatron(self.model, from_forward=True, args=self.dummy_input)

     def _get_flops(self, _: ConstraintsRes | None = None) -> float:
         """Get inference FLOPs."""
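In ``_get_params``, the hand-rolled all-reduce over the pipeline- and tensor-parallel groups is replaced by the shared ``param_num_megatron`` utility. Functionally, the deleted lines amount to the following sketch (reconstructed from the removed code; the helper's actual implementation lives in ``modelopt.torch.utils.plugins``):

import torch
from megatron.core.parallel_state import (
    get_pipeline_model_parallel_group,
    get_tensor_model_parallel_group,
)
from modelopt.torch.utils import get_module_device, param_num_from_forward

def _params_from_forward_all_ranks(model, args) -> float:
    # Count parameters exercised by a forward pass on this rank only ...
    params = param_num_from_forward(model, args=args, unit=1.0)
    # ... then sum across pipeline- and tensor-parallel ranks so every rank
    # reports the full, unsharded parameter count.
    reduced = torch.Tensor([params]).to(device=get_module_device(model))
    torch.distributed.all_reduce(reduced, group=get_pipeline_model_parallel_group())
    torch.distributed.all_reduce(reduced, group=get_tensor_model_parallel_group())
    return reduced.item()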
3 changes: 2 additions & 1 deletion modelopt/torch/opt/searcher.py

@@ -35,7 +35,7 @@
 import torch.nn as nn

 from modelopt.torch.utils import distributed as dist
-from modelopt.torch.utils import no_stdout, run_forward_loop
+from modelopt.torch.utils import no_stdout, print_rank_0, run_forward_loop

 LimitsTuple = tuple[float, float]
 ConstraintsDict = dict[str, str | float | dict | None]
@@ -212,6 +212,7 @@ def construct_forward_loop(
             return None

         def forward_loop_with_silence_check(m: nn.Module) -> None:
+            print_rank_0("Running forward loop...")
             with no_stdout() if silent else nullcontext():
                 if data_loader is not None:
                     run_forward_loop(
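The new log line uses ``print_rank_0`` so the message appears once per job rather than once per process, even though the forward loop itself may run silenced under ``no_stdout``. Conceptually, the utility behaves like the following sketch (an illustration of the idea, not the actual modelopt implementation):

import torch.distributed as dist

def print_rank_0_sketch(*args, **kwargs):
    # Print only on the first rank (or when not running distributed),
    # so multi-GPU jobs do not repeat the same message on every process.
    if not dist.is_initialized() or dist.get_rank() == 0:
        print(*args, **kwargs)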
2 changes: 0 additions & 2 deletions modelopt/torch/prune/__init__.py

@@ -19,8 +19,6 @@
 simplifies the overall workflow to accommodate for the simpler nature of pruning algorithms.
 """

-# nas is a required - so let's check if it's available
-import modelopt.torch.nas
 from modelopt.torch.utils import import_plugin

 from . import fastnas, gradnas, plugins