Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions


@@ -5,8 +5,8 @@ from collections.abc import Callable, Iterable
from enum import Enum
from typing import Literal, cast, get_args, overload
import ast
import re
import torch
import torch.nn.functional as F
from torch.nn.parameter import UninitializedParameter
import vllm.envs as envs
@@ -54,10 +54,14 @@ from vllm.model_executor.layers.quantization.base_config import (
)
from vllm.platforms import current_platform
from vllm.utils.math_utils import round_up
from vllm.model_executor.layers.utils import (
parse_opt_exclude_layers,
weight_quant_l1,
weight_quant_l2,
)
logger = init_logger(__name__)
class FusedMoeWeightScaleSupported(Enum):
TENSOR = "tensor"
CHANNEL = "channel"
@@ -333,6 +337,7 @@ class FusedMoE(CustomOp):
gate: torch.nn.Module | None = None,
shared_experts: torch.nn.Module | None = None,
routed_input_transform: torch.nn.Module | None = None,
fused_shared_output: bool = False,
):
super().__init__()
@@ -483,6 +488,8 @@ class FusedMoE(CustomOp):
(expert_mask == 0) | (expert_mask == 1)
), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s."
self.hidden_size = hidden_size
self.num_experts = num_experts
assert intermediate_size % self.tp_size == 0
self.intermediate_size_per_partition = intermediate_size // self.tp_size
self.reduce_results = reduce_results
@@ -526,16 +533,18 @@ class FusedMoE(CustomOp):
# Round up hidden size before creating moe_config.
# This way moe_config is created with the correct hidden_size from the start.
unpadded_hidden_size = hidden_size
self.model_type = (
self.vllm_config.model_config.hf_config.model_type
if self.vllm_config.model_config is not None
else None
)
hidden_size = maybe_roundup_hidden_size(
hidden_size=hidden_size,
act_dtype=moe_in_dtype,
moe_parallel_config=self.moe_parallel_config,
is_lora_enabled=vllm_config.lora_config is not None,
model_type=(
self.vllm_config.model_config.hf_config.model_type
if self.vllm_config.model_config is not None
else None
),
model_type=self.model_type,
is_mxfp4_quant=(
quant_config is not None and quant_config.is_mxfp4_quant(prefix, self)
),
@@ -581,14 +590,27 @@ class FusedMoE(CustomOp):
"""
quant_method = None
if self.quant_config is not None:
self.opt_level = 0
quant_method = self.quant_config.get_quant_method(self, prefix)
if quant_method is None:
quant_method = UnquantizedFusedMoEMethod(self.moe_config)
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (
CompressedTensorsL1OptMoEMethod, CompressedTensorsL2OptMoEMethod)
if self.opt_level == 1:
quant_method = CompressedTensorsL1OptMoEMethod(self.moe_config)
elif self.opt_level == 2:
quant_method = CompressedTensorsL2OptMoEMethod(self.moe_config)
else:
quant_method = UnquantizedFusedMoEMethod(self.moe_config)
assert isinstance(quant_method, FusedMoEMethodBase)
return quant_method
# Note: get_quant_method will look at the layer's local_num_experts
# for heuristic purposes, so it must be initialized first.
self.opt_level = envs.VLLM_MOE_OPT_LEVEL
if parse_opt_exclude_layers(envs.VLLM_OPT_EXCLUDE_LAYERS, prefix):
self.opt_flag = False
logger.info(f"Excluding layer {prefix} from optimization")
self.quant_method: FusedMoEMethodBase = _get_quant_method()
if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike():
@@ -611,6 +633,7 @@ class FusedMoE(CustomOp):
moe_quant_params = {
"num_experts": self.local_num_experts,
"hidden_size": hidden_size,
"unpadded_hidden_size": unpadded_hidden_size,
"intermediate_size_per_partition": self.intermediate_size_per_partition,
"params_dtype": params_dtype,
"weight_loader": self.weight_loader,
@@ -625,6 +648,7 @@ class FusedMoE(CustomOp):
moe_quant_params["intermediate_size_full"] = intermediate_size
self.quant_method.create_weights(layer=self, **moe_quant_params)
self.base_quant_method = self.quant_method
# Disable shared expert overlap if:
# - we are using eplb with non-default backend, because of correctness issues
@@ -638,7 +662,10 @@ class FusedMoE(CustomOp):
)
and self._shared_experts is not None
)
if fused_shared_output:
assert not self.use_ep, "Fused shared output is only supported when EP is disabled."
assert shared_experts is not None, "Shared experts must be provided when fused_shared_output is True."
self.fused_shared_output = fused_shared_output
self.runner = self._init_runner()
def _init_runner(self):
@@ -655,6 +682,7 @@ class FusedMoE(CustomOp):
quant_method=self.quant_method,
reduce_results=self.reduce_results,
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
fused_shared_output=self.fused_shared_output,
)
# TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py
@@ -681,7 +709,7 @@ class FusedMoE(CustomOp):
# routing_tables only needed for round-robin expert placement with
# DeepEP all2all backend.
routing_tables = self._maybe_init_expert_routing_tables()
prepare_finalize = self.quant_method.maybe_make_prepare_finalize(
prepare_finalize = self.base_quant_method.maybe_make_prepare_finalize(
routing_tables=routing_tables
)
if prepare_finalize is not None:
@@ -691,7 +719,7 @@ class FusedMoE(CustomOp):
self._replace_quant_method(
FusedMoEModularMethod.make(
self,
self.quant_method,
self.base_quant_method,
prepare_finalize,
self.shared_experts,
inplace=not self.moe_config.disable_inplace,
@@ -959,11 +987,7 @@ class FusedMoE(CustomOp):
else:
assert shard_id == "w3"
expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
try:
expert_data.copy_(loaded_weight)
except Exception as e:
print(expert_data.shape, expert_data.dtype, loaded_weight.shape, loaded_weight.dtype)
raise e
expert_data.copy_(loaded_weight)
def _load_w2(
self,
@@ -976,7 +1000,7 @@ class FusedMoE(CustomOp):
# Index the loaded weight for tp sharding.
# down_proj: "RowParallel" so tp sharding on input_dim
# Narrow parameter and load.
shard_size = expert_data.shape[shard_dim]
shard_size = loaded_weight.shape[shard_dim] // self.tp_size
# Only narrow if the loaded_weight is not a scalar (0-dim tensor)
# and we're not loading the full weight
if not load_full and loaded_weight.ndim > 0:
@@ -984,7 +1008,55 @@ class FusedMoE(CustomOp):
shard_dim, shard_size * tp_rank, shard_size
)
# w2, down_proj: Load into only logical weight of w2.
expert_data.copy_(loaded_weight)
expert_data.narrow(shard_dim, 0, shard_size).copy_(loaded_weight)
def _load_model_opt_weight_or_group_weight_scale(self,
shard_dim: int,
shard_dim_scale: int,
expert_data: torch.Tensor,
scale_data: torch.Tensor,
shard_id: str,
loaded_weight: torch.Tensor,
tp_rank: int,
opt_level: int,
load_full_w2: bool = False):
"""
Load grouped weight scales for group quantization or model weights
:param shard_dim: dimension to shard
:param expert_data: parameter for a particular expert
:param shard_id: either w1, w2, or w3
:param loaded_weight: checkpoint weight to load into the param
:param tp_rank: tensor parallel rank
:param load_full_w2: whether or not the w2 loaded should be sharded.
"""
assert opt_level in [1, 2]
if opt_level == 1:
weight, scale = weight_quant_l1(loaded_weight)
else:
weight, scale = weight_quant_l2(loaded_weight)
scale = scale.view(1, -1)
if shard_id == "w2":
# In the case where we have actorder/g_idx, we do not partition the
# w2 scales, as indicated by `load_full` argument, for all tp cases
self._load_w2(shard_dim=shard_dim,
loaded_weight=weight,
expert_data=expert_data,
tp_rank=tp_rank,
load_full=load_full_w2)
scale_data.copy_(scale)
elif shard_id in ("w1", "w3"):
self._load_w13(shard_id=shard_id,
shard_dim=shard_dim,
loaded_weight=weight,
expert_data=expert_data,
tp_rank=tp_rank)
self._load_w13(shard_id=shard_id,
shard_dim=shard_dim_scale,
loaded_weight=scale,
expert_data=scale_data,
tp_rank=tp_rank)
def _load_single_value(
self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int
@@ -1147,7 +1219,6 @@ class FusedMoE(CustomOp):
shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
if is_transposed:
shard_dim = int(not shard_dim)
shard_dim_force = getattr(param, "shard_dim", None)
shard_dim = shard_dim_force if shard_dim_force is not None else shard_dim
@@ -1309,13 +1380,28 @@ class FusedMoE(CustomOp):
# Case model weights
if "weight" in weight_name:
self._load_model_weight_or_group_weight_scale(
shard_id=shard_id,
shard_dim=shard_dim,
loaded_weight=loaded_weight,
expert_data=expert_data,
tp_rank=self.tp_rank,
)
if self.opt_level != 0:
scale_name = weight_name.split('.')[-1] + "_scale"
params_dict = dict(self.named_parameters())
scale_param = params_dict[scale_name]
shard_dim_scale = getattr(scale_param, "shard_dim", None)
scale_expert_data = scale_param.data if full_load else scale_param.data[expert_id]
self._load_model_opt_weight_or_group_weight_scale(
shard_id=shard_id,
shard_dim=shard_dim,
shard_dim_scale=shard_dim_scale,
loaded_weight=loaded_weight,
expert_data=expert_data,
scale_data=scale_expert_data,
opt_level=self.opt_level,
tp_rank=self.tp_rank)
else:
self._load_model_weight_or_group_weight_scale(
shard_id=shard_id,
shard_dim=shard_dim,
loaded_weight=loaded_weight,
expert_data=expert_data,
tp_rank=self.tp_rank)
return True if return_success else None
return False if return_success else None
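Aside: the quant-method dispatch in _get_quant_method above is driven by two new environment knobs, VLLM_MOE_OPT_LEVEL and VLLM_OPT_EXCLUDE_LAYERS. Below is a minimal sketch of that selection flow. The real parse_opt_exclude_layers helper is imported from vllm.model_executor.layers.utils and is not shown in this change, so the comma-separated-substring semantics assumed here are an illustration only; likewise, the diff records an exclusion by setting self.opt_flag = False rather than resetting opt_level, and the sketch folds exclusion into the returned level purely for readability.

# Sketch, not part of the commit: mirrors the opt-level dispatch shown above.
import os


def parse_opt_exclude_layers(exclude_spec: str, prefix: str) -> bool:
    # Hypothetical stand-in: treat the env value as comma-separated
    # layer-name substrings to exclude.
    patterns = [p.strip() for p in exclude_spec.split(",") if p.strip()]
    return any(p in prefix for p in patterns)


def effective_opt_level(prefix: str, has_quant_config: bool) -> int:
    opt_level = int(os.environ.get("VLLM_MOE_OPT_LEVEL", "0"))
    if has_quant_config:
        # With an explicit quantization config the opt path is bypassed
        # (the diff sets self.opt_level = 0 before calling get_quant_method).
        return 0
    if parse_opt_exclude_layers(os.environ.get("VLLM_OPT_EXCLUDE_LAYERS", ""), prefix):
        # Excluded layers fall back to the unoptimized path.
        return 0
    # 1 -> CompressedTensorsL1OptMoEMethod, 2 -> CompressedTensorsL2OptMoEMethod,
    # anything else -> UnquantizedFusedMoEMethod.
    return opt_level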