Upgrade to vllm 0.17.0 corex v4.1 overlay
This commit is contained in:
@@ -5,8 +5,8 @@ from collections.abc import Callable, Iterable
|
||||
from enum import Enum
|
||||
from typing import Literal, cast, get_args, overload
|
||||
|
||||
import ast, re
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch.nn.parameter import UninitializedParameter
|
||||
|
||||
import vllm.envs as envs
|
||||
@@ -54,10 +54,14 @@ from vllm.model_executor.layers.quantization.base_config import (
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.math_utils import round_up
|
||||
from vllm.model_executor.layers.utils import (
|
||||
parse_opt_exclude_layers,
|
||||
weight_quant_l1,
|
||||
weight_quant_l2,
|
||||
)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class FusedMoeWeightScaleSupported(Enum):
|
||||
TENSOR = "tensor"
|
||||
CHANNEL = "channel"
|
||||
@@ -333,6 +337,7 @@ class FusedMoE(CustomOp):
|
||||
gate: torch.nn.Module | None = None,
|
||||
shared_experts: torch.nn.Module | None = None,
|
||||
routed_input_transform: torch.nn.Module | None = None,
|
||||
fused_shared_output: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
@@ -483,6 +488,8 @@ class FusedMoE(CustomOp):
|
||||
(expert_mask == 0) | (expert_mask == 1)
|
||||
), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s."
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.num_experts = num_experts
|
||||
assert intermediate_size % self.tp_size == 0
|
||||
self.intermediate_size_per_partition = intermediate_size // self.tp_size
|
||||
self.reduce_results = reduce_results
|
||||
@@ -526,16 +533,18 @@ class FusedMoE(CustomOp):
|
||||
|
||||
# Round up hidden size before creating moe_config.
|
||||
# This way moe_config is created with the correct hidden_size from the start.
|
||||
unpadded_hidden_size = hidden_size
|
||||
self.model_type = (
|
||||
self.vllm_config.model_config.hf_config.model_type
|
||||
if self.vllm_config.model_config is not None
|
||||
else None
|
||||
)
|
||||
hidden_size = maybe_roundup_hidden_size(
|
||||
hidden_size=hidden_size,
|
||||
act_dtype=moe_in_dtype,
|
||||
moe_parallel_config=self.moe_parallel_config,
|
||||
is_lora_enabled=vllm_config.lora_config is not None,
|
||||
model_type=(
|
||||
self.vllm_config.model_config.hf_config.model_type
|
||||
if self.vllm_config.model_config is not None
|
||||
else None
|
||||
),
|
||||
model_type=self.model_type,
|
||||
is_mxfp4_quant=(
|
||||
quant_config is not None and quant_config.is_mxfp4_quant(prefix, self)
|
||||
),
|
||||
@@ -581,14 +590,27 @@ class FusedMoE(CustomOp):
|
||||
"""
|
||||
quant_method = None
|
||||
if self.quant_config is not None:
|
||||
self.opt_level = 0
|
||||
quant_method = self.quant_config.get_quant_method(self, prefix)
|
||||
if quant_method is None:
|
||||
quant_method = UnquantizedFusedMoEMethod(self.moe_config)
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (
|
||||
CompressedTensorsL1OptMoEMethod, CompressedTensorsL2OptMoEMethod)
|
||||
if self.opt_level == 1:
|
||||
quant_method = CompressedTensorsL1OptMoEMethod(self.moe_config)
|
||||
elif self.opt_level == 2:
|
||||
quant_method = CompressedTensorsL2OptMoEMethod(self.moe_config)
|
||||
else:
|
||||
quant_method = UnquantizedFusedMoEMethod(self.moe_config)
|
||||
assert isinstance(quant_method, FusedMoEMethodBase)
|
||||
return quant_method
|
||||
|
||||
# Note: get_quant_method will look at the layer's local_num_experts
|
||||
# for heuristic purposes, so it must be initialized first.
|
||||
self.opt_level = envs.VLLM_MOE_OPT_LEVEL
|
||||
if parse_opt_exclude_layers(envs.VLLM_OPT_EXCLUDE_LAYERS, prefix):
|
||||
self.opt_flag = False
|
||||
logger.info(f"Excluding layer {prefix} from optimization")
|
||||
|
||||
self.quant_method: FusedMoEMethodBase = _get_quant_method()
|
||||
|
||||
if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike():
|
||||
@@ -611,6 +633,7 @@ class FusedMoE(CustomOp):
|
||||
moe_quant_params = {
|
||||
"num_experts": self.local_num_experts,
|
||||
"hidden_size": hidden_size,
|
||||
"unpadded_hidden_size": unpadded_hidden_size,
|
||||
"intermediate_size_per_partition": self.intermediate_size_per_partition,
|
||||
"params_dtype": params_dtype,
|
||||
"weight_loader": self.weight_loader,
|
||||
@@ -625,6 +648,7 @@ class FusedMoE(CustomOp):
|
||||
moe_quant_params["intermediate_size_full"] = intermediate_size
|
||||
|
||||
self.quant_method.create_weights(layer=self, **moe_quant_params)
|
||||
self.base_quant_method = self.quant_method
|
||||
|
||||
# Disable shared expert overlap if:
|
||||
# - we are using eplb with non-default backend, because of correctness issues
|
||||
@@ -638,7 +662,10 @@ class FusedMoE(CustomOp):
|
||||
)
|
||||
and self._shared_experts is not None
|
||||
)
|
||||
|
||||
if fused_shared_output:
|
||||
assert self.use_ep == False, "Fused shared output is only supported when EP is disabled."
|
||||
assert shared_experts is not None, "Shared experts must be provided when fused_shared_output is True."
|
||||
self.fused_shared_output = fused_shared_output
|
||||
self.runner = self._init_runner()
|
||||
|
||||
def _init_runner(self):
|
||||
@@ -655,6 +682,7 @@ class FusedMoE(CustomOp):
|
||||
quant_method=self.quant_method,
|
||||
reduce_results=self.reduce_results,
|
||||
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
|
||||
fused_shared_output=self.fused_shared_output,
|
||||
)
|
||||
|
||||
# TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py
|
||||
@@ -681,7 +709,7 @@ class FusedMoE(CustomOp):
|
||||
# routing_tables only needed for round-robin expert placement with
|
||||
# DeepEP all2all backend.
|
||||
routing_tables = self._maybe_init_expert_routing_tables()
|
||||
prepare_finalize = self.quant_method.maybe_make_prepare_finalize(
|
||||
prepare_finalize = self.base_quant_method.maybe_make_prepare_finalize(
|
||||
routing_tables=routing_tables
|
||||
)
|
||||
if prepare_finalize is not None:
|
||||
@@ -691,7 +719,7 @@ class FusedMoE(CustomOp):
|
||||
self._replace_quant_method(
|
||||
FusedMoEModularMethod.make(
|
||||
self,
|
||||
self.quant_method,
|
||||
self.base_quant_method,
|
||||
prepare_finalize,
|
||||
self.shared_experts,
|
||||
inplace=not self.moe_config.disable_inplace,
|
||||
@@ -959,11 +987,7 @@ class FusedMoE(CustomOp):
|
||||
else:
|
||||
assert shard_id == "w3"
|
||||
expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
|
||||
try:
|
||||
expert_data.copy_(loaded_weight)
|
||||
except Exception as e:
|
||||
print(expert_data.shape, expert_data.dtype, loaded_weight.shape, loaded_weight.dtype)
|
||||
raise e
|
||||
expert_data.copy_(loaded_weight)
|
||||
|
||||
def _load_w2(
|
||||
self,
|
||||
@@ -976,7 +1000,7 @@ class FusedMoE(CustomOp):
|
||||
# Index the loaded weight for tp sharding.
|
||||
# down_proj: "RowParallel" so tp sharding on input_dim
|
||||
# Narrow parameter and load.
|
||||
shard_size = expert_data.shape[shard_dim]
|
||||
shard_size = loaded_weight.shape[shard_dim] // self.tp_size
|
||||
# Only narrow if the loaded_weight is not a scalar (0-dim tensor)
|
||||
# and we're not loading the full weight
|
||||
if not load_full and loaded_weight.ndim > 0:
|
||||
@@ -984,7 +1008,55 @@ class FusedMoE(CustomOp):
|
||||
shard_dim, shard_size * tp_rank, shard_size
|
||||
)
|
||||
# w2, down_proj: Load into only logical weight of w2.
|
||||
expert_data.copy_(loaded_weight)
|
||||
expert_data.narrow(shard_dim, 0, shard_size).copy_(loaded_weight)
|
||||
|
||||
def _load_model_opt_weight_or_group_weight_scale(self,
|
||||
shard_dim: int,
|
||||
shard_dim_scale: int,
|
||||
expert_data: torch.Tensor,
|
||||
scale_data: torch.Tensor,
|
||||
shard_id: str,
|
||||
loaded_weight: torch.Tensor,
|
||||
tp_rank: int,
|
||||
opt_level: int,
|
||||
load_full_w2: bool = False):
|
||||
"""
|
||||
Quantize a checkpoint weight at load time and load both the quantized
weight and its scale into the layer's parameters.

``loaded_weight`` is passed through ``weight_quant_l1`` when
``opt_level`` is 1 and through ``weight_quant_l2`` otherwise; each call
yields a ``(weight, scale)`` pair. For ``w2`` the weight goes through
``_load_w2`` and the scale is copied into ``scale_data`` as a whole.
For ``w1``/``w3`` both the weight and the scale go through ``_load_w13``.
Any other ``shard_id`` loads nothing (there is no fallback branch),
although the quantization step has already run by then.

|
||||
:param shard_dim: dimension to shard
:param shard_dim_scale: dimension along which the scale is sharded for
    w1/w3 (unused for w2). The caller visible in this file obtains it
    via ``getattr(scale_param, "shard_dim", None)``, so it may be None
    -- TODO confirm that every scale parameter sets ``shard_dim``.
|
||||
:param expert_data: parameter for a particular expert
:param scale_data: destination for the scale produced by quantization
    (presumably the per-expert slice of the scale parameter; verify
    against the caller)
|
||||
:param shard_id: either w1, w2, or w3
|
||||
:param loaded_weight: checkpoint weight to load into the param
|
||||
:param tp_rank: tensor parallel rank
:param opt_level: selects the quantization routine; must be 1 or 2
    (enforced with an ``assert``, which is skipped under ``python -O``)
|
||||
:param load_full_w2: whether or not the w2 loaded should be sharded.
|
||||
"""
|
||||
|
||||
assert opt_level in [1, 2]
|
||||
if opt_level == 1:
|
||||
weight, scale = weight_quant_l1(loaded_weight)
|
||||
else:
|
||||
weight, scale = weight_quant_l2(loaded_weight)
|
||||
# NOTE(review): indentation is not recoverable from this view -- confirm
# whether the reshape below applies to the level-2 path only or to both.
scale = scale.view(1, -1)
|
||||
|
||||
if shard_id == "w2":
|
||||
# `load_full_w2` is forwarded to `_load_w2` as `load_full` and only
|
||||
# governs the quantized weight; the scale below is copied unsharded.
|
||||
self._load_w2(shard_dim=shard_dim,
|
||||
loaded_weight=weight,
|
||||
expert_data=expert_data,
|
||||
tp_rank=tp_rank,
|
||||
load_full=load_full_w2)
|
||||
scale_data.copy_(scale)
|
||||
elif shard_id in ("w1", "w3"):
|
||||
# First call loads the quantized weight along `shard_dim`.
self._load_w13(shard_id=shard_id,
|
||||
shard_dim=shard_dim,
|
||||
loaded_weight=weight,
|
||||
expert_data=expert_data,
|
||||
tp_rank=tp_rank)
|
||||
# Second call loads the matching scale along `shard_dim_scale`.
self._load_w13(shard_id=shard_id,
|
||||
shard_dim=shard_dim_scale,
|
||||
loaded_weight=scale,
|
||||
expert_data=scale_data,
|
||||
tp_rank=tp_rank)
|
||||
|
||||
def _load_single_value(
|
||||
self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int
|
||||
@@ -1147,7 +1219,6 @@ class FusedMoE(CustomOp):
|
||||
shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
|
||||
if is_transposed:
|
||||
shard_dim = int(not shard_dim)
|
||||
|
||||
shard_dim_force = getattr(param, "shard_dim", None)
|
||||
shard_dim = shard_dim_force if shard_dim_force is not None else shard_dim
|
||||
|
||||
@@ -1309,13 +1380,28 @@ class FusedMoE(CustomOp):
|
||||
|
||||
# Case model weights
|
||||
if "weight" in weight_name:
|
||||
self._load_model_weight_or_group_weight_scale(
|
||||
shard_id=shard_id,
|
||||
shard_dim=shard_dim,
|
||||
loaded_weight=loaded_weight,
|
||||
expert_data=expert_data,
|
||||
tp_rank=self.tp_rank,
|
||||
)
|
||||
if self.opt_level != 0:
|
||||
scale_name = weight_name.split('.')[-1] + "_scale"
|
||||
params_dict = dict(self.named_parameters())
|
||||
scale_param = params_dict[scale_name]
|
||||
shard_dim_scale = getattr(scale_param, "shard_dim", None)
|
||||
scale_expert_data = scale_param.data if full_load else scale_param.data[expert_id]
|
||||
self._load_model_opt_weight_or_group_weight_scale(
|
||||
shard_id=shard_id,
|
||||
shard_dim=shard_dim,
|
||||
shard_dim_scale=shard_dim_scale,
|
||||
loaded_weight=loaded_weight,
|
||||
expert_data=expert_data,
|
||||
scale_data=scale_expert_data,
|
||||
opt_level=self.opt_level,
|
||||
tp_rank=self.tp_rank)
|
||||
else:
|
||||
self._load_model_weight_or_group_weight_scale(
|
||||
shard_id=shard_id,
|
||||
shard_dim=shard_dim,
|
||||
loaded_weight=loaded_weight,
|
||||
expert_data=expert_data,
|
||||
tp_rank=self.tp_rank)
|
||||
return True if return_success else None
|
||||
|
||||
return False if return_success else None
|
||||
|
||||
Reference in New Issue
Block a user