Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -5,8 +5,8 @@ from collections.abc import Callable, Iterable
 from enum import Enum
 from typing import Literal, cast, get_args, overload

+import ast, re
 import torch
-import torch.nn.functional as F
 from torch.nn.parameter import UninitializedParameter

 import vllm.envs as envs
@@ -54,10 +54,14 @@ from vllm.model_executor.layers.quantization.base_config import (
 )
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import round_up
+from vllm.model_executor.layers.utils import (
+    parse_opt_exclude_layers,
+    weight_quant_l1,
+    weight_quant_l2,
+)

 logger = init_logger(__name__)

-
 class FusedMoeWeightScaleSupported(Enum):
    TENSOR = "tensor"
    CHANNEL = "channel"
@@ -333,6 +337,7 @@ class FusedMoE(CustomOp):
        gate: torch.nn.Module | None = None,
        shared_experts: torch.nn.Module | None = None,
        routed_input_transform: torch.nn.Module | None = None,
+        fused_shared_output: bool = False,
    ):
        super().__init__()

@@ -483,6 +488,8 @@ class FusedMoE(CustomOp):
                (expert_mask == 0) | (expert_mask == 1)
            ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s."

+        self.hidden_size = hidden_size
+        self.num_experts = num_experts
        assert intermediate_size % self.tp_size == 0
        self.intermediate_size_per_partition = intermediate_size // self.tp_size
        self.reduce_results = reduce_results
@@ -526,16 +533,18 @@ class FusedMoE(CustomOp):

        # Round up hidden size before creating moe_config.
        # This way moe_config is created with the correct hidden_size from the start.
+        unpadded_hidden_size = hidden_size
+        self.model_type = (
+            self.vllm_config.model_config.hf_config.model_type
+            if self.vllm_config.model_config is not None
+            else None
+        )
        hidden_size = maybe_roundup_hidden_size(
            hidden_size=hidden_size,
            act_dtype=moe_in_dtype,
            moe_parallel_config=self.moe_parallel_config,
            is_lora_enabled=vllm_config.lora_config is not None,
-            model_type=(
-                self.vllm_config.model_config.hf_config.model_type
-                if self.vllm_config.model_config is not None
-                else None
-            ),
+            model_type=self.model_type,
            is_mxfp4_quant=(
                quant_config is not None and quant_config.is_mxfp4_quant(prefix, self)
            ),
@@ -581,14 +590,27 @@ class FusedMoE(CustomOp):
            """
            quant_method = None
            if self.quant_config is not None:
+                self.opt_level = 0
                quant_method = self.quant_config.get_quant_method(self, prefix)
            if quant_method is None:
-                quant_method = UnquantizedFusedMoEMethod(self.moe_config)
+                from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+                    CompressedTensorsL1OptMoEMethod, CompressedTensorsL2OptMoEMethod)
+                if self.opt_level == 1:
+                    quant_method = CompressedTensorsL1OptMoEMethod(self.moe_config)
+                elif self.opt_level == 2:
+                    quant_method = CompressedTensorsL2OptMoEMethod(self.moe_config)
+                else:
+                    quant_method = UnquantizedFusedMoEMethod(self.moe_config)
            assert isinstance(quant_method, FusedMoEMethodBase)
            return quant_method

        # Note: get_quant_method will look at the layer's local_num_experts
        # for heuristic purposes, so it must be initialized first.
+        self.opt_level = envs.VLLM_MOE_OPT_LEVEL
+        if parse_opt_exclude_layers(envs.VLLM_OPT_EXCLUDE_LAYERS, prefix):
+            self.opt_flag = False
+            logger.info(f"Excluding layer {prefix} from optimization")
+
        self.quant_method: FusedMoEMethodBase = _get_quant_method()

        if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike():
@@ -611,6 +633,7 @@ class FusedMoE(CustomOp):
        moe_quant_params = {
            "num_experts": self.local_num_experts,
            "hidden_size": hidden_size,
+            "unpadded_hidden_size": unpadded_hidden_size,
            "intermediate_size_per_partition": self.intermediate_size_per_partition,
            "params_dtype": params_dtype,
            "weight_loader": self.weight_loader,
@@ -625,6 +648,7 @@ class FusedMoE(CustomOp):
            moe_quant_params["intermediate_size_full"] = intermediate_size

        self.quant_method.create_weights(layer=self, **moe_quant_params)
+        self.base_quant_method = self.quant_method

        # Disable shared expert overlap if:
        #   - we are using eplb with non-default backend, because of correctness issues
@@ -638,7 +662,10 @@ class FusedMoE(CustomOp):
            )
            and self._shared_experts is not None
        )
-
+        if fused_shared_output:
+            assert self.use_ep == False, "Fused shared output is only supported when EP is disabled."
+            assert shared_experts is not None, "Shared experts must be provided when fused_shared_output is True."
+        self.fused_shared_output = fused_shared_output
        self.runner = self._init_runner()

    def _init_runner(self):
@@ -655,6 +682,7 @@ class FusedMoE(CustomOp):
            quant_method=self.quant_method,
            reduce_results=self.reduce_results,
            enable_dbo=self.vllm_config.parallel_config.enable_dbo,
+            fused_shared_output=self.fused_shared_output,
        )

    # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py
@@ -681,7 +709,7 @@ class FusedMoE(CustomOp):
        # routing_tables only needed for round-robin expert placement with
        # DeepEP all2all backend.
        routing_tables = self._maybe_init_expert_routing_tables()
-        prepare_finalize = self.quant_method.maybe_make_prepare_finalize(
+        prepare_finalize = self.base_quant_method.maybe_make_prepare_finalize(
            routing_tables=routing_tables
        )
        if prepare_finalize is not None:
@@ -691,7 +719,7 @@ class FusedMoE(CustomOp):
            self._replace_quant_method(
                FusedMoEModularMethod.make(
                    self,
-                    self.quant_method,
+                    self.base_quant_method,
                    prepare_finalize,
                    self.shared_experts,
                    inplace=not self.moe_config.disable_inplace,
@@ -959,11 +987,7 @@ class FusedMoE(CustomOp):
        else:
            assert shard_id == "w3"
            expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
-        try:
-            expert_data.copy_(loaded_weight)
-        except Exception as e:
-            print(expert_data.shape, expert_data.dtype, loaded_weight.shape, loaded_weight.dtype)
-            raise e
+        expert_data.copy_(loaded_weight)

    def _load_w2(
        self,
@@ -976,7 +1000,7 @@ class FusedMoE(CustomOp):
        # Index the loaded weight for tp sharding.
        # down_proj: "RowParallel" so tp sharding on input_dim
        # Narrow parameter and load.
-        shard_size = expert_data.shape[shard_dim]
+        shard_size = loaded_weight.shape[shard_dim] // self.tp_size
        # Only narrow if the loaded_weight is not a scalar (0-dim tensor)
        # and we're not loading the full weight
        if not load_full and loaded_weight.ndim > 0:
@@ -984,7 +1008,55 @@ class FusedMoE(CustomOp):
                shard_dim, shard_size * tp_rank, shard_size
            )
        # w2, down_proj: Load into only logical weight of w2.
-        expert_data.copy_(loaded_weight)
+        expert_data.narrow(shard_dim, 0, shard_size).copy_(loaded_weight)
+
+    def _load_model_opt_weight_or_group_weight_scale(self,
+                                                     shard_dim: int,
+                                                     shard_dim_scale: int,
+                                                     expert_data: torch.Tensor,
+                                                     scale_data: torch.Tensor,
+                                                     shard_id: str,
+                                                     loaded_weight: torch.Tensor,
+                                                     tp_rank: int,
+                                                     opt_level: int,
+                                                     load_full_w2: bool = False):
+        """Feed loaded_weight to weight_quant_l1/l2, then load the returned weight and scale.
+            :param shard_dim: dimension to shard for the weight
+            :param shard_dim_scale: dimension to shard for the scale (w1/w3 branch only)
+            :param expert_data, scale_data: destinations for the returned weight and scale
+            :param shard_id: w1, w2, or w3; any other value loads nothing
+            :param loaded_weight: checkpoint weight handed to the quant helper
+            :param tp_rank: tensor parallel rank
+            :param opt_level: 1 selects weight_quant_l1, 2 selects weight_quant_l2
+            :param load_full_w2: forwarded to _load_w2 as its load_full argument
+        """
+        assert opt_level in [1, 2]
+        if opt_level == 1:
+            weight, scale = weight_quant_l1(loaded_weight)
+        else:
+            weight, scale = weight_quant_l2(loaded_weight)
+            scale = scale.view(1, -1)  # L2 scale is reshaped to a single row
+
+        if shard_id == "w2":
+            # The weight goes through _load_w2 (load_full_w2 is passed as load_full);
+            # the scale is copied into scale_data whole, with no narrowing done here.
+            self._load_w2(shard_dim=shard_dim,
+                          loaded_weight=weight,
+                          expert_data=expert_data,
+                          tp_rank=tp_rank,
+                          load_full=load_full_w2)
+            scale_data.copy_(scale)
+        elif shard_id in ("w1", "w3"):  # weight and scale are both loaded via _load_w13
+            self._load_w13(shard_id=shard_id,
+                           shard_dim=shard_dim,
+                           loaded_weight=weight,
+                           expert_data=expert_data,
+                           tp_rank=tp_rank)
+            self._load_w13(shard_id=shard_id,
+                           shard_dim=shard_dim_scale,
+                           loaded_weight=scale,
+                           expert_data=scale_data,
+                           tp_rank=tp_rank)

    def _load_single_value(
        self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int
@@ -1147,7 +1219,6 @@ class FusedMoE(CustomOp):
        shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
        if is_transposed:
            shard_dim = int(not shard_dim)
-        
        shard_dim_force = getattr(param, "shard_dim", None)
        shard_dim = shard_dim_force if shard_dim_force is not None else shard_dim

@@ -1309,13 +1380,28 @@ class FusedMoE(CustomOp):

        # Case model weights
        if "weight" in weight_name:
-            self._load_model_weight_or_group_weight_scale(
-                shard_id=shard_id,
-                shard_dim=shard_dim,
-                loaded_weight=loaded_weight,
-                expert_data=expert_data,
-                tp_rank=self.tp_rank,
-            )
+            if self.opt_level != 0:
+                scale_name = weight_name.split('.')[-1] + "_scale"
+                params_dict = dict(self.named_parameters())
+                scale_param = params_dict[scale_name]
+                shard_dim_scale = getattr(scale_param, "shard_dim", None)
+                scale_expert_data = scale_param.data if full_load else scale_param.data[expert_id]
+                self._load_model_opt_weight_or_group_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    shard_dim_scale=shard_dim_scale,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    scale_data=scale_expert_data,
+                    opt_level=self.opt_level,
+                    tp_rank=self.tp_rank)
+            else:
+                self._load_model_weight_or_group_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=self.tp_rank)
            return True if return_success else None

        return False if return_success else None