Upgrade to vllm 0.17.0 corex v4.1 overlay

commit 938d0854a5
parent 8fac6062e4
Date: 2026-04-29 19:38:22 +08:00

430 changed files with 35969 additions and 14511 deletions


@@ -7,7 +7,7 @@ import torch
import torch.distributed as dist
from torch import nn
from transformers import GptOssConfig
+import vllm.envs as envs
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (
@@ -23,7 +23,11 @@ from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
+from vllm.model_executor.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import OCP_MX_BLOCK_SIZE
@@ -42,6 +46,7 @@ from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backend import AttentionType
+from vllm.model_executor.model_loader import padding_weight_loader
from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
from .utils import (
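The newly imported padding_weight_loader is the copy primitive used by the new handle_weight helper further down. Its exact behavior is corex-specific; as a rough sketch of what a padding-style loader typically does (an assumption for illustration, not the actual implementation):

import torch

def padding_weight_loader_sketch(param: torch.nn.Parameter, weight: torch.Tensor) -> None:
    # Hypothetical: copy the checkpoint tensor into a possibly larger, padded
    # parameter, zero-filling whatever tail the checkpoint does not cover.
    param.data.zero_()
    idx = tuple(slice(0, s) for s in weight.shape)
    param.data[idx].copy_(weight)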
@@ -107,7 +112,6 @@ class OAIAttention(nn.Module):
            quant_config=quant_config,
            prefix=f"{prefix}.qkv_proj",
        )
        self.o_proj = RowParallelLinear(
            input_size=self.num_attention_heads * self.head_dim,
            output_size=self.hidden_size,
@@ -165,7 +169,14 @@ class MLPBlock(torch.nn.Module):
        self.hidden_size = config.hidden_size
        self.experts_per_token = config.num_experts_per_tok
        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
-        self.router = torch.nn.Linear(config.hidden_size, config.num_local_experts)
+        self.router = ReplicatedLinear(
+            config.hidden_size,
+            config.num_local_experts,
+            bias=True,
+            quant_config=None,
+            prefix=f"{prefix}.router",
+            return_bias=False,
+        )
        assert config.intermediate_size % self.world_size == 0
        self.experts = FusedMoE(
            num_experts=config.num_local_experts,
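The switch from a bare torch.nn.Linear to ReplicatedLinear follows the usual vLLM pattern for MoE gates: every TP rank needs the full vector of expert logits to do top-k routing, so the router is replicated rather than sharded (and return_bias=False makes it return just the output tensor). A minimal sketch of the routing step, with made-up sizes:

import torch

hidden_size, num_experts, top_k = 2880, 32, 4       # made-up sizes
router = torch.nn.Linear(hidden_size, num_experts)  # replicated on every rank

x = torch.randn(8, hidden_size)                     # (num_tokens, hidden_size)
logits = router(x)                                  # full expert logits on each rank
weights, expert_ids = torch.topk(logits, top_k)     # every rank picks the same experts
weights = torch.softmax(weights, dim=-1)            # renormalize over chosen experts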
@@ -969,8 +980,18 @@ class GptOssModel(nn.Module):
        weights: Iterable[tuple[str, torch.Tensor]],
        stacked_params_mapping: list[tuple[str, ...]],
    ) -> set[str]:
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
+        def handle_weight(name, weight, param_name, permute_dims=None, slice_dims=None, contiguous=True):
+            """Helper function to handle weight loading with optional slicing and permutation."""
+            param = params_dict[param_name]
+            if slice_dims:
+                weight = weight[slice_dims]
+            if permute_dims:
+                weight = weight.permute(*permute_dims)
+            if contiguous:
+                weight = weight.contiguous()
+            padding_weight_loader(param, weight)
+            loaded_params.add(param_name)

        use_ep = self.parallel_config.enable_expert_parallel
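For orientation, this is roughly what the helper's slice-then-permute path does when the branches below call it (tensor shapes here are stand-ins, not the real checkpoint layout):

import torch

weight = torch.randn(4, 64, 32)                     # (experts, hidden, 2*intermediate), made up
tp_rank_start, tp_rank_end = 0, 8                   # this rank's slice of the intermediate dim
slice_dims = (slice(None), slice(None), slice(2 * tp_rank_start, 2 * tp_rank_end))
sliced = weight[slice_dims]                         # (4, 64, 16)
ready = sliced.permute(0, 2, 1).contiguous()        # (4, 16, 64): transpose to the kernel layout
# handle_weight then hands the result to padding_weight_loader for the matching parameter.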
@@ -986,91 +1007,71 @@ class GptOssModel(nn.Module):
        intermediate_size = self.config.intermediate_size
        per_rank_intermediate_size = cdiv(intermediate_size, tp_size)
        # Calculate common slicing bounds for current rank
        tp_rank_start = tp_rank * per_rank_intermediate_size
        tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, intermediate_size)
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        pack_factor = 2 if envs.VLLM_W8A8_MOE_USE_W4A8 else 1
+        w4a8_flag = envs.VLLM_W8A8_MOE_USE_W4A8
+        gemm_format = envs.VLLM_W8A8_FORMAT
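A quick worked example of these bounds, with made-up sizes: with intermediate_size = 2880 and tp_size = 4, cdiv gives 720 columns per rank, so rank 2 owns [1440, 2160); under W4A8 two int4 weights are packed per stored int8 byte, which is why the w2 slice below divides the bounds by pack_factor:

intermediate_size, tp_size, tp_rank = 2880, 4, 2    # made-up sizes
per_rank = -(-intermediate_size // tp_size)         # cdiv -> 720
start = tp_rank * per_rank                          # 1440
end = min((tp_rank + 1) * per_rank, intermediate_size)   # 2160
pack_factor = 2                                     # W4A8: two int4 weights per int8 byte
packed = (start // pack_factor, end // pack_factor)      # (720, 1080) in packed storage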
        for name, weight in weights:
            # Skip layers on other devices.
            if is_pp_missing_parameter(name, self):
                continue
if ".w13_weight" in name:
# Handle MLP gate and up projection weights
# Extract gate and up projection parts
if use_ep:
narrow_weight = weight[ep_rank_start:ep_rank_end, ...]
else:
narrow_weight = weight[:, :, 2 * tp_rank_start : 2 * tp_rank_end]
narrow_weight = narrow_weight.permute(0, 2, 1).contiguous()
if ".experts.w13_weight" in name and "scale" not in name and "bias" not in name:
slice_dims = (slice(ep_rank_start, ep_rank_end), ...) if use_ep else (slice(None), slice(None), slice(2 * tp_rank_start, 2 * tp_rank_end))
permute_dims = None if gemm_format == "NN" else (0, 2, 1)
handle_weight(name, weight, name, permute_dims=permute_dims, slice_dims=slice_dims)
elif ".experts.w2_weight" in name and "scale" not in name and "bias" not in name:
slice_dims = (slice(ep_rank_start, ep_rank_end), ...) if use_ep else (slice(None), slice(tp_rank_start // pack_factor, tp_rank_end // pack_factor), slice(None))
permute_dims = None if gemm_format == "NN" else (0, 2, 1)
handle_weight(name, weight, name, permute_dims=permute_dims, slice_dims=slice_dims)
elif ".experts.gate_up_proj_scale" in name:
new_name = name.replace("gate_up_proj_scale", "w13_weight_scale")
slice_dims = (slice(ep_rank_start, ep_rank_end), ...) if use_ep else (slice(None), slice(None), slice(2 * tp_rank_start, 2 * tp_rank_end))
permute_dims = None if w4a8_flag else (0, 2, 1)
handle_weight(name, weight, new_name, permute_dims=permute_dims, slice_dims=slice_dims, contiguous=w4a8_flag)
elif ".experts.down_proj_scale" in name:
new_name = name.replace("down_proj_scale", "w2_weight_scale")
slice_dims = (slice(ep_rank_start, ep_rank_end), ...) if use_ep else None
permute_dims = None if w4a8_flag else (0, 2, 1)
handle_weight(name, weight, new_name, permute_dims=permute_dims, slice_dims=slice_dims, contiguous=w4a8_flag)
elif ".experts.w13_bias" in name:
slice_dims = (slice(ep_rank_start, ep_rank_end), ...) if use_ep else (slice(None), slice(2 * tp_rank_start, 2 * tp_rank_end))
handle_weight(name, weight, name, slice_dims=slice_dims, contiguous=False)
elif ".experts.w2_bias" in name:
param = params_dict[name]
param.copy_(narrow_weight)
loaded_params.add(name)
continue
elif ".w2_weight" in name:
# Handle MLP down projection weights
if use_ep:
narrow_weight = weight[ep_rank_start:ep_rank_end, ...]
else:
narrow_weight = weight[:, tp_rank_start:tp_rank_end, :]
narrow_weight = narrow_weight.permute(0, 2, 1).contiguous()
param = params_dict[name]
param.copy_(narrow_weight)
loaded_params.add(name)
continue
elif ".w13_bias" in name:
# Handle MLP gate and up projection biases
# Extract gate and up projection bias parts
if use_ep:
narrow_weight = weight[ep_rank_start:ep_rank_end, ...]
else:
narrow_weight = weight[:, 2 * tp_rank_start : 2 * tp_rank_end]
param = params_dict[name]
param.copy_(narrow_weight)
loaded_params.add(name)
continue
elif ".w2_bias" in name:
# Handle MLP down projection bias
if use_ep:
weight = weight[ep_rank_start:ep_rank_end, ...]
else:
# (only load on rank 0 to avoid duplication)
if tp_rank != 0:
weight.zero_()
param = params_dict[name]
param.copy_(weight)
elif tp_rank != 0:
weight.zero_()
param.data.copy_(weight)
loaded_params.add(name)
continue
elif "sinks" in name:
# Handle attention sinks (distributed across ranks)
name = name.replace("self_attn", "attn")
param = params_dict[name]
narrow_weight = weight.narrow(0, head_start, heads_per_rank)
param.data.copy_(narrow_weight)
loaded_params.add(name)
continue
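The sinks tensor is per-head, so each TP rank keeps only its contiguous block of heads; torch.narrow(dim, start, length) is a view-based slice along that dimension. A quick illustration with made-up head counts:

import torch

num_heads, tp_size, tp_rank = 64, 4, 1
heads_per_rank = num_heads // tp_size                      # 16
head_start = tp_rank * heads_per_rank                      # 16
sinks = torch.randn(num_heads)
local_sinks = sinks.narrow(0, head_start, heads_per_rank)  # heads 16..31 on this rank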
-            for param_name, weight_name, shard_id in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                if weight_loader == default_weight_loader:
-                    weight_loader(param, weight)
-                else:
-                    weight_loader(param, weight, shard_id)
-                break
+            elif ("q_proj" in name or "k_proj" in name or "v_proj" in name):
+                shard_id = ("q" if "q_proj" in name else "k" if "k_proj" in name else "v")
+                name = name.replace("self_attn", "attn")
+                param_name = name.replace(f"{shard_id}_proj", "qkv_proj")
+                param = params_dict[param_name]
+                weight_loader = param.weight_loader
+                weight_loader(param, weight, loaded_shard_id=shard_id)
+                loaded_params.add(param_name)
            else:
                # Handle all other weights with potential renaming
                if name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, weight)
                loaded_params.add(name)
        return loaded_params

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
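One subtlety in the w2_bias branch above: w2 is the row-parallel down projection, so each TP rank produces a partial matmul output and the partials are summed by an all-reduce. If every rank also added the bias, it would be counted tp_size times; zeroing it on every rank except rank 0 keeps the reduced sum correct. A toy check with made-up sizes:

import torch

tp_size, tokens, hidden = 4, 8, 16                  # made-up sizes
partials = [torch.randn(tokens, hidden) for _ in range(tp_size)]
bias = torch.randn(hidden)

correct = sum(partials) + bias                      # bias added exactly once (rank 0 only)
naive = sum(p + bias for p in partials)             # every rank adds the bias
assert torch.allclose(naive - correct, (tp_size - 1) * bias.expand(tokens, hidden), atol=1e-5)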