Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -7,6 +7,7 @@ from typing import Any

 import gguf
 import torch
+import torch.nn.functional as F
 from gguf import GGMLQuantizationType as WeightType
 from torch.nn.parameter import Parameter, UninitializedParameter

@@ -234,7 +235,7 @@ try:
        op_func=_fused_mul_mat_gguf,
        fake_impl=_fused_mul_mat_gguf_fake,
    )
-    fused_mul_mat_gguf = torch.ops.vllm._fused_mul_mat_gguf
+    fused_mul_mat_gguf = _fused_mul_mat_gguf

 except AttributeError as error:
    raise error
@@ -365,7 +366,7 @@ try:
        op_func=_fused_moe_gguf,
        fake_impl=_fused_moe_gguf_fake,
    )
-    fused_moe_gguf = torch.ops.vllm._fused_moe_gguf
+    fused_moe_gguf = _fused_moe_gguf

 except AttributeError as error:
    raise error
@@ -410,7 +411,7 @@ try:
        op_func=_apply_gguf_embedding,
        fake_impl=_apply_gguf_embedding_fake,
    )
-    apply_gguf_embedding = torch.ops.vllm._apply_gguf_embedding
+    apply_gguf_embedding = _apply_gguf_embedding

 except AttributeError as error:
    raise error
@@ -451,6 +452,9 @@ class GGUFLinearMethod(LinearMethodBase):
                "data_container": [],
                "shard_id": [],
                "shard_id_map": {},
+                "params_dtype": params_dtype,
+                "input_size_per_partition" :input_size_per_partition, # needed to restore shapes for fused QKV and merged layers
+                "output_partition_sizes" :output_partition_sizes,
            },
        )
        set_weight_attrs(qweight, extra_weight_attrs)
@@ -664,6 +668,10 @@ class GGUFEmbeddingMethod(GGUFLinearMethod):
    """

    def embedding(self, layer: torch.nn.Module, x: torch.Tensor) -> torch.Tensor:
+        weight = layer.weight
+        return F.embedding(x, weight)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        qweight = layer.qweight
        qweight_type = layer.qweight_type.weight_type
        hidden_size = qweight.tensor_shape[1]