[Main2Main] Upgrade vLLM to 0226 (#6813)

### What this PR does / why we need it?

Breaking changes in upstream vLLM that this PR adapts to:
1. https://github.com/vllm-project/vllm/pull/33452
2. https://github.com/vllm-project/vllm/pull/33451
3. https://github.com/vllm-project/vllm/pull/32567
4. https://github.com/vllm-project/vllm/pull/32344

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: 83b47f67b1

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
Co-authored-by: MrZ20 <2609716663@qq.com>
Commit: e4458b2d2b (parent: 80316c5824)
Author: Canlin Guo
Date: 2026-02-27 16:05:21 +08:00 (committed by GitHub)
40 changed files with 117 additions and 184 deletions


@@ -117,6 +117,18 @@ class AscendLinearMethod(LinearMethodBase):
         if hasattr(self.quant_method, "process_weights_after_loading"):
             self.quant_method.process_weights_after_loading(layer)
 
+    def get_computed_params(self) -> set[str]:
+        """Return parameter name patterns that are computed, not loaded.
+
+        These parameters are computed during process_weights_after_loading
+        rather than loaded from checkpoint:
+        - weight_offset: Zero for symmetric quantization
+        - quant_bias: Computed from weight statistics
+        - deq_scale: Computed as input_scale * weight_scale
+        - weight_scale: May be computed or have default values for some models
+        """
+        return {"weight_offset", "quant_bias", "deq_scale", "weight_scale"}
+
     def apply(
         self,
         layer: torch.nn.Module,
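
For context, the sketch below (an assumption, not code from this PR) shows how a weight loader might consume `get_computed_params()` so that tensors such as `deq_scale` are not flagged as missing checkpoint weights; the helper and all parameter names are hypothetical.

```python
from typing import Iterable

# Hypothetical helper (not part of this PR): drop parameters that are
# computed in process_weights_after_loading so they are not reported as
# missing checkpoint weights.
def filter_missing(expected: Iterable[str], loaded: set[str],
                   computed_patterns: set[str]) -> list[str]:
    return [
        name for name in expected
        if name not in loaded
        and not any(pattern in name for pattern in computed_patterns)
    ]

# deq_scale is computed as input_scale * weight_scale after loading,
# so it is not treated as missing even though the checkpoint lacks it.
computed = {"weight_offset", "quant_bias", "deq_scale", "weight_scale"}
print(filter_missing(
    expected=["layers.0.qkv_proj.weight", "layers.0.qkv_proj.deq_scale"],
    loaded={"layers.0.qkv_proj.weight"},
    computed_patterns=computed,
))  # prints []
```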


@@ -401,12 +401,7 @@ class AscendModelSlimConfig(QuantizationConfig):
             self.packed_modules_mapping = packed_modules_model_mapping[model_type]
         prefix = self.quant_prefix_mapper(model_type, prefix)
-        from vllm_ascend.utils import vllm_version_is
-        if vllm_version_is("v0.15.0"):
-            from vllm.attention.layer import Attention  # type: ignore
-        else:
-            from vllm.model_executor.layers.attention import Attention
+        from vllm.model_executor.layers.attention import Attention
 
         if model_type != "kimi_k2":
             if prefix.startswith("language_model"):
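
As an aside, a `try`/`except` import guard (shown below as an illustrative sketch, not what this PR does) would be the usual alternative for spanning both the old and new locations of `Attention`; this PR instead drops the `vllm_version_is` branch entirely, since the tracked vLLM main provides the new path.

```python
# Illustrative alternative only (not what this PR does): an import guard that
# tolerates both Attention locations when older vLLM releases must be supported.
try:
    from vllm.model_executor.layers.attention import Attention  # newer vLLM
except ImportError:  # pre-relocation vLLM releases
    from vllm.attention.layer import Attention
```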