[Main2Main] Upgrade vLLM to 0226 (#6813)

### What this PR does / why we need it?

Breaking changes in upstream vLLM that this PR adapts to:
1. https://github.com/vllm-project/vllm/pull/33452
2. https://github.com/vllm-project/vllm/pull/33451
3. https://github.com/vllm-project/vllm/pull/32567
4. https://github.com/vllm-project/vllm/pull/32344

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: 83b47f67b1

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
Co-authored-by: MrZ20 <2609716663@qq.com>
Commit: e4458b2d2b (parent: 80316c5824)
Author: Canlin Guo
Date: 2026-02-27 16:05:21 +08:00 (committed by GitHub)
40 changed files with 117 additions and 184 deletions


@@ -117,6 +117,18 @@ class AscendLinearMethod(LinearMethodBase):
         if hasattr(self.quant_method, "process_weights_after_loading"):
             self.quant_method.process_weights_after_loading(layer)
 
+    def get_computed_params(self) -> set[str]:
+        """Return parameter name patterns that are computed, not loaded.
+
+        These parameters are computed during process_weights_after_loading
+        rather than loaded from checkpoint:
+        - weight_offset: Zero for symmetric quantization
+        - quant_bias: Computed from weight statistics
+        - deq_scale: Computed as input_scale * weight_scale
+        - weight_scale: May be computed or have default values for some models
+        """
+        return {"weight_offset", "quant_bias", "deq_scale", "weight_scale"}
+
     def apply(
         self,
         layer: torch.nn.Module,
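
For context, the sketch below (an assumption, not code from this PR) shows how a weight loader might consume `get_computed_params()` so that tensors such as `deq_scale` are not flagged as missing checkpoint weights; the helper and all parameter names are hypothetical.

```python
from typing import Iterable

# Hypothetical helper (not part of this PR): drop parameters that are
# computed in process_weights_after_loading so they are not reported as
# missing checkpoint weights.
def filter_missing(expected: Iterable[str], loaded: set[str],
                   computed_patterns: set[str]) -> list[str]:
    return [
        name for name in expected
        if name not in loaded
        and not any(pattern in name for pattern in computed_patterns)
    ]

# deq_scale is computed as input_scale * weight_scale after loading,
# so it is not treated as missing even though the checkpoint lacks it.
computed = {"weight_offset", "quant_bias", "deq_scale", "weight_scale"}
print(filter_missing(
    expected=["layers.0.qkv_proj.weight", "layers.0.qkv_proj.deq_scale"],
    loaded={"layers.0.qkv_proj.weight"},
    computed_patterns=computed,
))  # prints []
```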


@@ -401,12 +401,7 @@ class AscendModelSlimConfig(QuantizationConfig):
             self.packed_modules_mapping = packed_modules_model_mapping[model_type]
         prefix = self.quant_prefix_mapper(model_type, prefix)
-        from vllm_ascend.utils import vllm_version_is
-        if vllm_version_is("v0.15.0"):
-            from vllm.attention.layer import Attention  # type: ignore
-        else:
-            from vllm.model_executor.layers.attention import Attention
+        from vllm.model_executor.layers.attention import Attention
 
         if model_type != "kimi_k2":
             if prefix.startswith("language_model"):
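
As an aside, a `try`/`except` import guard (shown below as an illustrative sketch, not what this PR does) would be the usual alternative for spanning both the old and new locations of `Attention`; this PR instead drops the `vllm_version_is` branch entirely, since the tracked vLLM main provides the new path.

```python
# Illustrative alternative only (not what this PR does): an import guard that
# tolerates both Attention locations when older vLLM releases must be supported.
try:
    from vllm.model_executor.layers.attention import Attention  # newer vLLM
except ImportError:  # pre-relocation vLLM releases
    from vllm.attention.layer import Attention
```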