[Main2Main] Upgrade vLLM to 0226 (#6813)

### What this PR does / why we need it?

Breaking upstream changes this PR adapts to:
1. https://github.com/vllm-project/vllm/pull/33452
2. https://github.com/vllm-project/vllm/pull/33451
3. https://github.com/vllm-project/vllm/pull/32567
4. https://github.com/vllm-project/vllm/pull/32344

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: 83b47f67b1

---------

Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
Co-authored-by: MrZ20 <2609716663@qq.com>
This commit is contained in:
Canlin Guo
2026-02-27 16:05:21 +08:00
committed by GitHub
parent 80316c5824
commit e4458b2d2b
40 changed files with 117 additions and 184 deletions

View File

@@ -19,11 +19,6 @@ import os
import vllm_ascend.patch.platform.patch_distributed # noqa
import vllm_ascend.patch.platform.patch_mamba_config # noqa
import vllm_ascend.patch.platform.patch_sched_yield # noqa
from vllm_ascend import envs
from vllm_ascend.utils import vllm_version_is
if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
import vllm_ascend.patch.platform.patch_multiproc_executor # noqa
if envs.VLLM_ASCEND_BALANCE_SCHEDULING and vllm_version_is("0.15.0"):
import vllm_ascend.patch.platform.patch_balance_schedule # noqa

View File

@@ -1,14 +1,8 @@
import torch
import vllm.v1.worker.utils as utils
from vllm.model_executor.layers.attention import Attention
from vllm.v1.worker.utils import defaultdict, extract_layer_index
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("v0.15.0"):
from vllm.attention.layer import Attention # type: ignore
else:
from vllm.model_executor.layers.attention import Attention
# Without this patch, it will raise an exception when initialize kv_cache.
# TODO To remove the patch, we need check why the original bind_kv_cache raises an NotImplementedError.

View File

@@ -21,7 +21,14 @@ import vllm
from vllm.v1.worker.gpu.attn_utils import build_slot_mappings_by_layer
from vllm.v1.worker.gpu.input_batch import InputBatch
from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
from vllm.v1.worker.gpu.spec_decode.eagle import prepare_eagle_decode, prepare_eagle_inputs
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("v0.16.0"):
from vllm.v1.worker.gpu.spec_decode.eagle import prepare_eagle_decode, prepare_eagle_inputs
else:
from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs
from vllm_ascend.worker.v2.attn_utils import build_attn_metadata
@@ -168,4 +175,7 @@ def propose(
return self.draft_tokens[:num_reqs]
vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose = propose
if vllm_version_is("v0.16.0"):
vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose = propose
else:
vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose