[Version] Drop 0.16.0 support (#7153)
### What this PR does / why we need it?
Drop 0.16.0 support in main
- Fix the eagle proposer breakage introduced by
https://github.com/vllm-project/vllm/pull/34552. The main change is to use
the draft attention group to initialize the attention metadata builder.
- Fix the "`ModelRunner` has no attribute `cudagraph_capture_sizes`"
error, which is a bug in vLLM v0.17.0 and is fixed by a later PR:
https://github.com/vllm-project/vllm/pull/30515
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
@@ -17,13 +17,9 @@
 from vllm.triton_utils import HAS_TRITON
 
-from vllm_ascend.utils import vllm_version_is
-
 if HAS_TRITON:
     import vllm_ascend.patch.worker.patch_triton
 
-if not vllm_version_is("v0.16.0"):
-    import vllm_ascend.patch.worker.patch_qwen3_5  # noqa
 
 # isort: off
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
||||
@@ -35,6 +31,7 @@ import vllm_ascend.patch.worker.patch_minimax_m2_linear_attn  # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next  # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next_mtp  # noqa
+import vllm_ascend.patch.worker.patch_qwen3_5  # noqa
 import vllm_ascend.patch.worker.patch_rejection_sampler  # noqa
 import vllm_ascend.patch.worker.patch_v2_eagle  # noqa
 import vllm_ascend.patch.worker.patch_v2_uva  # noqa
||||
@@ -21,14 +21,7 @@ import vllm
 from vllm.v1.worker.gpu.attn_utils import build_slot_mappings_by_layer
 from vllm.v1.worker.gpu.input_batch import InputBatch
 from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
-
-from vllm_ascend.utils import vllm_version_is
-
-if vllm_version_is("v0.16.0"):
-    from vllm.v1.worker.gpu.spec_decode.eagle import prepare_eagle_decode, prepare_eagle_inputs
-else:
-    from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs
-
+from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs
 
 from vllm_ascend.worker.v2.attn_utils import build_attn_metadata
@@ -175,7 +168,4 @@ def propose(
     return self.draft_tokens[:num_reqs]
 
 
-if vllm_version_is("v0.16.0"):
-    vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose = propose
-else:
-    vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose
+vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose
||||
Reference in New Issue
Block a user