[Version] Drop 0.16.0 support (#7153)
### What this PR does / why we need it?
Drop 0.16.0 support in main
- Fix the eagle proposer breakage introduced by
https://github.com/vllm-project/vllm/pull/34552. The main change is to use
the draft attention group to initialize the attention metadata builder.
- Fix the `ModelRunner` has no attribute `cudagraph_capture_sizes`
error, which is a bug in vLLM v0.17.0 that was fixed by the later PR
https://github.com/vllm-project/vllm/pull/30515
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
@@ -21,14 +21,7 @@ import vllm
|
||||
from vllm.v1.worker.gpu.attn_utils import build_slot_mappings_by_layer
|
||||
from vllm.v1.worker.gpu.input_batch import InputBatch
|
||||
from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("v0.16.0"):
|
||||
from vllm.v1.worker.gpu.spec_decode.eagle import prepare_eagle_decode, prepare_eagle_inputs
|
||||
else:
|
||||
from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs
|
||||
|
||||
from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs
|
||||
|
||||
from vllm_ascend.worker.v2.attn_utils import build_attn_metadata
|
||||
|
||||
@@ -175,7 +168,4 @@ def propose(
|
||||
return self.draft_tokens[:num_reqs]
|
||||
|
||||
|
||||
if vllm_version_is("v0.16.0"):
|
||||
vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose = propose
|
||||
else:
|
||||
vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose
|
||||
vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose
|
||||
|
||||
Reference in New Issue
Block a user