[CI] upgrade to vllm 0.9.0 (#959)
Upgrade to vllm 0.9.0. 0.8.5 will not be supported any more. Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -30,7 +30,6 @@ from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.worker.gpu_input_batch import InputBatch
|
||||
|
||||
from vllm_ascend.ops.attention import vanilla_chunked_prefill
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class AscendAttentionBackend(AttentionBackend):
|
||||
@@ -142,14 +141,11 @@ class AscendAttentionMetadataBuilder:
|
||||
|
||||
def build(self, num_reqs, num_actual_tokens, max_query_len,
|
||||
common_prefix_len):
|
||||
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
|
||||
block_table = (self.runner.input_batch.block_table.
|
||||
get_device_tensor()[:num_reqs])
|
||||
else:
|
||||
block_table = self.runner.input_batch.block_table[
|
||||
0].get_device_tensor()
|
||||
block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
|
||||
block_table[:num_reqs])
|
||||
|
||||
block_table = self.runner.input_batch.block_table[0].get_device_tensor(
|
||||
)
|
||||
block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
|
||||
block_table[:num_reqs])
|
||||
|
||||
query_lens = self.runner.query_lens
|
||||
seq_lens = self.runner.seq_lens_cpu[:num_reqs]
|
||||
|
||||
@@ -16,7 +16,6 @@ from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
|
||||
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -239,14 +238,11 @@ class AscendMLAMetadataBuilder:
|
||||
# function. We should avoid GPU -> CPU sync as much as possible because
|
||||
# it blocks on all previous kernels.
|
||||
device = self.runner.device
|
||||
if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
|
||||
block_table = (self.runner.input_batch.block_table.
|
||||
get_device_tensor()[:num_reqs])
|
||||
else:
|
||||
block_table = self.runner.input_batch.block_table[
|
||||
0].get_device_tensor()
|
||||
block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
|
||||
block_table[:num_reqs])
|
||||
|
||||
block_table = self.runner.input_batch.block_table[0].get_device_tensor(
|
||||
)
|
||||
block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
|
||||
block_table[:num_reqs])
|
||||
slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
|
||||
device, non_blocking=True)
|
||||
input_positions = self.runner.positions_cpu[:num_actual_tokens].to(
|
||||
|
||||
Reference in New Issue
Block a user