[Bugfix][V1] Fix deepseek with v1 (#958)

### What this PR does / why we need it?
Fix DeepSeek with v1. This error was introduced by
https://github.com/vllm-project/vllm-ascend/pull/945, and this PR fixes
the block table of MLA.

### How was this patch tested?
CI passed with the newly added test.

Signed-off-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
Mengqing Cao
2025-05-29 11:57:43 +08:00
committed by GitHub
parent e3c7f71462
commit cc74b97f74
2 changed files with 2 additions and 7 deletions

View File

@@ -22,7 +22,6 @@ Run `pytest tests/test_offline_inference.py`.
"""
import os
import pytest
import vllm # noqa: F401
from tests.conftest import VllmRunner
@@ -47,8 +46,6 @@ def test_models_distributed_QwQ():
vllm_model.generate_greedy(example_prompts, max_tokens)
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "1",
reason="deepseek v2 lite is not supported on v1")
def test_models_distributed_DeepSeek():
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",

View File

@@ -239,10 +239,8 @@ class AscendMLAMetadataBuilder:
# it blocks on all previous kernels.
device = self.runner.device
block_table = self.runner.input_batch.block_table[0].get_device_tensor(
)
block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
block_table[:num_reqs])
block_table = (self.runner.input_batch.block_table[0].
get_device_tensor()[:num_reqs])
slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
device, non_blocking=True)
input_positions = self.runner.positions_cpu[:num_actual_tokens].to(