[CI] fix ci (#2464)

### What this PR does / why we need it?
1. use action/checkout@v5 instead of v4
2. remove dbo test case because there is issue with it and will be
refactored later
3. make vllm-ascend compatible with vllm v0.10.1.1 and add CI for it
4. fix sampler api changes introduced by
https://github.com/vllm-project/vllm/pull/22387
6. fix qwen3 moe config changes intruoduced by
https://github.com/vllm-project/vllm/pull/20562
7. fix kvcache block changes introduced by
https://github.com/vllm-project/vllm/pull/23262

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with existing test.


- vLLM version: v0.10.0
- vLLM main:
0c6e40bbaa

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
Mengqing Cao
2025-08-22 07:30:48 +08:00
committed by GitHub
parent 0ca3f48c90
commit b0403f8d8a
27 changed files with 389 additions and 199 deletions

View File

@@ -64,8 +64,8 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheSpec)
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds,
LogprobsTensors, ModelRunnerOutput)
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
ModelRunnerOutput)
from vllm.v1.pool.metadata import PoolingMetadata
from vllm.v1.sample.logits_processor import build_logitsprocs
from vllm.v1.sample.metadata import SamplingMetadata
@@ -95,11 +95,17 @@ from vllm_ascend.torchair.torchair_attention import AscendTorchairMetadata
from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
ProfileExecuteDuration, is_310p,
maybe_converting_weight_acl_format)
maybe_converting_weight_acl_format,
vllm_version_is)
from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
if not vllm_version_is("0.10.1.1"):
from vllm.v1.outputs import DraftTokenIds
else:
DraftTokenIds = None
if TYPE_CHECKING:
import xgrammar as xgr # type: ignore[import-untyped]
from vllm.v1.core.sched.output import SchedulerOutput
@@ -514,11 +520,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# Update the block IDs.
if not resumed_from_preemption:
# Append the new blocks to the existing block IDs.
for block_ids, new_ids in zip(req_state.block_ids,
new_block_ids):
block_ids.extend(new_ids)
if new_block_ids is not None:
# Append the new blocks to the existing block IDs.
for block_ids, new_ids in zip(req_state.block_ids,
new_block_ids):
block_ids.extend(new_ids)
else:
assert new_block_ids is not None
# The request is resumed from preemption.
# Replace the existing block IDs with the new ones.
req_state.block_ids = new_block_ids
@@ -534,7 +542,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# Update the persistent batch.
self.input_batch.num_computed_tokens_cpu[req_index] = (
num_computed_tokens)
self.input_batch.block_table.append_row(new_block_ids, req_index)
if new_block_ids is not None:
self.input_batch.block_table.append_row(
new_block_ids, req_index)
# For the last rank, we don't need to update the token_ids_cpu
# because the sampled tokens are already cached.
@@ -1526,16 +1536,28 @@ class NPUModelRunner(LoRAModelRunnerMixin):
else:
pooler_output.append(None)
extra_args = ({"kv_connector_output": kv_connector_output})
return ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
sampled_token_ids=[],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=pooler_output,
**extra_args,
)
if vllm_version_is("0.10.1.1"):
modelrunner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
sampled_token_ids=[],
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=pooler_output,
**extra_args,
)
else:
modelrunner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
sampled_token_ids=[],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=pooler_output,
**extra_args,
)
return modelrunner_output
@torch.inference_mode()
def execute_model(
@@ -1757,15 +1779,27 @@ class NPUModelRunner(LoRAModelRunnerMixin):
extra_args = ({"kv_connector_output": kv_connector_output})
model_runner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
sampled_token_ids=valid_sampled_token_ids,
logprobs=logprobs_lists,
prompt_logprobs_dict=prompt_logprobs_dict,
pooler_output=[],
**extra_args,
)
if vllm_version_is("0.10.1.1"):
model_runner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
sampled_token_ids=valid_sampled_token_ids,
logprobs=logprobs_lists,
spec_token_ids=self._draft_token_ids,
prompt_logprobs_dict=prompt_logprobs_dict,
pooler_output=[],
**extra_args,
)
else:
model_runner_output = ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
sampled_token_ids=valid_sampled_token_ids,
logprobs=logprobs_lists,
prompt_logprobs_dict=prompt_logprobs_dict,
pooler_output=[],
**extra_args,
)
durations = ProfileExecuteDuration().pop_captured_sync()
if durations: