[CI] fix ci (#2464)
### What this PR does / why we need it?
1. use action/checkout@v5 instead of v4
2. remove dbo test case because there is issue with it and will be
refactored later
3. make vllm-ascend compatible with vllm v0.10.1.1 and add CI for it
4. fix sampler api changes introduced by
https://github.com/vllm-project/vllm/pull/22387
5. fix qwen3 moe config changes introduced by
https://github.com/vllm-project/vllm/pull/20562
6. fix kvcache block changes introduced by
https://github.com/vllm-project/vllm/pull/23262
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
CI passed with existing tests.
- vLLM version: v0.10.0
- vLLM main:
0c6e40bbaa
---------
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
@@ -31,6 +31,13 @@ from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.10.1.1"):
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
|
||||
else:
|
||||
KVCacheBlocks = None
|
||||
|
||||
|
||||
class AscendScheduler(Scheduler):
|
||||
"""This Scheduler extends vllm's original v1 scheduler
|
||||
@@ -59,7 +66,10 @@ class AscendScheduler(Scheduler):
|
||||
scheduled_running_reqs: list[Request] = []
|
||||
preempted_reqs: list[Request] = []
|
||||
|
||||
req_to_new_block_ids: dict[str, list[int]] = {}
|
||||
if vllm_version_is("0.10.1.1"):
|
||||
req_to_new_block_ids: dict[str, list[int]] = {}
|
||||
else:
|
||||
req_to_new_blocks: dict[str, KVCacheBlocks] = {}
|
||||
num_scheduled_tokens: dict[str, int] = {}
|
||||
token_budget = self.max_num_scheduled_tokens
|
||||
# Spec decode-related.
|
||||
@@ -217,8 +227,11 @@ class AscendScheduler(Scheduler):
|
||||
|
||||
if self.lora_config and request.lora_request:
|
||||
scheduled_loras.add(request.lora_request.lora_int_id)
|
||||
req_to_new_block_ids[request.request_id] = (
|
||||
self.kv_cache_manager.get_block_ids(request.request_id))
|
||||
if vllm_version_is("0.10.1.1"):
|
||||
req_to_new_block_ids[request.request_id] = (
|
||||
self.kv_cache_manager.get_block_ids(request.request_id))
|
||||
else:
|
||||
req_to_new_blocks[request.request_id] = new_blocks
|
||||
# Update request info.
|
||||
num_scheduled_tokens[request.request_id] = num_new_tokens
|
||||
token_budget -= num_new_tokens
|
||||
@@ -307,8 +320,11 @@ class AscendScheduler(Scheduler):
|
||||
# Schedule the request.
|
||||
scheduled_running_reqs.append(request)
|
||||
self.scheduled_req_ids.add(request.request_id)
|
||||
req_to_new_block_ids[request.request_id] = (
|
||||
new_blocks.get_block_ids())
|
||||
if vllm_version_is("0.10.1.1"):
|
||||
req_to_new_block_ids[request.request_id] = (
|
||||
new_blocks.get_block_ids())
|
||||
else:
|
||||
req_to_new_blocks[request.request_id] = new_blocks
|
||||
num_scheduled_tokens[request.request_id] = num_new_tokens
|
||||
token_budget -= num_new_tokens
|
||||
req_index += 1
|
||||
@@ -346,16 +362,27 @@ class AscendScheduler(Scheduler):
|
||||
any_request, len(self.running)))
|
||||
|
||||
# Construct the scheduler output.
|
||||
new_reqs_data = [
|
||||
NewRequestData.from_request(req,
|
||||
req_to_new_block_ids[req.request_id])
|
||||
for req in scheduled_new_reqs
|
||||
]
|
||||
if vllm_version_is("0.10.1.1"):
|
||||
new_reqs_data = [
|
||||
NewRequestData.from_request(
|
||||
req, req_to_new_block_ids[req.request_id])
|
||||
for req in scheduled_new_reqs
|
||||
]
|
||||
cached_reqs_data = self._make_cached_request_data(
|
||||
scheduled_running_reqs, scheduled_resumed_reqs,
|
||||
num_scheduled_tokens, scheduled_spec_decode_tokens,
|
||||
req_to_new_block_ids)
|
||||
else:
|
||||
new_reqs_data = [
|
||||
NewRequestData.from_request(
|
||||
req, req_to_new_blocks[req.request_id].get_block_ids())
|
||||
for req in scheduled_new_reqs
|
||||
]
|
||||
|
||||
cached_reqs_data = self._make_cached_request_data(
|
||||
scheduled_running_reqs, scheduled_resumed_reqs,
|
||||
num_scheduled_tokens, scheduled_spec_decode_tokens,
|
||||
req_to_new_block_ids)
|
||||
cached_reqs_data = self._make_cached_request_data(
|
||||
scheduled_running_reqs, scheduled_resumed_reqs,
|
||||
num_scheduled_tokens, scheduled_spec_decode_tokens,
|
||||
req_to_new_blocks)
|
||||
scheduled_cached_reqs = cached_reqs_data
|
||||
|
||||
scheduler_output = SchedulerOutput(
|
||||
|
||||
@@ -50,6 +50,7 @@ from vllm.sequence import IntermediateTensors
|
||||
from vllm_ascend.ops.fused_moe import AscendFusedMoE
|
||||
from vllm_ascend.ops.sequence_parallel import (MetadataForPadding,
|
||||
init_metadata_for_sp)
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
|
||||
@@ -253,7 +254,11 @@ class CustomQwen3MoeModel(Qwen3MoeModel):
|
||||
quant_config = vllm_config.quant_config
|
||||
|
||||
parallel_config = vllm_config.parallel_config
|
||||
self.num_redundant_experts = parallel_config.num_redundant_experts
|
||||
if vllm_version_is("0.10.1.1"):
|
||||
self.num_redundant_experts = parallel_config.num_redundant_experts
|
||||
else:
|
||||
eplb_config = parallel_config.eplb_config
|
||||
self.num_redundant_experts = eplb_config.num_redundant_experts
|
||||
self.padding_idx = config.pad_token_id
|
||||
self.vocab_size = config.vocab_size
|
||||
self.config = config
|
||||
|
||||
@@ -3,12 +3,19 @@ import torch_npu
|
||||
from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample
|
||||
from vllm.v1.sample.sampler import Sampler
|
||||
|
||||
from vllm_ascend.utils import is_310p
|
||||
from vllm_ascend.utils import is_310p, vllm_version_is
|
||||
|
||||
if not vllm_version_is("0.10.1.1"):
|
||||
from vllm.config import LogprobsMode
|
||||
DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS
|
||||
else:
|
||||
LogprobsMode = None
|
||||
DEFAULT_LOGPROBS_MODE = "raw_logprobs"
|
||||
|
||||
|
||||
class AscendSampler(Sampler):
|
||||
|
||||
def __init__(self, logprobs_mode="raw_logprobs"):
|
||||
def __init__(self, logprobs_mode=DEFAULT_LOGPROBS_MODE):
|
||||
# TODO: support logprobs_mode in vllm-ascend
|
||||
super().__init__(logprobs_mode=logprobs_mode)
|
||||
self.topk_topp_sampler = AscendTopKTopPSampler()
|
||||
@@ -61,5 +68,19 @@ class AscendTopKTopPSampler(TopKTopPSampler):
|
||||
def forward_native(self, logits, generators, k, p):
|
||||
"""Override pytorch native implementation to torch_npu"""
|
||||
logits = self._apply_top_k_top_p(logits, k, p)
|
||||
if not vllm_version_is("0.10.1.1"):
|
||||
|
||||
logits_to_return = None
|
||||
if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
|
||||
logits_to_return = logits
|
||||
elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS:
|
||||
logits_to_return = logits.log_softmax(dim=-1,
|
||||
dtype=torch.float32)
|
||||
|
||||
probs = logits.softmax(dim=-1, dtype=torch.float32)
|
||||
return random_sample(probs, generators)
|
||||
output = None
|
||||
if vllm_version_is("0.10.1.1"):
|
||||
output = random_sample(probs, generators)
|
||||
else:
|
||||
output = (random_sample(probs, generators), logits_to_return)
|
||||
return output
|
||||
|
||||
@@ -64,8 +64,8 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
|
||||
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheSpec)
|
||||
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds,
|
||||
LogprobsTensors, ModelRunnerOutput)
|
||||
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
|
||||
ModelRunnerOutput)
|
||||
from vllm.v1.pool.metadata import PoolingMetadata
|
||||
from vllm.v1.sample.logits_processor import build_logitsprocs
|
||||
from vllm.v1.sample.metadata import SamplingMetadata
|
||||
@@ -95,11 +95,17 @@ from vllm_ascend.torchair.torchair_attention import AscendTorchairMetadata
|
||||
from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata
|
||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
|
||||
ProfileExecuteDuration, is_310p,
|
||||
maybe_converting_weight_acl_format)
|
||||
maybe_converting_weight_acl_format,
|
||||
vllm_version_is)
|
||||
from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
|
||||
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
|
||||
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
|
||||
|
||||
if not vllm_version_is("0.10.1.1"):
|
||||
from vllm.v1.outputs import DraftTokenIds
|
||||
else:
|
||||
DraftTokenIds = None
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import xgrammar as xgr # type: ignore[import-untyped]
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
@@ -514,11 +520,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
# Update the block IDs.
|
||||
if not resumed_from_preemption:
|
||||
# Append the new blocks to the existing block IDs.
|
||||
for block_ids, new_ids in zip(req_state.block_ids,
|
||||
new_block_ids):
|
||||
block_ids.extend(new_ids)
|
||||
if new_block_ids is not None:
|
||||
# Append the new blocks to the existing block IDs.
|
||||
for block_ids, new_ids in zip(req_state.block_ids,
|
||||
new_block_ids):
|
||||
block_ids.extend(new_ids)
|
||||
else:
|
||||
assert new_block_ids is not None
|
||||
# The request is resumed from preemption.
|
||||
# Replace the existing block IDs with the new ones.
|
||||
req_state.block_ids = new_block_ids
|
||||
@@ -534,7 +542,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
# Update the persistent batch.
|
||||
self.input_batch.num_computed_tokens_cpu[req_index] = (
|
||||
num_computed_tokens)
|
||||
self.input_batch.block_table.append_row(new_block_ids, req_index)
|
||||
if new_block_ids is not None:
|
||||
self.input_batch.block_table.append_row(
|
||||
new_block_ids, req_index)
|
||||
|
||||
# For the last rank, we don't need to update the token_ids_cpu
|
||||
# because the sampled tokens are already cached.
|
||||
@@ -1526,16 +1536,28 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
else:
|
||||
pooler_output.append(None)
|
||||
extra_args = ({"kv_connector_output": kv_connector_output})
|
||||
|
||||
return ModelRunnerOutput(
|
||||
req_ids=self.input_batch.req_ids,
|
||||
req_id_to_index=self.input_batch.req_id_to_index,
|
||||
sampled_token_ids=[],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=pooler_output,
|
||||
**extra_args,
|
||||
)
|
||||
if vllm_version_is("0.10.1.1"):
|
||||
modelrunner_output = ModelRunnerOutput(
|
||||
req_ids=self.input_batch.req_ids,
|
||||
req_id_to_index=self.input_batch.req_id_to_index,
|
||||
sampled_token_ids=[],
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=pooler_output,
|
||||
**extra_args,
|
||||
)
|
||||
else:
|
||||
modelrunner_output = ModelRunnerOutput(
|
||||
req_ids=self.input_batch.req_ids,
|
||||
req_id_to_index=self.input_batch.req_id_to_index,
|
||||
sampled_token_ids=[],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=pooler_output,
|
||||
**extra_args,
|
||||
)
|
||||
return modelrunner_output
|
||||
|
||||
@torch.inference_mode()
|
||||
def execute_model(
|
||||
@@ -1757,15 +1779,27 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
extra_args = ({"kv_connector_output": kv_connector_output})
|
||||
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=self.input_batch.req_ids,
|
||||
req_id_to_index=self.input_batch.req_id_to_index,
|
||||
sampled_token_ids=valid_sampled_token_ids,
|
||||
logprobs=logprobs_lists,
|
||||
prompt_logprobs_dict=prompt_logprobs_dict,
|
||||
pooler_output=[],
|
||||
**extra_args,
|
||||
)
|
||||
if vllm_version_is("0.10.1.1"):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=self.input_batch.req_ids,
|
||||
req_id_to_index=self.input_batch.req_id_to_index,
|
||||
sampled_token_ids=valid_sampled_token_ids,
|
||||
logprobs=logprobs_lists,
|
||||
spec_token_ids=self._draft_token_ids,
|
||||
prompt_logprobs_dict=prompt_logprobs_dict,
|
||||
pooler_output=[],
|
||||
**extra_args,
|
||||
)
|
||||
else:
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=self.input_batch.req_ids,
|
||||
req_id_to_index=self.input_batch.req_id_to_index,
|
||||
sampled_token_ids=valid_sampled_token_ids,
|
||||
logprobs=logprobs_lists,
|
||||
prompt_logprobs_dict=prompt_logprobs_dict,
|
||||
pooler_output=[],
|
||||
**extra_args,
|
||||
)
|
||||
|
||||
durations = ProfileExecuteDuration().pop_captured_sync()
|
||||
if durations:
|
||||
|
||||
Reference in New Issue
Block a user