[CI] Fix FusedMoEConfig and input batch failure to recover CI (#1602)

Make CI happy

1. c1909e7e8c changed how FusedMoEConfig is initialized.
2. 48fb076cbc changed the input batch logic.

This PR adapts vllm-ascend to these changes.

Closes: https://github.com/vllm-project/vllm-ascend/issues/1600

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan
Date: 2025-07-03 18:36:17 +08:00
Committed by: GitHub
Parent: d96da1f00c
Commit: a45dfde283
11 changed files with 173 additions and 134 deletions


@@ -61,7 +61,6 @@ from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.sampler import Sampler
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
-from vllm.v1.spec_decode.utils import is_spec_decode_supported
 from vllm.v1.utils import bind_kv_cache
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 from vllm.v1.worker.utils import (gather_mm_placeholders,
@@ -93,6 +92,9 @@ import vllm.envs as envs_vllm
 import vllm_ascend.envs as envs_ascend
+if vllm_version_is("0.9.1"):
+    from vllm.v1.spec_decode.utils import is_spec_decode_supported
 
 @dataclass
 class GraphCaptureContext:
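
The gate above uses vllm-ascend's vllm_version_is helper. A minimal sketch of such a check, assuming plain exact-match semantics against the installed package (the real helper in vllm_ascend.utils may differ):

# Sketch of a version gate in the spirit of vllm_ascend.utils.vllm_version_is;
# exact-match semantics are an assumption, the real implementation may differ.
from importlib.metadata import PackageNotFoundError, version


def vllm_version_is(target: str) -> bool:
    """Return True iff the installed vllm package reports exactly `target`."""
    try:
        return version("vllm") == target
    except PackageNotFoundError:
        # vllm is not installed; treat as "not that version".
        return False

Gating the import this way keeps module load from raising ImportError on newer vllm, where is_spec_decode_supported no longer exists.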
@@ -2093,6 +2095,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             pin_memory=True,
             vocab_size=self.model_config.get_vocab_size(),
             block_sizes=[self.block_size],
+            is_spec_decode=bool(self.vllm_config.speculative_config),
         )
         kv_cache_sizes = {}
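
The new is_spec_decode keyword only exists on the newer InputBatch. Rather than pinning on a version string, the kwarg could also be feature-detected; a sketch with a hypothetical supports_kwarg helper (not part of the PR):

# Sketch: detect whether InputBatch accepts the new is_spec_decode kwarg
# instead of hard-coding version strings. supports_kwarg is hypothetical.
import inspect

from vllm.v1.worker.gpu_input_batch import InputBatch  # path as of vllm 0.9.x


def supports_kwarg(cls, name: str) -> bool:
    """Return True if cls.__init__ declares a parameter called `name`."""
    return name in inspect.signature(cls.__init__).parameters


# At the call site (other constructor kwargs elided):
# if supports_kwarg(InputBatch, "is_spec_decode"):
#     kwargs["is_spec_decode"] = bool(self.vllm_config.speculative_config)

Note that bool(self.vllm_config.speculative_config) is True only when a speculative config is actually set; bool(None) is False.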
@@ -2272,9 +2275,14 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             # Skip requests that require top-p, top-k, etc.
             req_id = self.input_batch.req_ids[i]
-            if not is_spec_decode_supported(req_id, self.input_batch):
-                draft_token_ids.append([])
-                continue
+            if vllm_version_is("0.9.1"):
+                if not is_spec_decode_supported(req_id, self.input_batch):
+                    draft_token_ids.append([])
+                    continue
+            else:
+                if req_id in self.input_batch.spec_decode_unsupported_reqs:
+                    draft_token_ids.append([])
+                    continue
             # Add sampled_token_ids to token_ids_cpu.
             start_idx = self.input_batch.num_tokens_no_spec[i]
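
The forked check could also live in one hypothetical helper so the loop body stays flat (a sketch, not the PR's code):

# Hypothetical helper (not in the PR): one place for the per-request
# spec-decode eligibility check across both supported vllm versions.
def spec_decode_allowed(req_id: str, input_batch) -> bool:
    if vllm_version_is("0.9.1"):
        # Old API: a utility function inspects the request directly.
        from vllm.v1.spec_decode.utils import is_spec_decode_supported
        return is_spec_decode_supported(req_id, input_batch)
    # New API: InputBatch itself tracks requests that cannot use spec
    # decode (e.g. ones sampling with top-p/top-k) in a set.
    return req_id not in input_batch.spec_decode_unsupported_reqs

With that, the loop body reduces to a single "if not spec_decode_allowed(req_id, self.input_batch): draft_token_ids.append([]); continue".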