[1/N][Refactor] Refactor code to adapt with vllm main (#3612)

### What this PR does / why we need it?
This is the step 1 of refactoring code to adapt with vllm main, and this
pr aligned with
17c540a993

1. refactor deepseek to the latest code arch as of
17c540a993
 
2. bunches of fixes due to vllm changes
- Fix `AscendScheduler` `__post_init__`, caused by
https://github.com/vllm-project/vllm/pull/25075
- Fix `AscendScheduler` init got an unexpected arg `block_size`, caused
by https://github.com/vllm-project/vllm/pull/26296
- Fix `KVCacheManager` `get_num_common_prefix_blocks` arg, caused by
https://github.com/vllm-project/vllm/pull/23485
- Fix `MLAAttention` import,caused by
https://github.com/vllm-project/vllm/pull/25103
- Fix `SharedFusedMoE` import, caused by
https://github.com/vllm-project/vllm/pull/26145
- Fix `LazyLoader` improt, caused by
https://github.com/vllm-project/vllm/pull/27022
- Fix `vllm.utils.swap_dict_values` improt, caused by
https://github.com/vllm-project/vllm/pull/26990
- Fix `Backend` enum import, caused by
https://github.com/vllm-project/vllm/pull/25893
- Fix `CompilationLevel` renaming to `CompilationMode` issue introduced
by https://github.com/vllm-project/vllm/pull/26355
- Fix fused_moe ops, caused by
https://github.com/vllm-project/vllm/pull/24097
- Fix bert model because of `inputs_embeds`, caused by
https://github.com/vllm-project/vllm/pull/25922
- Fix MRope because of `get_input_positions_tensor` to
`get_mrope_input_positions`, caused by
https://github.com/vllm-project/vllm/pull/24172
- Fix `splitting_ops` changes introduced by
https://github.com/vllm-project/vllm/pull/25845
- Fix multi-modality changes introduced by
https://github.com/vllm-project/vllm/issues/16229
- Fix lora bias dropping issue introduced by
https://github.com/vllm-project/vllm/pull/25807
- Fix structured ouput break introduced by
https://github.com/vllm-project/vllm/issues/26737

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
CI passed with existing test.


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Icey <1790571317@qq.com>
Co-authored-by: Icey <1790571317@qq.com>
This commit is contained in:
Mengqing Cao
2025-10-24 16:55:08 +08:00
committed by GitHub
parent ec9ec78b53
commit cea0755b07
47 changed files with 1189 additions and 493 deletions

View File

@@ -35,7 +35,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import \
KVConnectorMetadata
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import \
KVConnectorStats
from vllm.logger import init_logger
from vllm.logger import logger
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
compute_encoder_budget)
@@ -55,7 +55,7 @@ from vllm.v1.spec_decode.metrics import SpecDecodingStats
from vllm.v1.structured_output import StructuredOutputManager
from vllm.v1.utils import ConstantList
logger = init_logger(__name__)
from vllm_ascend.utils import vllm_version_is
class RecomputeScheduler(SchedulerInterface):
@@ -67,6 +67,7 @@ class RecomputeScheduler(SchedulerInterface):
vllm_config: VllmConfig,
kv_cache_config: KVCacheConfig,
structured_output_manager: StructuredOutputManager,
block_size: Optional[int] = None,
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
include_finished_set: bool = False,
log_stats: bool = False,
@@ -586,9 +587,14 @@ class RecomputeScheduler(SchedulerInterface):
self.kv_cache_config.kv_cache_groups)
if self.running:
any_request = self.running[0]
num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks(
any_request, len(self.running)))
if vllm_version_is("0.11.0"):
num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks(
any_request, len(self.running)))
else:
num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks(
any_request.request_id))
# Construct the scheduler output.
new_reqs_data = [

View File

@@ -59,7 +59,7 @@ class AscendSchedulerConfig(SchedulerConfig):
scheduler_config[k] = getattr(ascend_scheduler_config, k)
return cls(**scheduler_config)
def __post_init__(self) -> None:
def __post_init__(self, *args) -> None:
self.max_num_encoder_input_tokens = self.max_num_batched_tokens
self.encoder_cache_size = self.max_num_batched_tokens
self.chunked_prefill_enabled = self.enable_chunked_prefill

View File

@@ -16,7 +16,7 @@
#
import time
from collections import deque
from typing import Iterable, Union
from typing import Iterable, Optional, Union
from vllm.config import VllmConfig
from vllm.distributed.kv_events import KVEventBatch
@@ -32,27 +32,19 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.utils import vllm_version_is
class AscendScheduler(Scheduler):
"""This Scheduler extends vllm's original v1 scheduler
with prefill-first scheduling strategy."""
def __init__(
self,
vllm_config: VllmConfig,
kv_cache_config: KVCacheConfig,
structured_output_manager: StructuredOutputManager,
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
include_finished_set: bool = False,
log_stats: bool = False,
) -> None:
super().__init__(vllm_config, kv_cache_config,
structured_output_manager, mm_registry,
include_finished_set, log_stats)
def _initialize_common(self) -> None:
"""Initialize common attributes shared across all versions."""
self.scheduled_req_ids: set[str] = set()
self.running: list[Request] = []
self.finished_prefill_reqs: deque[Request] = deque()
enable_pd_transfer = getattr(self.scheduler_config,
'enable_pd_transfer', False)
decode_max_num_seqs = getattr(self.scheduler_config,
@@ -61,6 +53,29 @@ class AscendScheduler(Scheduler):
self.decode_max_num_running_reqs = max(self.max_num_running_reqs,
decode_max_num_seqs)
def __init__(
self,
vllm_config: VllmConfig,
kv_cache_config: KVCacheConfig,
structured_output_manager: StructuredOutputManager,
block_size: Optional[int] = None,
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
include_finished_set: bool = False,
log_stats: bool = False,
) -> None:
# Call the parent class's __init__ method
if vllm_version_is("0.11.0"):
super().__init__(vllm_config, kv_cache_config,
structured_output_manager, mm_registry,
include_finished_set, log_stats)
else:
super().__init__(vllm_config, kv_cache_config,
structured_output_manager, block_size,
mm_registry, include_finished_set, log_stats)
# Initialize common attributes
self._initialize_common()
def schedule(self) -> SchedulerOutput:
if self.scheduler_config.chunked_prefill_enabled:
return super().schedule()
@@ -440,9 +455,14 @@ class AscendScheduler(Scheduler):
self.kv_cache_config.kv_cache_groups)
if self.running:
any_request = self.running[0]
num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks(
any_request, len(self.running)))
if vllm_version_is("0.11.0"):
num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks(
any_request, len(self.running)))
else:
num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks(
any_request.request_id))
# Construct the scheduler output.
new_reqs_data = [

View File

@@ -16,6 +16,7 @@
#
import os
import time
from typing import Optional
import pandas as pd
from vllm.config import VllmConfig
@@ -32,6 +33,8 @@ from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.utils import vllm_version_is
class BudgetRefiner:
"""This budget refiner can make dynamic adjustment to the token budget
@@ -122,13 +125,19 @@ class SchedulerDynamicBatch(Scheduler):
vllm_config: VllmConfig,
kv_cache_config: KVCacheConfig,
structured_output_manager: StructuredOutputManager,
block_size: Optional[int] = None,
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
include_finished_set: bool = False,
log_stats: bool = False,
) -> None:
super().__init__(vllm_config, kv_cache_config,
structured_output_manager, mm_registry,
include_finished_set, log_stats)
if vllm_version_is("0.11.0"):
super().__init__(vllm_config, kv_cache_config,
structured_output_manager, mm_registry,
include_finished_set, log_stats)
else:
super().__init__(vllm_config, kv_cache_config,
structured_output_manager, block_size,
mm_registry, include_finished_set, log_stats)
self.running: list[Request] = []
self.budget_refiner = BudgetRefiner(
default_budget=self.scheduler_config.max_num_batched_tokens,
@@ -531,10 +540,14 @@ class SchedulerDynamicBatch(Scheduler):
self.kv_cache_config.kv_cache_groups)
if self.running:
any_request = self.running[0]
num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks(
any_request, len(self.running)))
if vllm_version_is("0.11.0"):
num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks(
any_request, len(self.running)))
else:
num_common_prefix_blocks = (
self.kv_cache_manager.get_num_common_prefix_blocks(
any_request.request_id))
# Construct the scheduler output.
new_reqs_data = [
NewRequestData.from_request(