[1/N][Refactor] Refactor code to adapt with vllm main (#3612)
### What this PR does / why we need it? This is step 1 of refactoring code to adapt with vllm main, and this PR is aligned with 17c540a993. 1. Refactor deepseek to the latest code arch as of 17c540a993. 2. Bunches of fixes due to vllm changes - Fix `AscendScheduler` `__post_init__`, caused by https://github.com/vllm-project/vllm/pull/25075 - Fix `AscendScheduler` init got an unexpected arg `block_size`, caused by https://github.com/vllm-project/vllm/pull/26296 - Fix `KVCacheManager` `get_num_common_prefix_blocks` arg, caused by https://github.com/vllm-project/vllm/pull/23485 - Fix `MLAAttention` import, caused by https://github.com/vllm-project/vllm/pull/25103 - Fix `SharedFusedMoE` import, caused by https://github.com/vllm-project/vllm/pull/26145 - Fix `LazyLoader` import, caused by https://github.com/vllm-project/vllm/pull/27022 - Fix `vllm.utils.swap_dict_values` import, caused by https://github.com/vllm-project/vllm/pull/26990 - Fix `Backend` enum import, caused by https://github.com/vllm-project/vllm/pull/25893 - Fix `CompilationLevel` renaming to `CompilationMode` issue introduced by https://github.com/vllm-project/vllm/pull/26355 - Fix fused_moe ops, caused by https://github.com/vllm-project/vllm/pull/24097 - Fix bert model because of `inputs_embeds`, caused by https://github.com/vllm-project/vllm/pull/25922 - Fix MRope because of `get_input_positions_tensor` to `get_mrope_input_positions`, caused by https://github.com/vllm-project/vllm/pull/24172 - Fix `splitting_ops` changes introduced by https://github.com/vllm-project/vllm/pull/25845 - Fix multi-modality changes introduced by https://github.com/vllm-project/vllm/issues/16229 - Fix lora bias dropping issue introduced by https://github.com/vllm-project/vllm/pull/25807 - Fix structured output break introduced by https://github.com/vllm-project/vllm/issues/26737 ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? CI passed with existing test. 
- vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: Icey <1790571317@qq.com> Co-authored-by: Icey <1790571317@qq.com>
This commit is contained in:
@@ -35,7 +35,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import \
|
||||
KVConnectorMetadata
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import \
|
||||
KVConnectorStats
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logger import logger
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||
from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
|
||||
compute_encoder_budget)
|
||||
@@ -55,7 +55,7 @@ from vllm.v1.spec_decode.metrics import SpecDecodingStats
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
from vllm.v1.utils import ConstantList
|
||||
|
||||
logger = init_logger(__name__)
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class RecomputeScheduler(SchedulerInterface):
|
||||
@@ -67,6 +67,7 @@ class RecomputeScheduler(SchedulerInterface):
|
||||
vllm_config: VllmConfig,
|
||||
kv_cache_config: KVCacheConfig,
|
||||
structured_output_manager: StructuredOutputManager,
|
||||
block_size: Optional[int] = None,
|
||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||
include_finished_set: bool = False,
|
||||
log_stats: bool = False,
|
||||
@@ -586,9 +587,14 @@ class RecomputeScheduler(SchedulerInterface):
|
||||
self.kv_cache_config.kv_cache_groups)
|
||||
if self.running:
|
||||
any_request = self.running[0]
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request, len(self.running)))
|
||||
if vllm_version_is("0.11.0"):
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request, len(self.running)))
|
||||
else:
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request.request_id))
|
||||
|
||||
# Construct the scheduler output.
|
||||
new_reqs_data = [
|
||||
|
||||
@@ -59,7 +59,7 @@ class AscendSchedulerConfig(SchedulerConfig):
|
||||
scheduler_config[k] = getattr(ascend_scheduler_config, k)
|
||||
return cls(**scheduler_config)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
def __post_init__(self, *args) -> None:
|
||||
self.max_num_encoder_input_tokens = self.max_num_batched_tokens
|
||||
self.encoder_cache_size = self.max_num_batched_tokens
|
||||
self.chunked_prefill_enabled = self.enable_chunked_prefill
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
#
|
||||
import time
|
||||
from collections import deque
|
||||
from typing import Iterable, Union
|
||||
from typing import Iterable, Optional, Union
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed.kv_events import KVEventBatch
|
||||
@@ -32,27 +32,19 @@ from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class AscendScheduler(Scheduler):
|
||||
"""This Scheduler extends vllm's original v1 scheduler
|
||||
with prefill-first scheduling strategy."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vllm_config: VllmConfig,
|
||||
kv_cache_config: KVCacheConfig,
|
||||
structured_output_manager: StructuredOutputManager,
|
||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||
include_finished_set: bool = False,
|
||||
log_stats: bool = False,
|
||||
) -> None:
|
||||
super().__init__(vllm_config, kv_cache_config,
|
||||
structured_output_manager, mm_registry,
|
||||
include_finished_set, log_stats)
|
||||
def _initialize_common(self) -> None:
|
||||
"""Initialize common attributes shared across all versions."""
|
||||
self.scheduled_req_ids: set[str] = set()
|
||||
self.running: list[Request] = []
|
||||
|
||||
self.finished_prefill_reqs: deque[Request] = deque()
|
||||
|
||||
enable_pd_transfer = getattr(self.scheduler_config,
|
||||
'enable_pd_transfer', False)
|
||||
decode_max_num_seqs = getattr(self.scheduler_config,
|
||||
@@ -61,6 +53,29 @@ class AscendScheduler(Scheduler):
|
||||
self.decode_max_num_running_reqs = max(self.max_num_running_reqs,
|
||||
decode_max_num_seqs)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vllm_config: VllmConfig,
|
||||
kv_cache_config: KVCacheConfig,
|
||||
structured_output_manager: StructuredOutputManager,
|
||||
block_size: Optional[int] = None,
|
||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||
include_finished_set: bool = False,
|
||||
log_stats: bool = False,
|
||||
) -> None:
|
||||
# Call the parent class's __init__ method
|
||||
if vllm_version_is("0.11.0"):
|
||||
super().__init__(vllm_config, kv_cache_config,
|
||||
structured_output_manager, mm_registry,
|
||||
include_finished_set, log_stats)
|
||||
else:
|
||||
super().__init__(vllm_config, kv_cache_config,
|
||||
structured_output_manager, block_size,
|
||||
mm_registry, include_finished_set, log_stats)
|
||||
|
||||
# Initialize common attributes
|
||||
self._initialize_common()
|
||||
|
||||
def schedule(self) -> SchedulerOutput:
|
||||
if self.scheduler_config.chunked_prefill_enabled:
|
||||
return super().schedule()
|
||||
@@ -440,9 +455,14 @@ class AscendScheduler(Scheduler):
|
||||
self.kv_cache_config.kv_cache_groups)
|
||||
if self.running:
|
||||
any_request = self.running[0]
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request, len(self.running)))
|
||||
if vllm_version_is("0.11.0"):
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request, len(self.running)))
|
||||
else:
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request.request_id))
|
||||
|
||||
# Construct the scheduler output.
|
||||
new_reqs_data = [
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#
|
||||
import os
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
from vllm.config import VllmConfig
|
||||
@@ -32,6 +33,8 @@ from vllm.v1.kv_cache_interface import KVCacheConfig
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class BudgetRefiner:
|
||||
"""This budget refiner can make dynamic adjustment to the token budget
|
||||
@@ -122,13 +125,19 @@ class SchedulerDynamicBatch(Scheduler):
|
||||
vllm_config: VllmConfig,
|
||||
kv_cache_config: KVCacheConfig,
|
||||
structured_output_manager: StructuredOutputManager,
|
||||
block_size: Optional[int] = None,
|
||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||
include_finished_set: bool = False,
|
||||
log_stats: bool = False,
|
||||
) -> None:
|
||||
super().__init__(vllm_config, kv_cache_config,
|
||||
structured_output_manager, mm_registry,
|
||||
include_finished_set, log_stats)
|
||||
if vllm_version_is("0.11.0"):
|
||||
super().__init__(vllm_config, kv_cache_config,
|
||||
structured_output_manager, mm_registry,
|
||||
include_finished_set, log_stats)
|
||||
else:
|
||||
super().__init__(vllm_config, kv_cache_config,
|
||||
structured_output_manager, block_size,
|
||||
mm_registry, include_finished_set, log_stats)
|
||||
self.running: list[Request] = []
|
||||
self.budget_refiner = BudgetRefiner(
|
||||
default_budget=self.scheduler_config.max_num_batched_tokens,
|
||||
@@ -531,10 +540,14 @@ class SchedulerDynamicBatch(Scheduler):
|
||||
self.kv_cache_config.kv_cache_groups)
|
||||
if self.running:
|
||||
any_request = self.running[0]
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request, len(self.running)))
|
||||
|
||||
if vllm_version_is("0.11.0"):
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request, len(self.running)))
|
||||
else:
|
||||
num_common_prefix_blocks = (
|
||||
self.kv_cache_manager.get_num_common_prefix_blocks(
|
||||
any_request.request_id))
|
||||
# Construct the scheduler output.
|
||||
new_reqs_data = [
|
||||
NewRequestData.from_request(
|
||||
|
||||
Reference in New Issue
Block a user