[Model] Support DeepSeek-V4
vllm_mlu/v1/engine/__init__.py (new file)
@@ -0,0 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

vllm_mlu/v1/engine/async_llm.py (new file)
@@ -0,0 +1,23 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from vllm.v1.engine.async_llm import AsyncLLM

from vllm_mlu.mlu_hijack_utils import MluHijackObject


class AsyncLLM_MluHijack(AsyncLLM):

    async def start_scheduler_profile(self) -> None:
        await self.engine_core.start_scheduler_profile()

    async def stop_scheduler_profile(self) -> None:
        await self.engine_core.stop_scheduler_profile()


MluHijackObject.apply_hijack(AsyncLLM,
                             "start_scheduler_profile",
                             AsyncLLM_MluHijack.start_scheduler_profile)
MluHijackObject.apply_hijack(AsyncLLM,
                             "stop_scheduler_profile",
                             AsyncLLM_MluHijack.stop_scheduler_profile)
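
All four new files follow the same pattern: a *_MluHijack subclass defines methods that either add MLU-specific entry points or replace upstream vLLM behavior, and each override is then registered with MluHijackObject.apply_hijack. The helper itself lives in vllm_mlu/mlu_hijack_utils.py and is not part of this diff; the following is only a hedged sketch of what such a helper could look like, assuming it is a setattr-based monkey patch that records the original attribute so it can be restored later.

# Hypothetical sketch of the hijack helper; the real implementation is not shown in this diff.
class MluHijackObject:
    _originals: list = []  # (target_cls, attr_name, original_attr_or_None)

    @classmethod
    def apply_hijack(cls, target_cls, attr, replacement):
        # The call sites pass either an attribute name ("start_scheduler_profile")
        # or the original callable itself (e.g. EngineCore.step).
        attr_name = attr if isinstance(attr, str) else attr.__name__
        cls._originals.append((target_cls, attr_name, getattr(target_cls, attr_name, None)))
        setattr(target_cls, attr_name, replacement)
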
vllm_mlu/v1/engine/core.py (new file)
@@ -0,0 +1,566 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from collections import deque
import signal
from typing import Any, Callable, cast
from concurrent.futures import Future

from vllm.config import ParallelConfig, VllmConfig
from vllm.logger import logger
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import engine_receiver_cache_from_config
from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
from vllm.utils.gc_utils import freeze_gc_heap
from vllm.utils.hashing import get_hash_fn_by_name
from vllm.utils.system_utils import decorate_logs, set_process_title
from vllm.v1.core.kv_cache_utils import BlockHash, get_request_block_hasher, init_none_hash
from vllm.v1.engine import EngineCoreOutputs
from vllm.v1.engine.core import (
    EngineCore,
    EngineCoreProc,
    DPEngineCoreProc,
)
from vllm.v1.executor.abstract import Executor
from vllm.v1.core.sched.interface import SchedulerInterface
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request
from vllm.v1.structured_output import StructuredOutputManager
from vllm.version import __version__ as VLLM_VERSION
from logging import DEBUG

import vllm_mlu._mlu_utils as mlu_envs
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm_mlu.mlu_metric import LLMMetric


class EngineCore_MluHijack(EngineCore):

    def __init__(
        self,
        vllm_config: VllmConfig,
        executor_class: type[Executor],
        log_stats: bool,
        executor_fail_callback: Callable | None = None,
    ):
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: load_general_plugins in run_engine_core
        '''
        # # plugins need to be loaded at the engine/scheduler level too
        # from vllm.plugins import load_general_plugins
        # load_general_plugins()
        '''
        ==================
        End of MLU Hijack
        ==================
        '''

        self.vllm_config = vllm_config
        if vllm_config.parallel_config.data_parallel_rank == 0:
            logger.info(
                "Initializing a V1 LLM engine (v%s) with config: %s",
                VLLM_VERSION,
                vllm_config,
            )

        self.log_stats = log_stats

        # Setup Model.
        self.model_executor = executor_class(vllm_config)
        if executor_fail_callback is not None:
            self.model_executor.register_failure_callback(executor_fail_callback)

        self.available_gpu_memory_for_kv_cache = -1

        # Setup KV Caches and update CacheConfig after profiling.
        num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
            vllm_config
        )

        vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
        vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
        self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks))

        self.structured_output_manager = StructuredOutputManager(vllm_config)

        # Setup scheduler.
        Scheduler = vllm_config.scheduler_config.get_scheduler_cls()

        if len(kv_cache_config.kv_cache_groups) == 0:
            # Encoder models without KV cache don't support
            # chunked prefill. But do SSM models?
            logger.info("Disabling chunked prefill for model without KVCache")
            vllm_config.scheduler_config.enable_chunked_prefill = False

        scheduler_block_size = (
            vllm_config.cache_config.block_size
            * vllm_config.parallel_config.decode_context_parallel_size
        )

        self.scheduler: SchedulerInterface = Scheduler(
            vllm_config=vllm_config,
            kv_cache_config=kv_cache_config,
            structured_output_manager=self.structured_output_manager,
            include_finished_set=vllm_config.parallel_config.data_parallel_size > 1,
            log_stats=self.log_stats,
            block_size=scheduler_block_size,
        )
        self.use_spec_decode = vllm_config.speculative_config is not None
        if self.scheduler.connector is not None:  # type: ignore
            self.model_executor.init_kv_output_aggregator(self.scheduler.connector)  # type: ignore

        self.mm_registry = mm_registry = MULTIMODAL_REGISTRY
        self.mm_receiver_cache = engine_receiver_cache_from_config(
            vllm_config, mm_registry
        )

        # If a KV connector is initialized for scheduler, we want to collect
        # handshake metadata from all workers so the connector in the scheduler
        # will have the full context
        kv_connector = self.scheduler.get_kv_connector()
        if kv_connector is not None:
            # Collect and store KV connector xfer metadata from workers
            # (after KV cache registration)
            xfer_handshake_metadata = (
                self.model_executor.get_kv_connector_handshake_metadata()
            )

            if xfer_handshake_metadata:
                # xfer_handshake_metadata is list of dicts from workers
                # Each dict already has structure {tp_rank: metadata}
                # Merge all worker dicts into a single dict
                content: dict[int, Any] = {}
                for worker_dict in xfer_handshake_metadata:
                    if worker_dict is not None:
                        content.update(worker_dict)
                kv_connector.set_xfer_handshake_metadata(content)

        # Setup batch queue for pipeline parallelism.
        # Batch queue for scheduled batches. This enables us to asynchronously
        # schedule and execute batches, and is required by pipeline parallelism
        # to eliminate pipeline bubbles.
        self.batch_queue_size = self.model_executor.max_concurrent_batches
        self.batch_queue: (
            deque[tuple[Future[ModelRunnerOutput], SchedulerOutput]] | None
        ) = None
        if self.batch_queue_size > 1:
            logger.info("Batch queue is enabled with size %d", self.batch_queue_size)
            self.batch_queue = deque(maxlen=self.batch_queue_size)

        self.ec_producer = (
            vllm_config.ec_transfer_config is not None
            and vllm_config.ec_transfer_config.is_ec_producer
        )
        self.is_pooling_model = vllm_config.model_config.runner_type == "pooling"

        self.request_block_hasher: Callable[[Request], list[BlockHash]] | None = None
        if vllm_config.cache_config.enable_prefix_caching or kv_connector is not None:
            caching_hash_fn = get_hash_fn_by_name(
                vllm_config.cache_config.prefix_caching_hash_algo
            )
            init_none_hash(caching_hash_fn)

            self.request_block_hasher = get_request_block_hasher(
                scheduler_block_size, caching_hash_fn
            )

        self.step_fn = (
            self.step if self.batch_queue is None else self.step_with_batch_queue
        )
        self.async_scheduling = vllm_config.scheduler_config.async_scheduling

        # Mark the startup heap as static so that it's ignored by GC.
        # Reduces pause times of oldest generation collections.
        freeze_gc_heap()

        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: v1 support offline benchmark
        '''
        self.step_latency = []
        self.model_exec_latency = []
        self.mm_encoder_latency = []
        self.num_gpu_blocks = num_gpu_blocks
        self.num_cpu_blocks = num_cpu_blocks
        '''
        ==================
        End of MLU Hijack
        ==================
        '''

    def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
        """Schedule, execute, and make output.

        Returns tuple of outputs and a flag indicating whether the model
        was executed.
        """

        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: v1 support offline benchmark
        '''
        if mlu_envs.VLLM_LATENCY_DEBUG_EN:
            step_start = LLMMetric.get_mlu_cost_time()
        '''
        ==================
        End of MLU Hijack
        ==================
        '''

        # Check for any requests remaining in the scheduler - unfinished,
        # or finished and not yet removed from the batch.
        if not self.scheduler.has_requests():
            return {}, False
        scheduler_output = self.scheduler.schedule()
        future = self.model_executor.execute_model(scheduler_output, non_block=True)
        grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)
        with self.log_error_detail(scheduler_output):
            model_output = future.result()
            if model_output is None:
                model_output = self.model_executor.sample_tokens(grammar_output)

        if self.use_spec_decode and \
                self.vllm_config.kv_transfer_config is not None and \
                self.vllm_config.kv_transfer_config.kv_role == "kv_producer":
            draft_token_ids = self.model_executor.take_draft_token_ids()
            self.scheduler.draft_token_ids = draft_token_ids

        engine_core_outputs = self.scheduler.update_from_output(
            scheduler_output, model_output
        )
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: v1 support offline benchmark
        '''
        has_sched_reqs = (scheduler_output.total_num_scheduled_tokens > 0)
        if mlu_envs.VLLM_LATENCY_DEBUG_EN and has_sched_reqs:
            self.step_latency.append(LLMMetric.get_mlu_cost_time() - step_start)
        if mlu_envs.VLLM_LATENCY_DEBUG_WITH_DEVICE_EN and has_sched_reqs:
            self.model_exec_latency.append(self.get_model_exec_latency())
            mm_encoder_latency = self.get_mm_encoder_latency()
            if mm_encoder_latency:
                self.mm_encoder_latency.append(mm_encoder_latency)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        return engine_core_outputs, scheduler_output.total_num_scheduled_tokens > 0

    def step_with_batch_queue(
        self,
    ) -> tuple[dict[int, EngineCoreOutputs] | None, bool]:
        """Schedule and execute batches with the batch queue.
        Note that if nothing to output in this step, None is returned.

        The execution flow is as follows:
        1. Try to schedule a new batch if the batch queue is not full.
        If a new batch is scheduled, directly return an empty engine core
        output. In other words, fulfilling the batch queue has a higher priority
        than getting model outputs.
        2. If there is no new scheduled batch, meaning that the batch queue
        is full or no other requests can be scheduled, we block until the first
        batch in the job queue is finished.
        3. Update the scheduler from the output.
        """
        batch_queue = self.batch_queue
        assert batch_queue is not None

        # Try to schedule a new batch if the batch queue is not full, but
        # the scheduler may return an empty batch if all requests are scheduled.
        # Note that this is not blocking.
        assert len(batch_queue) < self.batch_queue_size

        model_executed = False
        deferred_scheduler_output = None
        if self.scheduler.has_requests():
            scheduler_output = self.scheduler.schedule()
            exec_future = self.model_executor.execute_model(
                scheduler_output, non_block=True
            )
            if not self.ec_producer:
                model_executed = scheduler_output.total_num_scheduled_tokens > 0

            if self.is_pooling_model or not model_executed:
                # No sampling required (no requests scheduled).
                future = cast(Future[ModelRunnerOutput], exec_future)
            else:
                exec_future.add_done_callback(self._log_err_callback(scheduler_output))

                if not scheduler_output.pending_structured_output_tokens:
                    # We aren't waiting for any tokens, get any grammar output
                    # and sample immediately.
                    grammar_output = self.scheduler.get_grammar_bitmask(
                        scheduler_output
                    )
                    future = self.model_executor.sample_tokens(
                        grammar_output, non_block=True
                    )
                else:
                    # We need to defer sampling until we have processed the model output
                    # from the prior step.
                    deferred_scheduler_output = scheduler_output

            if not deferred_scheduler_output:
                # Add this step's future to the queue.
                batch_queue.appendleft((future, scheduler_output))
                if (
                    model_executed
                    and len(batch_queue) < self.batch_queue_size
                    and not batch_queue[-1][0].done()
                ):
                    # Don't block on next worker response unless the queue is full
                    # or there are no more requests to schedule.
                    return None, True

        elif not batch_queue:
            # Queue is empty. We should not reach here since this method should
            # only be called when the scheduler contains requests or the queue
            # is non-empty.
            return None, False

        # Block until the next result is available.
        future, scheduler_output = batch_queue.pop()
        with self.log_error_detail(scheduler_output):
            model_output = future.result()

        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: support disagg for MLU.
        '''
        if self.use_spec_decode and \
                self.vllm_config.kv_transfer_config is not None and \
                self.vllm_config.kv_transfer_config.kv_role == "kv_producer":
            draft_token_ids = self.model_executor.take_draft_token_ids()
            self.scheduler.draft_token_ids = draft_token_ids
        '''
        ==================
        End of MLU Hijack
        ==================
        '''

        engine_core_outputs = self.scheduler.update_from_output(
            scheduler_output, model_output
        )

        # NOTE(nick): We can either handle the deferred tasks here or save
        # in a field and do it immediately once step_with_batch_queue is
        # re-called. The latter slightly favors TTFT over TPOT/throughput.
        if deferred_scheduler_output:
            # We now have the tokens needed to compute the bitmask for the
            # deferred request. Get the bitmask and call sample tokens.
            grammar_output = self.scheduler.get_grammar_bitmask(
                deferred_scheduler_output
            )
            future = self.model_executor.sample_tokens(grammar_output, non_block=True)
            batch_queue.appendleft((future, deferred_scheduler_output))

        return engine_core_outputs, model_executed

    def get_model_exec_latency(self):
        latency = self.model_executor.get_latency()
        return latency

    def get_mm_encoder_latency(self):
        return self.model_executor.get_mm_encoder_latency()

    def get_hfu_info(self, batch, input_len, output_len):
        return self.model_executor.get_hfu_info(batch, input_len, output_len)

    def get_latency(self):
        return (self.step_latency, self.model_exec_latency, self.mm_encoder_latency)

    def get_memory_usage(self):
        peak_memory, block_memory = self.model_executor.get_memory_usage()
        return (peak_memory, block_memory,
                self.num_gpu_blocks, self.num_cpu_blocks)

    def recapture_model(self,
                        prefill_enable_mlugraph: bool,
                        batch_size: int,
                        input_len: int):
        self.model_executor.recapture_model(
            prefill_enable_mlugraph, batch_size, input_len)

    def init_metric(self, use_unchunk_sched: bool, min_prefill_batch: int):
        self.step_latency = []
        self.model_exec_latency = []
        self.mm_encoder_latency = []
        mlu_envs.VLLM_V1_USE_UNCHUNK_SCHED = use_unchunk_sched
        mlu_envs.VLLM_V1_MIN_PREFILL_BATCH = min_prefill_batch

    def start_scheduler_profile(self):
        self.scheduler.start_scheduler_profile()

    def stop_scheduler_profile(self):
        self.scheduler.stop_scheduler_profile()

    def response_remote_alloc_once(self):
        self.model_executor.response_remote_alloc_once()


class EngineCoreProc_MluHijack(EngineCoreProc):

    @staticmethod
    def run_engine_core(*args, dp_rank: int = 0, local_dp_rank: int = 0, **kwargs):
        """Launch EngineCore busy loop in background process."""

        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: load_general_plugins for mp backend engine
        '''
        # plugins need to be loaded at the engine/scheduler level too
        from vllm.plugins import load_general_plugins
        load_general_plugins()
        '''
        ==================
        End of MLU Hijack
        ==================
        '''

        # Signal handler used for graceful termination.
        # SystemExit exception is only raised once to allow this and worker
        # processes to terminate without error
        shutdown_requested = False

        # Ensure we can serialize transformer config after spawning
        maybe_register_config_serialize_by_value()

        def signal_handler(signum, frame):
            nonlocal shutdown_requested
            if not shutdown_requested:
                shutdown_requested = True
                raise SystemExit()

        # Either SIGTERM or SIGINT will terminate the engine_core
        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)

        engine_core: EngineCoreProc | None = None
        try:
            parallel_config: ParallelConfig = kwargs["vllm_config"].parallel_config
            if parallel_config.data_parallel_size > 1 or dp_rank > 0:
                set_process_title("EngineCore", f"DP{dp_rank}")
                decorate_logs()
                # Set data parallel rank for this engine process.
                parallel_config.data_parallel_rank = dp_rank
                parallel_config.data_parallel_rank_local = local_dp_rank
                engine_core = DPEngineCoreProc(*args, **kwargs)
            else:
                set_process_title("EngineCore")
                decorate_logs()
                engine_core = EngineCoreProc(*args, **kwargs)

            engine_core.run_busy_loop()

        except SystemExit:
            logger.debug("EngineCore exiting.")
            raise
        except Exception as e:
            if engine_core is None:
                logger.exception("EngineCore failed to start.")
            else:
                logger.exception("EngineCore encountered a fatal error.")
                engine_core._send_engine_dead()
            raise e
        finally:
            if engine_core is not None:
                engine_core.shutdown()

    def _process_input_queue(self):
        """Exits when an engine step needs to be performed."""

        waited = False
        while (
            not self.engines_running
            and not self.scheduler.has_requests()
            and not self.batch_queue
        ):
            if logger.isEnabledFor(DEBUG) and self.input_queue.empty():
                logger.debug("EngineCore waiting for work.")
                waited = True

            if self.vllm_config.kv_transfer_config is not None and \
                    self.vllm_config.kv_transfer_config.kv_role == "kv_consumer":
                self.response_remote_alloc_once()
                if self.input_queue.empty():
                    continue
                req = self.input_queue.get_nowait()
                self._handle_client_request(*req)
            else:
                req = self.input_queue.get()
                self._handle_client_request(*req)

        if waited:
            logger.debug("EngineCore loop active.")

        if self.vllm_config.kv_transfer_config is not None and \
                self.vllm_config.kv_transfer_config.kv_role == "kv_consumer":
            self.response_remote_alloc_once()

        # Handle any more client requests.
        while not self.input_queue.empty():
            req = self.input_queue.get_nowait()
            self._handle_client_request(*req)


MluHijackObject.apply_hijack(EngineCore,
                             "get_mm_encoder_latency",
                             EngineCore_MluHijack.get_mm_encoder_latency)
MluHijackObject.apply_hijack(EngineCore,
                             "get_model_exec_latency",
                             EngineCore_MluHijack.get_model_exec_latency)
MluHijackObject.apply_hijack(EngineCore,
                             "get_hfu_info",
                             EngineCore_MluHijack.get_hfu_info)
MluHijackObject.apply_hijack(EngineCore,
                             "get_latency",
                             EngineCore_MluHijack.get_latency)
MluHijackObject.apply_hijack(EngineCore,
                             "get_memory_usage",
                             EngineCore_MluHijack.get_memory_usage)
MluHijackObject.apply_hijack(EngineCore,
                             "recapture_model",
                             EngineCore_MluHijack.recapture_model)
MluHijackObject.apply_hijack(EngineCore,
                             "init_metric",
                             EngineCore_MluHijack.init_metric)
MluHijackObject.apply_hijack(EngineCore,
                             "start_scheduler_profile",
                             EngineCore_MluHijack.start_scheduler_profile)
MluHijackObject.apply_hijack(EngineCore,
                             "stop_scheduler_profile",
                             EngineCore_MluHijack.stop_scheduler_profile)
MluHijackObject.apply_hijack(EngineCore,
                             EngineCore.__init__,
                             EngineCore_MluHijack.__init__)
MluHijackObject.apply_hijack(EngineCore,
                             EngineCore.step,
                             EngineCore_MluHijack.step)
MluHijackObject.apply_hijack(EngineCore,
                             "response_remote_alloc_once",
                             EngineCore_MluHijack.response_remote_alloc_once)
MluHijackObject.apply_hijack(EngineCore,
                             EngineCore.step_with_batch_queue,
                             EngineCore_MluHijack.step_with_batch_queue)
MluHijackObject.apply_hijack(EngineCoreProc,
                             EngineCoreProc.run_engine_core,
                             EngineCoreProc_MluHijack.run_engine_core)
MluHijackObject.apply_hijack(EngineCoreProc,
                             EngineCoreProc._process_input_queue,
                             EngineCoreProc_MluHijack._process_input_queue)
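
The hijacked EngineCore keeps the upstream step contract: step_fn (either step or step_with_batch_queue) returns the per-client EngineCoreOutputs plus a flag saying whether the model actually ran, and when the mlu_envs toggles VLLM_LATENCY_DEBUG_EN / VLLM_LATENCY_DEBUG_WITH_DEVICE_EN are set, per-step timings accumulate in step_latency, model_exec_latency and mm_encoder_latency. A rough sketch of how an offline benchmark driver might consume this, assuming direct in-process access to an already-constructed EngineCore with requests queued in its scheduler:

# Hypothetical offline-benchmark loop; not part of this diff.
def drain_and_report(engine_core):
    while engine_core.scheduler.has_requests():
        # outputs maps client index -> EngineCoreOutputs; executed is False when
        # nothing was scheduled in this step.
        outputs, executed = engine_core.step_fn()

    # Populated only when VLLM_LATENCY_DEBUG_EN was enabled.
    step_lat, exec_lat, mm_lat = engine_core.get_latency()
    if step_lat:
        print(f"avg step latency: {sum(step_lat) / len(step_lat):.4f}s over {len(step_lat)} steps")
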
vllm_mlu/v1/engine/core_client.py (new file)
@@ -0,0 +1,227 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from vllm.v1.engine.core_client import (
    EngineCoreClient,
    InprocClient,
    MPClient,
    SyncMPClient,
    AsyncMPClient,
    DPAsyncMPClient,
    DPLBAsyncMPClient,
)
from vllm.v1.engine import EngineCoreRequest
from vllm.config import VllmConfig
from vllm.v1.executor import Executor

from vllm_mlu.mlu_hijack_utils import MluHijackObject

class EngineCoreClient_MluHijack(EngineCoreClient):

    @staticmethod
    def make_async_mp_client(
        vllm_config: VllmConfig,
        executor_class: type[Executor],
        log_stats: bool,
        client_addresses: dict[str, str] | None = None,
        client_count: int = 1,
        client_index: int = 0,
    ) -> "MPClient":
        parallel_config = vllm_config.parallel_config
        client_args = (
            vllm_config,
            executor_class,
            log_stats,
            client_addresses,
            client_count,
            client_index,
        )
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: disagg uses DPAsyncMPClient instead of DPLBAsyncMPClient.
        '''
        if parallel_config.data_parallel_size > 1:
            if parallel_config.data_parallel_external_lb or vllm_config.kv_transfer_config is not None:
                # External load balancer - client per DP rank.
                return DPAsyncMPClient(*client_args)
            # Internal load balancer - client balances to all DP ranks.
            return DPLBAsyncMPClient(*client_args)
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        return AsyncMPClient(*client_args)


class InprocClient_MluHijack(InprocClient):

    def get_hfu_info(self, batch, input_len, output_len):
        return self.engine_core.get_hfu_info(batch, input_len, output_len)

    def get_latency(self):
        return self.engine_core.get_latency()

    def get_memory_usage(self):
        return self.engine_core.get_memory_usage()

    def recapture_model(
        self,
        prefill_enable_mlugraph: bool,
        batch_size: int,
        input_len: int,
    ):
        return self.engine_core.recapture_model(
            prefill_enable_mlugraph, batch_size, input_len
        )

    def init_metric(self, use_unchunk_sched: bool, min_prefill_batch: int):
        return self.engine_core.init_metric(
            use_unchunk_sched, min_prefill_batch,
        )

    def start_scheduler_profile(self):
        self.engine_core.start_scheduler_profile()

    def stop_scheduler_profile(self):
        self.engine_core.stop_scheduler_profile()

    def response_remote_alloc_once(self) -> None:
        self.engine_core.response_remote_alloc_once()


class SyncMPClient_MluHijack(SyncMPClient):

    def get_hfu_info(self, batch, input_len, output_len):
        try:
            return self.call_utility("get_hfu_info", batch, input_len, output_len)
        except Exception as e:
            raise RuntimeError(f"Failed to get HFU info: {str(e)}") from e

    def get_latency(self):
        return self.call_utility("get_latency")

    def get_memory_usage(self):
        return self.call_utility("get_memory_usage")

    def recapture_model(self,
                        prefill_enable_mlugraph: bool,
                        batch_size: int,
                        input_len: int):
        return self.call_utility("recapture_model",
                                 prefill_enable_mlugraph, batch_size, input_len)

    def init_metric(self, use_unchunk_sched: bool, min_prefill_batch: int):
        return self.call_utility("init_metric",
                                 use_unchunk_sched,
                                 min_prefill_batch)

    def start_scheduler_profile(self):
        self.call_utility("start_scheduler_profile")

    def stop_scheduler_profile(self):
        self.call_utility("stop_scheduler_profile")

    def response_remote_alloc_once(self) -> None:
        self.call_utility("response_remote_alloc_once")


class AsyncMPClient_MluHijack(AsyncMPClient):

    async def start_scheduler_profile(self) -> None:
        await self.call_utility_async("start_scheduler_profile")

    async def stop_scheduler_profile(self) -> None:
        await self.call_utility_async("stop_scheduler_profile")

    async def response_remote_alloc_once(self) -> None:
        await self.call_utility_async("response_remote_alloc_once")


class DPAsyncMPClient_MluHijack(DPAsyncMPClient):

    def get_core_engine_for_request(self, request: EngineCoreRequest):

        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: disagg needs the proxy to assign dp_rank
        '''
        if request.data_parallel_rank is not None:
            # engines are already in rank order
            return self.core_engines[request.data_parallel_rank]
        '''
        ==================
        End of MLU Hijack
        ==================
        '''

        return self.core_engine


MluHijackObject.apply_hijack(EngineCoreClient,
                             EngineCoreClient.make_async_mp_client,
                             EngineCoreClient_MluHijack.make_async_mp_client)
MluHijackObject.apply_hijack(InprocClient,
                             "get_hfu_info",
                             InprocClient_MluHijack.get_hfu_info)
MluHijackObject.apply_hijack(InprocClient,
                             "get_latency",
                             InprocClient_MluHijack.get_latency)
MluHijackObject.apply_hijack(InprocClient,
                             "get_memory_usage",
                             InprocClient_MluHijack.get_memory_usage)
MluHijackObject.apply_hijack(InprocClient,
                             "recapture_model",
                             InprocClient_MluHijack.recapture_model)
MluHijackObject.apply_hijack(InprocClient,
                             "init_metric",
                             InprocClient_MluHijack.init_metric)
MluHijackObject.apply_hijack(InprocClient,
                             "start_scheduler_profile",
                             InprocClient_MluHijack.start_scheduler_profile)
MluHijackObject.apply_hijack(InprocClient,
                             "stop_scheduler_profile",
                             InprocClient_MluHijack.stop_scheduler_profile)
MluHijackObject.apply_hijack(InprocClient,
                             "response_remote_alloc_once",
                             InprocClient_MluHijack.response_remote_alloc_once)
MluHijackObject.apply_hijack(SyncMPClient,
                             "get_hfu_info",
                             SyncMPClient_MluHijack.get_hfu_info)
MluHijackObject.apply_hijack(SyncMPClient,
                             "get_latency",
                             SyncMPClient_MluHijack.get_latency)
MluHijackObject.apply_hijack(SyncMPClient,
                             "get_memory_usage",
                             SyncMPClient_MluHijack.get_memory_usage)
MluHijackObject.apply_hijack(SyncMPClient,
                             "recapture_model",
                             SyncMPClient_MluHijack.recapture_model)
MluHijackObject.apply_hijack(SyncMPClient,
                             "init_metric",
                             SyncMPClient_MluHijack.init_metric)
MluHijackObject.apply_hijack(SyncMPClient,
                             "start_scheduler_profile",
                             SyncMPClient_MluHijack.start_scheduler_profile)
MluHijackObject.apply_hijack(SyncMPClient,
                             "stop_scheduler_profile",
                             SyncMPClient_MluHijack.stop_scheduler_profile)
MluHijackObject.apply_hijack(SyncMPClient,
                             "response_remote_alloc_once",
                             SyncMPClient_MluHijack.response_remote_alloc_once)
MluHijackObject.apply_hijack(AsyncMPClient,
                             "start_scheduler_profile",
                             AsyncMPClient_MluHijack.start_scheduler_profile)
MluHijackObject.apply_hijack(AsyncMPClient,
                             "stop_scheduler_profile",
                             AsyncMPClient_MluHijack.stop_scheduler_profile)
MluHijackObject.apply_hijack(AsyncMPClient,
                             "response_remote_alloc_once",
                             AsyncMPClient_MluHijack.response_remote_alloc_once)
MluHijackObject.apply_hijack(DPAsyncMPClient,
                             DPAsyncMPClient.get_core_engine_for_request,
                             DPAsyncMPClient_MluHijack.get_core_engine_for_request)
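
The client-selection change is the heart of the disaggregation support here: when a kv_transfer_config is present, every DP rank gets its own DPAsyncMPClient so an external proxy can pick the rank itself (see get_core_engine_for_request above), instead of letting DPLBAsyncMPClient balance requests internally. The decision reduces to the following standalone restatement of the hijacked make_async_mp_client logic (function and argument names invented for illustration):

def pick_client_kind(dp_size: int, external_lb: bool, has_kv_transfer: bool) -> str:
    # Mirrors the branch order of the hijacked make_async_mp_client above.
    if dp_size > 1:
        if external_lb or has_kv_transfer:
            # One client per DP rank; the proxy decides which rank serves a request.
            return "DPAsyncMPClient"
        # Internal load balancer spreads requests across all DP ranks.
        return "DPLBAsyncMPClient"
    return "AsyncMPClient"

assert pick_client_kind(dp_size=2, external_lb=False, has_kv_transfer=True) == "DPAsyncMPClient"
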
43
vllm_mlu/v1/engine/llm_engine.py
Normal file
43
vllm_mlu/v1/engine/llm_engine.py
Normal file
@@ -0,0 +1,43 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from vllm.v1.engine.llm_engine import LLMEngine
from vllm_mlu.mlu_hijack_utils import MluHijackObject


def vllm__engine__llm_engine__LLMEngine__get_hfu_info(self, batch, input_len, output_len):
    return self.engine_core.get_hfu_info(batch, input_len, output_len)


def vllm__engine__llm_engine__LLMEngine__get_latency(self):
    return self.engine_core.get_latency()


def vllm__engine__llm_engine__LLMEngine__get_memory_usage(self):
    return self.engine_core.get_memory_usage()


def vllm__engine__llm_engine__LLMEngine__start_scheduler_profile(self):
    self.engine_core.start_scheduler_profile()


def vllm__engine__llm_engine__LLMEngine__stop_scheduler_profile(self):
    self.engine_core.stop_scheduler_profile()


MluHijackObject.apply_hijack(LLMEngine,
                             "get_hfu_info",
                             vllm__engine__llm_engine__LLMEngine__get_hfu_info)
MluHijackObject.apply_hijack(LLMEngine,
                             "get_latency",
                             vllm__engine__llm_engine__LLMEngine__get_latency)
MluHijackObject.apply_hijack(LLMEngine,
                             "get_memory_usage",
                             vllm__engine__llm_engine__LLMEngine__get_memory_usage)
MluHijackObject.apply_hijack(LLMEngine,
                             "start_scheduler_profile",
                             vllm__engine__llm_engine__LLMEngine__start_scheduler_profile)
MluHijackObject.apply_hijack(LLMEngine,
                             "stop_scheduler_profile",
                             vllm__engine__llm_engine__LLMEngine__stop_scheduler_profile)
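
Taken together, the LLMEngine hooks expose the MLU benchmark and profiling surface to offline users. A hedged usage sketch, assuming the vllm_mlu plugin is installed so the hijacks are applied at import time, and assuming the synchronous vllm.LLM wrapper exposes its engine as llm.llm_engine ("facebook/opt-125m" is only a placeholder model name):

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")

llm.llm_engine.start_scheduler_profile()
llm.generate(["hello"], SamplingParams(max_tokens=8))
llm.llm_engine.stop_scheduler_profile()

# Offline-benchmark helpers added by this commit (forwarded to EngineCore):
step_lat, exec_lat, mm_lat = llm.llm_engine.get_latency()
peak_mem, block_mem, num_gpu_blocks, num_cpu_blocks = llm.llm_engine.get_memory_usage()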