[Model] Support DeepSeek-V4
3
vllm_mlu/v1/core/__init__.py
Normal file
@@ -0,0 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

146
vllm_mlu/v1/core/kv_cache_manager.py
Normal file
@@ -0,0 +1,146 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

import itertools
from collections.abc import Sequence
from dataclasses import dataclass
from typing import Literal, overload

from vllm.distributed.kv_events import KVCacheEvent
from vllm.logger import init_logger
from vllm.v1.core.kv_cache_coordinator import get_kv_cache_coordinator
from vllm.v1.core.kv_cache_utils import KVCacheBlock
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.metrics.stats import PrefixCacheStats
from vllm.v1.request import Request

logger = init_logger(__name__)

from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager
from vllm_mlu.mlu_hijack_utils import MluHijackObject


class KVCacheManager_MluHijack(KVCacheManager):

    def allocate_slots(
        self,
        request: Request,
        num_new_tokens: int,
        num_new_computed_tokens: int = 0,
        new_computed_blocks: KVCacheBlocks | None = None,
        num_lookahead_tokens: int = 0,
        delay_cache_blocks: bool = False,
        num_encoder_tokens: int = 0,
        fixed_window_tokens: int = 0,
    ) -> KVCacheBlocks | None:
        """Add slots for a request with new tokens to append.

        Args:
            request: The request to allocate slots.
            num_new_tokens: The number of tokens to allocate, including external
                tokens. Note that this does not include tokens that have
                already been computed locally (i.e. new_computed_blocks).
            num_new_computed_tokens: The number of new computed tokens just
                hitting the prefix caching, excluding external tokens.
            new_computed_blocks: The cached blocks for the above new computed
                tokens.
            num_lookahead_tokens: The number of speculative tokens to allocate.
                This is used by spec decode proposers with kv-cache such
                as eagle.
            delay_cache_blocks: Whether to skip caching the blocks. This is
                used by P/D when allocating blocks used in a KV transfer
                which will complete in a future step.

        Blocks layout:
        ```
        -----------------------------------------------------------------------
        | < computed > | < new computed > |    < new >    | < pre-allocated > |
        -----------------------------------------------------------------------
        |                  < required >                   |
        --------------------------------------------------
        |                    < full >                   |
        ------------------------------------------------
                                          | <new full> |
                                          --------------
        ```
        The following *_blocks are illustrated in this layout.

        Returns:
            A list of new allocated blocks.
        """
        if num_new_tokens == 0:
            raise ValueError("num_new_tokens must be greater than 0")

        if new_computed_blocks is not None:
            new_computed_block_list = new_computed_blocks.blocks
        else:
            new_computed_block_list = self.empty_kv_cache_blocks.blocks

        # Free the blocks that are skipped during the attention computation
        # (e.g., tokens outside the sliding window).
        # We can do this even if we cannot schedule this request due to
        # insufficient free blocks.
        # Should call this function before allocating new blocks to reduce
        # the number of evicted blocks.
        self.coordinator.remove_skipped_blocks(
            request.request_id, request.num_computed_tokens
        )

        # The number of computed tokens is the number of computed tokens plus
        # the new prefix caching hits
        num_computed_tokens = request.num_computed_tokens + num_new_computed_tokens
        num_tokens_need_slot = min(
            num_computed_tokens + num_new_tokens + num_lookahead_tokens + fixed_window_tokens,
            self.max_model_len,
        )

        num_blocks_to_allocate = self.coordinator.get_num_blocks_to_allocate(
            request_id=request.request_id,
            num_tokens=num_tokens_need_slot,
            new_computed_blocks=new_computed_block_list,
            num_encoder_tokens=num_encoder_tokens,
        )

        if num_blocks_to_allocate > self.block_pool.get_num_free_blocks():
            # Cannot allocate new blocks
            return None

        # Touch the computed blocks to make sure they won't be evicted.
        if self.enable_caching:
            self.block_pool.touch(new_computed_block_list)
        else:
            assert not any(new_computed_block_list), (
                "Computed blocks should be empty when prefix caching is disabled"
            )

        if new_computed_block_list is not self.empty_kv_cache_blocks.blocks:
            # Append the new computed blocks to the request blocks until now to
            # avoid the case where the new blocks cannot be allocated.
            self.coordinator.save_new_computed_blocks(
                request.request_id, new_computed_block_list
            )

        new_blocks = self.coordinator.allocate_new_blocks(
            request.request_id, num_tokens_need_slot, num_encoder_tokens
        )

        # P/D: delay caching blocks if we have to recv from
        # remote. Update state for locally cached blocks.
        if not self.enable_caching or delay_cache_blocks:
            return self.create_kv_cache_blocks(new_blocks)

        # NOTE(woosuk): We want to commit (cache) up to num_computed_tokens +
        # num_new_tokens, but must exclude "non-committable" tokens (e.g.,
        # draft tokens that could be rejected). Therefore, we cap the number
        # at `request.num_tokens`, ensuring only "finalized" tokens are cached.
        num_tokens_to_cache = min(
            num_computed_tokens + num_new_tokens, request.num_tokens
        )
        self.coordinator.cache_blocks(request, num_tokens_to_cache)

        return self.create_kv_cache_blocks(new_blocks)


MluHijackObject.apply_hijack(KVCacheManager,
                             KVCacheManager.allocate_slots,
                             KVCacheManager_MluHijack.allocate_slots)
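The apply_hijack call above swaps vLLM's KVCacheManager.allocate_slots for the MLU override at import time. A minimal sketch of such a patch helper, assuming it simply rebinds the attribute and remembers the original (the real MluHijackObject in vllm_mlu.mlu_hijack_utils may work differently):

# Hypothetical patch helper in the spirit of MluHijackObject.apply_hijack;
# not the actual vllm_mlu implementation.
class SimpleHijack:
    """Replaces an attribute on a class or module and remembers the original."""

    def __init__(self) -> None:
        self._patches: list[tuple[object, str, object]] = []

    def apply_hijack(self, target, original_fn, replacement_fn) -> None:
        # The attribute name is taken from the function being replaced.
        name = original_fn.__name__
        self._patches.append((target, name, original_fn))
        setattr(target, name, replacement_fn)

    def undo_all(self) -> None:
        # Restore the originals in reverse order of patching.
        for target, name, original_fn in reversed(self._patches):
            setattr(target, name, original_fn)


# Usage mirroring the call above (illustrative only):
# hijack = SimpleHijack()
# hijack.apply_hijack(KVCacheManager,
#                     KVCacheManager.allocate_slots,
#                     KVCacheManager_MluHijack.allocate_slots)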
123
vllm_mlu/v1/core/kv_cache_utils.py
Normal file
@@ -0,0 +1,123 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from vllm.logger import init_logger
from vllm_mlu.mlu_hijack_utils import MluHijackObject
from vllm.config import VllmConfig
from vllm.v1.kv_cache_interface import (
    KVCacheConfig,
    KVCacheGroupSpec,
    KVCacheSpec,
    KVCacheTensor,
    UniformTypeKVCacheSpecs,
)
from vllm.v1.core import kv_cache_utils
from vllm.v1.core.kv_cache_utils import (may_override_num_blocks,
                                          get_uniform_page_size,
                                          get_num_blocks)

logger = init_logger(__name__)


def vllm__v1__core__kv_cache_utils__get_kv_cache_config_from_groups(
    vllm_config: VllmConfig,
    kv_cache_groups: list[KVCacheGroupSpec],
    kv_cache_specs: dict[str, KVCacheSpec],
    available_memory: int,
) -> KVCacheConfig:
    """
    Generate the KV cache configuration from the KV cache groups and spec
    of each layer.

    Args:
        vllm_config: The global VllmConfig
        kv_cache_groups: The KV cache groups
        kv_cache_specs: The KV cache spec of each attention layer in the model
        available_memory: Memory available for KV cache in bytes
    Returns:
        The generated KVCacheConfig
    """
    if len(kv_cache_groups) == 0:
        # Attention free models do not have KV cache.
        # Return num_blocks=1 as BlockPool always needs a null_block.
        return KVCacheConfig(
            num_blocks=1,
            kv_cache_tensors=[],
            kv_cache_groups=kv_cache_groups,
        )

    # Determine how model runners should initialize the KV cache tensors.
    if len(kv_cache_groups) == 1 and isinstance(
        kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs
    ):
        # Special case: all layers have the same type of KV cache but with
        # different hidden size. Allocate different amount of memory for each
        # layer based on its hidden size.
        num_blocks = (
            available_memory // kv_cache_groups[0].kv_cache_spec.page_size_bytes
        )
        num_blocks = may_override_num_blocks(vllm_config, num_blocks)
        per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs
        kv_cache_tensors = [
            KVCacheTensor(
                size=per_layer_specs[layer_name].page_size_bytes * num_blocks,
                shared_by=[layer_name],
            )
            for layer_name in kv_cache_groups[0].layer_names
        ]
    else:
        # General case:
        # We will have group_size memory pools, each is shared by one layer from
        # each group. As layers of different groups have different block table,
        # they will use different parts of the shared Tensor.
        # The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2),
        # (sw.1, padding) will be: (group_size = 2)
        # full.0, sw.0, sw.1: share a Tensor with size=available_memory//2
        # full.1, sw.2: share another Tensor with size=available_memory//2
        group_size = max(len(group.layer_names) for group in kv_cache_groups)

        page_size = get_uniform_page_size(kv_cache_specs)
        '''
        =============================
        Modify by vllm_mlu
        =============================
        @brief: support qwen3-next
        '''
        if vllm_config.mlu_config.enable_mamba_split_page_size:
            # Note(wulingchao): Reserve the linear-attention memory up front so
            # it is not managed by the regular block scheduler. The current
            # page_size is the small (attention) page, so scale it up to a full
            # linear-attention page.
            mamba_page_size = (
                page_size
                * vllm_config.mlu_config.mamba_to_attn_block_ratio
                * vllm_config.mlu_config.mamba_support_max_batch_size
                * group_size * 3
            )
            logger.warning(
                f"all available memory {available_memory}, "
                f"mamba mem used {mamba_page_size}"
            )
            available_memory = available_memory - mamba_page_size
        '''
        ==================
        End of MLU Hijack
        ==================
        '''
        assert group_size > 0, "group_size must be greater than 0"
        num_blocks = get_num_blocks(
            vllm_config, group_size, available_memory, page_size
        )
        kv_cache_tensors = []
        for i in range(group_size):
            shared_by = []
            for j in range(len(kv_cache_groups)):
                if i < len(kv_cache_groups[j].layer_names):
                    shared_by.append(kv_cache_groups[j].layer_names[i])
            kv_cache_tensors.append(
                KVCacheTensor(size=page_size * num_blocks, shared_by=shared_by)
            )

    return KVCacheConfig(
        num_blocks=num_blocks,
        kv_cache_tensors=kv_cache_tensors,
        kv_cache_groups=kv_cache_groups,
    )


MluHijackObject.apply_hijack(kv_cache_utils,
                             kv_cache_utils.get_kv_cache_config_from_groups,
                             vllm__v1__core__kv_cache_utils__get_kv_cache_config_from_groups)
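The grouped tensor layout described in the comments of the general case above can be traced with a small standalone sketch; the layer names, memory sizes, and block math below are hypothetical and use no vLLM types (get_num_blocks and may_override_num_blocks are not involved):

# Standalone illustration of the grouped KV cache tensor layout described above.
# Hypothetical layer names and sizes; no vLLM types involved.
groups = [["full.0", "full.1"], ["sw.0", "sw.2"], ["sw.1"]]
available_memory = 2 * 1024**3        # 2 GiB (hypothetical)
page_size = 2 * 1024**2               # bytes per block per layer (hypothetical)

group_size = max(len(g) for g in groups)                    # 2 memory pools
num_blocks = available_memory // (page_size * group_size)   # 512 blocks

tensors = []
for i in range(group_size):
    # Pool i is shared by the i-th layer of every group that has one.
    shared_by = [g[i] for g in groups if i < len(g)]
    tensors.append((page_size * num_blocks, shared_by))

# tensors[0] -> (1 GiB, ['full.0', 'sw.0', 'sw.1'])
# tensors[1] -> (1 GiB, ['full.1', 'sw.2'])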
3
vllm_mlu/v1/core/sched/__init__.py
Normal file
@@ -0,0 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

136
vllm_mlu/v1/core/sched/async_scheduler.py
Normal file
@@ -0,0 +1,136 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from vllm.logger import init_logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.request import Request, RequestStatus

from vllm_mlu.v1.core.sched.scheduler import MLUUnchunkScheduler, SchedulerWithProfiler

logger = init_logger(__name__)


class AsyncScheduler(SchedulerWithProfiler):
    def _update_after_schedule(
        self,
        scheduler_output: SchedulerOutput,
    ) -> None:
        super()._update_after_schedule(scheduler_output)
        pending_structured_output_tokens = False
        spec_decode_tokens = scheduler_output.scheduled_spec_decode_tokens
        for req_id in scheduler_output.num_scheduled_tokens:
            request = self.requests[req_id]
            pending_structured_output_tokens |= (
                request.use_structured_output and request.num_output_placeholders > 0
            )
            cur_num_spec_tokens = len(spec_decode_tokens.get(req_id, ()))
            if (
                request.num_computed_tokens
                == request.num_tokens
                + request.num_output_placeholders
                + cur_num_spec_tokens
            ):
                # The request will generate a new token plus num_spec_tokens
                # in this scheduling step.
                request.num_output_placeholders += 1 + cur_num_spec_tokens
                # Add placeholders for the new tokens in spec_token_ids.
                # We will update the actual spec token ids in the worker process.
                request.spec_token_ids = [-1] * self.num_spec_tokens

        scheduler_output.pending_structured_output_tokens = (
            pending_structured_output_tokens
        )

    def _update_request_with_output(
        self,
        request: Request,
        new_token_ids: list[int],
    ) -> tuple[list[int], bool]:
        status_before_update = request.status
        new_token_ids, stopped = super()._update_request_with_output(
            request, new_token_ids
        )

        # Update the number of output placeholders.
        request.num_output_placeholders -= len(new_token_ids)
        assert request.num_output_placeholders >= 0

        # Cache the new tokens. Preempted requests should be skipped.
        if status_before_update == RequestStatus.RUNNING:
            self.kv_cache_manager.cache_blocks(
                request, request.num_computed_tokens - request.num_output_placeholders
            )
        return new_token_ids, stopped


class MLUUnchunkAsyncScheduler(MLUUnchunkScheduler):
    def _update_after_schedule(
        self,
        scheduler_output: SchedulerOutput,
    ) -> None:
        super()._update_after_schedule(scheduler_output)
        spec_decode_tokens = scheduler_output.scheduled_spec_decode_tokens
        for req_id in scheduler_output.num_scheduled_tokens:
            request = self.requests[req_id]
            cur_num_spec_tokens = len(spec_decode_tokens.get(req_id, []))
            if (
                request.num_computed_tokens
                == request.num_tokens
                + request.num_output_placeholders
                + cur_num_spec_tokens
            ):
                # The request will generate a new token plus num_spec_tokens
                # in this scheduling step.
                request.num_output_placeholders += 1 + cur_num_spec_tokens
                # Add placeholders for the new tokens in spec_token_ids. The
                # actual token ids are not known yet, so use -1 as a placeholder
                # and set the length of spec_token_ids to self.num_spec_tokens.
                # The actual spec token ids are filled in by the worker process.
                request.spec_token_ids = [-1] * self.num_spec_tokens

    def _update_request_with_output(
        self,
        request: Request,
        new_token_ids: list[int],
    ) -> tuple[list[int], bool]:
        status_before_update = request.status
        new_token_ids, stopped = super()._update_request_with_output(
            request, new_token_ids)

        # num_output_placeholders == 0 happens when a request is preempted.
        # A preempted request is put back on the waiting queue and its
        # num_output_placeholders is reset to 0, so there is nothing to
        # revert in that case.
        if request.num_output_placeholders > 0:
            # Update the number of output placeholders.
            request.num_output_placeholders -= len(new_token_ids)
            assert request.num_output_placeholders >= 0

        # Cache the new tokens. Preempted requests should be skipped.
        if status_before_update == RequestStatus.RUNNING:
            self.kv_cache_manager.cache_blocks(
                request,
                request.num_computed_tokens - request.num_output_placeholders)
        return new_token_ids, stopped

    def _update_computed_tokens_after_speculation(
        self, request: Request, num_rejected: int
    ):
        """Update the computed tokens for each request, which is necessary
        for spec decoding. In the sync scheduler we only need to revert
        num_computed_tokens by num_rejected tokens, but in the async scheduler
        we also need to revert num_output_placeholders by num_rejected tokens.
        """
        # num_computed_tokens == 0 happens when a request is preempted.
        # A preempted request is put back on the waiting queue and its
        # num_computed_tokens is reset to 0, so there is nothing to revert
        # in that case.
        if request.num_computed_tokens > 0:
            # When spec decoding is enabled, num_output_placeholders is
            # increased by num_spec_tokens in _update_after_schedule. Update
            # num_output_placeholders here to reflect the actual number of
            # accepted output tokens.
            request.num_output_placeholders -= num_rejected
        super()._update_computed_tokens_after_speculation(request, num_rejected)
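The placeholder bookkeeping above can be traced with a purely illustrative walk-through; the numbers are hypothetical and the ordering of the scheduler hooks is simplified relative to the real async flow:

# Toy trace of the num_output_placeholders accounting (hypothetical numbers).
num_spec_tokens = 2

num_tokens = 10                  # prompt + finalized output tokens so far
num_output_placeholders = 0
cur_num_spec_tokens = 2          # draft tokens scheduled in this step
num_computed_tokens = num_tokens + num_output_placeholders + cur_num_spec_tokens

# _update_after_schedule: the step promises one real token plus the drafts.
if num_computed_tokens == num_tokens + num_output_placeholders + cur_num_spec_tokens:
    num_output_placeholders += 1 + cur_num_spec_tokens     # -> 3
    spec_token_ids = [-1] * num_spec_tokens                # filled in by the worker later

# _update_computed_tokens_after_speculation: one draft token is rejected.
num_rejected = 1
num_output_placeholders -= num_rejected                    # -> 2

# _update_request_with_output: the two accepted tokens become real output.
new_token_ids = [1234, 5678]
num_output_placeholders -= len(new_token_ids)              # -> 0
assert num_output_placeholders >= 0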
111
vllm_mlu/v1/core/sched/output.py
Normal file
@@ -0,0 +1,111 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from dataclasses import dataclass
from functools import cached_property
from typing import TYPE_CHECKING

from typing_extensions import deprecated

from vllm._bc_linter import bc_linter_include

if TYPE_CHECKING:
    import numpy as np
    import numpy.typing as npt
    import torch

    from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorMetadata
    from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
    from vllm.lora.request import LoRARequest
    from vllm.multimodal.inputs import MultiModalFeatureSpec
    from vllm.pooling_params import PoolingParams
    from vllm.sampling_params import SamplingParams
    from vllm.v1.request import Request
else:
    ECConnectorMetadata = object
    KVConnectorMetadata = object
    LoRARequest = object
    MultiModalFeatureSpec = object
    PoolingParams = object
    SamplingParams = object
    Request = object

'''
=============================
Modify by vllm_mlu
=============================
@brief: Add new_token_ids to pass the first token generated
        by the prefiller to the decoder's model_runner.
'''
@bc_linter_include
@dataclass
class NewRequestData:
    req_id: str
    prompt_token_ids: list[int] | None
    mm_features: list[MultiModalFeatureSpec]
    sampling_params: SamplingParams | None
    pooling_params: PoolingParams | None
    block_ids: tuple[list[int], ...]
    num_computed_tokens: int
    lora_request: LoRARequest | None
    new_token_ids: list[list[int]]
    prompt_embeds: "torch.Tensor | None" = None

    @classmethod
    def from_request(
        cls,
        request: Request,
        block_ids: tuple[list[int], ...],
    ) -> "NewRequestData":
        return cls(
            req_id=request.request_id,
            prompt_token_ids=request.prompt_token_ids,
            mm_features=request.mm_features,
            sampling_params=request.sampling_params,
            pooling_params=request.pooling_params,
            block_ids=block_ids,
            num_computed_tokens=request.num_computed_tokens,
            lora_request=request.lora_request,
            prompt_embeds=request.prompt_embeds,
            new_token_ids=request._output_token_ids,
        )

    def __repr__(self) -> str:
        prompt_embeds_shape = (
            self.prompt_embeds.shape if self.prompt_embeds is not None else None
        )
        return (
            f"NewRequestData("
            f"req_id={self.req_id},"
            f"prompt_token_ids={self.prompt_token_ids},"
            f"mm_features={self.mm_features},"
            f"sampling_params={self.sampling_params},"
            f"block_ids={self.block_ids},"
            f"num_computed_tokens={self.num_computed_tokens},"
            f"lora_request={self.lora_request},"
            f"prompt_embeds_shape={prompt_embeds_shape},"
            f"new_token_ids={self.new_token_ids}"
            ")"
        )

    # Version of __repr__ with the prompt data obfuscated
    def anon_repr(self) -> str:
        prompt_token_ids_len = (
            len(self.prompt_token_ids) if self.prompt_token_ids is not None else None
        )
        prompt_embeds_shape = (
            self.prompt_embeds.shape if self.prompt_embeds is not None else None
        )
        return (
            f"NewRequestData("
            f"req_id={self.req_id},"
            f"prompt_token_ids_len={prompt_token_ids_len},"
            f"mm_features={self.mm_features},"
            f"sampling_params={self.sampling_params},"
            f"block_ids={self.block_ids},"
            f"num_computed_tokens={self.num_computed_tokens},"
            f"lora_request={self.lora_request},"
            f"prompt_embeds_shape={prompt_embeds_shape}"
            ")"
        )
'''
==================
End of MLU Hijack
==================
'''
1723
vllm_mlu/v1/core/sched/scheduler.py
Normal file
File diff suppressed because it is too large
21
vllm_mlu/v1/core/single_type_kv_cache_manager.py
Normal file
@@ -0,0 +1,21 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

from vllm.v1.core.single_type_kv_cache_manager import (
    FullAttentionManager,
    SlidingWindowManager,
    spec_manager_map,
)

from vllm_mlu.v1.kv_cache_interface import (
    MLUFullAttentionSpec,
    MLUMLAAttentionSpec,
    MLUSlidingWindowSpec,
)


spec_manager_map.update({
    MLUFullAttentionSpec: FullAttentionManager,
    MLUSlidingWindowSpec: SlidingWindowManager,
    MLUMLAAttentionSpec: FullAttentionManager,
})