[Model] Support DeepSeek-V4

chenxb002
2026-04-24 09:50:34 +08:00
commit b9925203b8
172 changed files with 44780 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project

View File

@@ -0,0 +1,136 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
from vllm.logger import init_logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.request import Request, RequestStatus
from vllm_mlu.v1.core.sched.scheduler import MLUUnchunkScheduler, SchedulerWithProfiler
logger = init_logger(__name__)
class AsyncScheduler(SchedulerWithProfiler):
def _update_after_schedule(
self,
scheduler_output: SchedulerOutput,
) -> None:
super()._update_after_schedule(scheduler_output)
pending_structured_output_tokens = False
spec_decode_tokens = scheduler_output.scheduled_spec_decode_tokens
for req_id in scheduler_output.num_scheduled_tokens:
request = self.requests[req_id]
pending_structured_output_tokens |= (
request.use_structured_output and request.num_output_placeholders > 0
)
cur_num_spec_tokens = len(spec_decode_tokens.get(req_id, ()))
if (
request.num_computed_tokens
== request.num_tokens
+ request.num_output_placeholders
+ cur_num_spec_tokens
):
# The request will generate one new token plus cur_num_spec_tokens
# speculative tokens in this scheduling step.
request.num_output_placeholders += 1 + cur_num_spec_tokens
# Add placeholders for the new tokens in spec_token_ids; the actual
# spec token ids will be filled in by the worker process.
request.spec_token_ids = [-1] * self.num_spec_tokens
scheduler_output.pending_structured_output_tokens = (
pending_structured_output_tokens
)
def _update_request_with_output(
self,
request: Request,
new_token_ids: list[int],
) -> tuple[list[int], bool]:
status_before_update = request.status
new_token_ids, stopped = super()._update_request_with_output(
request, new_token_ids
)
# Update the number of output placeholders.
request.num_output_placeholders -= len(new_token_ids)
assert request.num_output_placeholders >= 0
# Cache the new tokens. Preempted requests should be skipped.
if status_before_update == RequestStatus.RUNNING:
self.kv_cache_manager.cache_blocks(
request, request.num_computed_tokens - request.num_output_placeholders
)
return new_token_ids, stopped
class MLUUnchunkAsyncScheduler(MLUUnchunkScheduler):
def _update_after_schedule(
self,
scheduler_output: SchedulerOutput,
) -> None:
super()._update_after_schedule(scheduler_output)
spec_decode_tokens = scheduler_output.scheduled_spec_decode_tokens
for req_id in scheduler_output.num_scheduled_tokens:
request = self.requests[req_id]
cur_num_spec_tokens = len(spec_decode_tokens.get(req_id, []))
if (
request.num_computed_tokens
== request.num_tokens
+ request.num_output_placeholders
+ cur_num_spec_tokens
):
# The request will generate one new token plus cur_num_spec_tokens
# speculative tokens in this scheduling step.
request.num_output_placeholders += 1 + cur_num_spec_tokens
# Add placeholders for the new tokens in spec_token_ids: the actual
# token ids are not known yet, so use -1 and set the length of
# spec_token_ids to self.num_spec_tokens. The actual spec token ids
# will be filled in by the worker process.
request.spec_token_ids = [-1] * self.num_spec_tokens
def _update_request_with_output(
self,
request: Request,
new_token_ids: list[int],
) -> tuple[list[int], bool]:
status_before_update = request.status
new_token_ids, stopped = super()._update_request_with_output(
request, new_token_ids)
# num_output_placeholders == 0 happens when a request is preempted:
# a preempted request is put back on the waiting queue and its
# num_output_placeholders is reset to 0, so there is nothing to
# revert in that case.
if request.num_output_placeholders > 0:
# Update the number of output placeholders.
request.num_output_placeholders -= len(new_token_ids)
assert request.num_output_placeholders >= 0
# Cache the new tokens. Preempted requests should be skipped.
if status_before_update == RequestStatus.RUNNING:
self.kv_cache_manager.cache_blocks(
request,
request.num_computed_tokens - request.num_output_placeholders)
return new_token_ids, stopped
def _update_computed_tokens_after_speculation(
self, request: Request, num_rejected: int
):
"""Update the computed tokens for each request, which is necessary
for spec decoding. In sync scheduler, we need to revert
num_computed_tokens by num_rejected tokens,
but in async scheduler, we also need to revert num_output_placeholders
by num_rejected tokens for spec decoding.
"""
# num_computed_tokens == 0 happens when a request is preempted:
# a preempted request is put back on the waiting queue and its
# num_computed_tokens is reset to 0, so there is nothing to revert
# in that case.
if request.num_computed_tokens > 0:
# When spec decoding is enabled, num_output_placeholders is increased
# by num_spec_tokens in _update_after_schedule. Revert it here so it
# reflects the actual number of accepted output tokens.
request.num_output_placeholders -= num_rejected
super()._update_computed_tokens_after_speculation(request, num_rejected)
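
For reference, a minimal standalone sketch of the placeholder bookkeeping the async schedulers above perform. ToyRequest, NUM_SPEC_TOKENS, and the helper functions are illustrative stand-ins, not the real vLLM classes; only the "reserve 1 + num_spec_tokens, retire on real output, roll back on rejection" pattern mirrors the code above.

# Illustrative sketch only: toy bookkeeping of num_output_placeholders with
# speculative decoding; ToyRequest is a stand-in, not a real vLLM class.
from dataclasses import dataclass, field

NUM_SPEC_TOKENS = 2

@dataclass
class ToyRequest:
    num_tokens: int  # prompt tokens + accepted output tokens
    num_computed_tokens: int = 0
    num_output_placeholders: int = 0
    spec_token_ids: list[int] = field(default_factory=list)

def after_schedule(req: ToyRequest) -> None:
    # Everything scheduled so far (real tokens + placeholders + spec tokens)
    # has been computed, so reserve placeholders for this step's outputs.
    cur_num_spec = len(req.spec_token_ids)
    if req.num_computed_tokens == (
        req.num_tokens + req.num_output_placeholders + cur_num_spec
    ):
        req.num_output_placeholders += 1 + cur_num_spec
        req.spec_token_ids = [-1] * NUM_SPEC_TOKENS  # filled in by the worker later

def after_speculation(req: ToyRequest, num_rejected: int) -> None:
    # Rejected draft tokens were counted both as computed tokens and as
    # placeholders; roll both back so only accepted tokens remain counted.
    if req.num_computed_tokens > 0:
        req.num_output_placeholders -= num_rejected
        req.num_computed_tokens -= num_rejected

def on_output(req: ToyRequest, new_token_ids: list[int]) -> None:
    # Real token ids arrived from the worker: retire the same number of
    # placeholders and account for the new output tokens.
    req.num_output_placeholders -= len(new_token_ids)
    assert req.num_output_placeholders >= 0
    req.num_tokens += len(new_token_ids)

req = ToyRequest(num_tokens=8, num_computed_tokens=8)
after_schedule(req)   # reserves 1 placeholder; no spec tokens on the first step
on_output(req, [42])  # one real token comes back, placeholder count returns to 0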

View File

@@ -0,0 +1,111 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
from dataclasses import dataclass
from functools import cached_property
from typing import TYPE_CHECKING
from typing_extensions import deprecated
from vllm._bc_linter import bc_linter_include
if TYPE_CHECKING:
import numpy as np
import numpy.typing as npt
import torch
from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorMetadata
from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
from vllm.lora.request import LoRARequest
from vllm.multimodal.inputs import MultiModalFeatureSpec
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.v1.request import Request
else:
ECConnectorMetadata = object
KVConnectorMetadata = object
LoRARequest = object
MultiModalFeatureSpec = object
PoolingParams = object
SamplingParams = object
Request = object
'''
=============================
Modified by vllm_mlu
=============================
@brief: Add new_token_ids to pass the first token generated
by the prefiller to the decoder's model_runner.
'''
@bc_linter_include
@dataclass
class NewRequestData:
req_id: str
prompt_token_ids: list[int] | None
mm_features: list[MultiModalFeatureSpec]
sampling_params: SamplingParams | None
pooling_params: PoolingParams | None
block_ids: tuple[list[int], ...]
num_computed_tokens: int
lora_request: LoRARequest | None
new_token_ids: list[list[int]]
prompt_embeds: "torch.Tensor | None" = None
@classmethod
def from_request(
cls,
request: Request,
block_ids: tuple[list[int], ...],
) -> "NewRequestData":
return cls(
req_id=request.request_id,
prompt_token_ids=request.prompt_token_ids,
mm_features=request.mm_features,
sampling_params=request.sampling_params,
pooling_params=request.pooling_params,
block_ids=block_ids,
num_computed_tokens=request.num_computed_tokens,
lora_request=request.lora_request,
prompt_embeds=request.prompt_embeds,
new_token_ids=request._output_token_ids,
)
def __repr__(self) -> str:
prompt_embeds_shape = self.prompt_embeds.shape if self.prompt_embeds is not None else None
return (
f"NewRequestData("
f"req_id={self.req_id},"
f"prompt_token_ids={self.prompt_token_ids},"
f"mm_features={self.mm_features},"
f"sampling_params={self.sampling_params},"
f"block_ids={self.block_ids},"
f"num_computed_tokens={self.num_computed_tokens},"
f"lora_request={self.lora_request},"
f"prompt_embeds_shape={prompt_embeds_shape},"
f"new_token_ids={self.new_token_ids}"
")"
)
# Version of __repr__ with the prompt data obfuscated
def anon_repr(self) -> str:
prompt_token_ids_len = (
len(self.prompt_token_ids) if self.prompt_token_ids is not None else None
)
prompt_embeds_shape = self.prompt_embeds.shape if self.prompt_embeds is not None else None
return (
f"NewRequestData("
f"req_id={self.req_id},"
f"prompt_token_ids_len={prompt_token_ids_len},"
f"mm_features={self.mm_features},"
f"sampling_params={self.sampling_params},"
f"block_ids={self.block_ids},"
f"num_computed_tokens={self.num_computed_tokens},"
f"lora_request={self.lora_request},"
f"prompt_embeds_shape={prompt_embeds_shape}"
")"
)
'''
==================
End of MLU Hijack
==================
'''
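
As a usage illustration (not part of the diff), the added new_token_ids field can be populated and inspected like this; every argument value below is made up, and the optional parameter objects are simply left as None:

# Illustrative only: the extra new_token_ids field carries the token(s)
# generated by the prefill instance over to the decode instance.
new_req = NewRequestData(
    req_id="req-0",
    prompt_token_ids=[1, 2, 3],
    mm_features=[],
    sampling_params=None,
    pooling_params=None,
    block_ids=([0, 1],),
    num_computed_tokens=3,
    lora_request=None,
    new_token_ids=[[17]],  # first token produced by the prefiller
)
print(new_req.anon_repr())  # prompt contents replaced by their length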

File diff suppressed because it is too large