Add minimal vLLM 0.16.1 build repo for BI-V150

This commit is d69657327e, authored 2026-04-18 10:56:22 +08:00.
It changed 1895 files, with 615301 additions and 0 deletions.

View File

@@ -0,0 +1,39 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Poolers that produce an output for each token in the sequence."""
from .heads import (
TokenClassifierPoolerHead,
TokenEmbeddingPoolerHead,
TokenPoolerHead,
TokenPoolerHeadOutputItem,
)
from .methods import (
AllPool,
StepPool,
TokenPoolingMethod,
TokenPoolingMethodOutputItem,
get_tok_pooling_method,
)
from .poolers import (
TokenPooler,
TokenPoolerOutput,
pooler_for_token_classify,
pooler_for_token_embed,
)
# Public re-export list for the token-wise pooling subpackage; mirrors the
# names imported above from the .heads, .methods and .poolers submodules.
__all__ = [
    "TokenPoolerHead",
    "TokenPoolerHeadOutputItem",
    "TokenClassifierPoolerHead",
    "TokenEmbeddingPoolerHead",
    "TokenPoolingMethod",
    "TokenPoolingMethodOutputItem",
    "AllPool",
    "StepPool",
    "get_tok_pooling_method",
    "TokenPooler",
    "TokenPoolerOutput",
    "pooler_for_token_classify",
    "pooler_for_token_embed",
]

View File

@@ -0,0 +1,133 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from collections.abc import Set
from typing import TypeAlias
import torch
import torch.nn as nn
from vllm.model_executor.layers.pooler import ActivationFn, ClassifierFn, ProjectorFn
from vllm.pooling_params import PoolingParams
from vllm.tasks import PoolingTask
from vllm.v1.pool.metadata import PoolingMetadata
from .methods import TokenPoolingMethodOutputItem
# Per-request output of a pooler head: the processed tensor, or None when the
# request is an unfinished chunked prefill and has no output yet.
TokenPoolerHeadOutputItem: TypeAlias = torch.Tensor | None
class TokenPoolerHead(nn.Module, ABC):
    """Base class for heads that post-process per-token pooled data."""

    @abstractmethod
    def get_supported_tasks(self) -> Set[PoolingTask]:
        """Return the pooling tasks this head can serve."""
        raise NotImplementedError

    @abstractmethod
    def forward_chunk(
        self,
        pooled_data: TokenPoolingMethodOutputItem,
        pooling_param: PoolingParams,
    ) -> TokenPoolerHeadOutputItem:
        """Process the pooled data of a single request."""
        raise NotImplementedError

    def forward(
        self,
        pooled_data: list[TokenPoolingMethodOutputItem],
        pooling_metadata: PoolingMetadata,
    ) -> list[TokenPoolerHeadOutputItem]:
        """Apply :meth:`forward_chunk` to each request's pooled data."""
        params = pooling_metadata.pooling_params
        assert len(pooled_data) == len(params)

        outputs: list[TokenPoolerHeadOutputItem] = []
        for item, param in zip(pooled_data, params):
            outputs.append(self.forward_chunk(item, param))
        return outputs
class TokenEmbeddingPoolerHead(TokenPoolerHead):
    """Head that turns per-token hidden states into per-token embeddings.

    Optionally casts to ``head_dtype``, applies a (SentenceTransformers-style)
    ``projector``, truncates to the request's matryoshka ``dimensions`` and
    applies ``activation`` (normalization) when the request enables it.
    """

    def __init__(
        self,
        head_dtype: torch.dtype | str | None = None,
        projector: ProjectorFn | None = None,
        activation: ActivationFn | None = None,
    ) -> None:
        super().__init__()
        self.head_dtype = head_dtype
        self.projector = projector
        self.activation = activation

    def get_supported_tasks(self) -> Set[PoolingTask]:
        return {"token_embed"}

    def forward_chunk(
        self,
        pooled_data: TokenPoolingMethodOutputItem,
        pooling_param: PoolingParams,
    ) -> TokenPoolerHeadOutputItem:
        # Unfinished chunked prefill: nothing to emit yet.
        if pooled_data is None:
            return None

        embeds = pooled_data
        if self.head_dtype is not None:
            embeds = embeds.to(self.head_dtype)

        # [n_tokens, hidden_dimension] -> [n_tokens, embedding_dimension]
        if self.projector is not None:
            embeds = self.projector(embeds)

        # Matryoshka representation: keep only the requested dimensions.
        embeds = embeds[..., : pooling_param.dimensions]

        # Optional normalization, controlled per request.
        if self.activation is not None and pooling_param.use_activation:
            embeds = self.activation(embeds)

        # embeds shape: [n_tokens, embedding_dimension]
        return embeds
class TokenClassifierPoolerHead(TokenPoolerHead):
    """Head that produces per-token classification scores.

    Args:
        classifier: Optional callable mapping hidden states to logits; when
            None the pooled hidden states are used as the scores directly.
        logit_bias: Optional constant subtracted from the scores.
        head_dtype: Optional dtype the input is cast to before classification.
        activation: Optional activation applied when the request's pooling
            params enable it.
    """

    def __init__(
        self,
        classifier: ClassifierFn | None = None,
        logit_bias: float | None = None,
        head_dtype: torch.dtype | str | None = None,
        activation: ActivationFn | None = None,
    ) -> None:
        super().__init__()
        self.classifier = classifier
        self.logit_bias = logit_bias
        self.head_dtype = head_dtype
        self.activation = activation

    def get_supported_tasks(self) -> Set[PoolingTask]:
        return {"token_classify"}

    def forward_chunk(
        self,
        pooled_data: TokenPoolingMethodOutputItem,
        pooling_param: PoolingParams,
    ) -> TokenPoolerHeadOutputItem:
        # for unfinished chunked prefill
        if pooled_data is None:
            return None
        if self.head_dtype is not None:
            pooled_data = pooled_data.to(self.head_dtype)
        # hidden_states shape: [n_token, hidden_size]
        if self.classifier is not None:
            scores = self.classifier(pooled_data)
        else:
            scores = pooled_data
        # scores shape: [n_token, num_labels]
        if self.logit_bias is not None:
            # Out-of-place subtraction: when there is no classifier, `scores`
            # aliases the incoming pooled hidden states, and the original
            # in-place `-=` would silently mutate that shared tensor.
            scores = scores - self.logit_bias
        if self.activation is not None and pooling_param.use_activation:
            scores = self.activation(scores)
        # scores shape: [n_token, num_labels]
        return scores

View File

@@ -0,0 +1,122 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from collections.abc import Set
from typing import TypeAlias
import torch
import torch.nn as nn
from vllm.config import get_current_vllm_config
from vllm.config.pooler import TokenPoolingType
from vllm.model_executor.layers.pooler import PoolingParamsUpdate
from vllm.tasks import PoolingTask
from vllm.v1.pool.metadata import PoolingMetadata
# Per-request output of a pooling method: the pooled hidden states for a
# request, or None while its chunked prefill is still in progress.
TokenPoolingMethodOutputItem: TypeAlias = torch.Tensor | None
class TokenPoolingMethod(nn.Module, ABC):
    """Base class for methods that extract per-token data from hidden states."""

    def get_supported_tasks(self) -> Set[PoolingTask]:
        """Default support: token embedding and token classification."""
        return {"token_embed", "token_classify"}

    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
        # By default no extra per-request data is required.
        return PoolingParamsUpdate()

    @abstractmethod
    def forward(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> list[TokenPoolingMethodOutputItem]:
        """Split/aggregate ``hidden_states`` into one item per request."""
        raise NotImplementedError
class AllPool(TokenPoolingMethod):
    """Return every token's hidden state for each request.

    With chunked prefill enabled, per-chunk hidden states are accumulated in
    each request's pooling state and only emitted (concatenated) once that
    request's prefill has finished; unfinished requests yield None.
    """

    def __init__(self):
        super().__init__()
        scheduler_config = get_current_vllm_config().scheduler_config
        self.enable_chunked_prefill = scheduler_config.enable_chunked_prefill

    def forward(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> list[TokenPoolingMethodOutputItem]:
        cursor = pooling_metadata.get_pooling_cursor()
        chunk_sizes = cursor.num_scheduled_tokens_cpu.tolist()
        per_request = hidden_states.split(chunk_sizes)
        ordered = [per_request[i] for i in cursor.index]

        if not self.enable_chunked_prefill:
            return ordered

        # Chunked prefill: 1. stash each chunk in the request's state cache.
        states = pooling_metadata.pooling_states
        for state, chunk in zip(states, ordered):
            state.hidden_states_cache.append(chunk)

        # 2. Once a request's prefill is finished, emit the (concatenated)
        # cache and reset its state; otherwise emit None.
        outputs: list[TokenPoolingMethodOutputItem] = []
        for state, finished in zip(states, cursor.is_finished()):
            if not finished:
                outputs.append(None)
                continue
            cache = state.hidden_states_cache
            if len(cache) == 1:
                outputs.append(cache[0])
            else:
                outputs.append(torch.concat(cache, dim=0))
            state.clean()
        return outputs
class StepPool(AllPool):
    """Keep only selected "step" tokens from the per-token pooled data.

    On top of :class:`AllPool`, each finished request is narrowed to the
    label columns in ``returned_token_ids`` (when given) and to the positions
    whose prompt token id equals ``step_tag_id`` (when given).
    """

    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
        # Prompt token ids are needed to locate the step-tag positions.
        return PoolingParamsUpdate(requires_token_ids=True)

    def forward(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> list[TokenPoolingMethodOutputItem]:
        all_pooled = super().forward(hidden_states, pooling_metadata)
        prompt_token_ids = pooling_metadata.get_prompt_token_ids()
        pooling_params = pooling_metadata.pooling_params

        results: list[torch.Tensor | None] = []
        for data, token_id, pooling_param in zip(
            all_pooled, prompt_token_ids, pooling_params
        ):
            # None marks an unfinished chunked prefill; pass it through as-is.
            if data is not None:
                returned_token_ids = pooling_param.returned_token_ids
                if returned_token_ids is not None and len(returned_token_ids) > 0:
                    data = data[:, returned_token_ids]

                step_tag_id = pooling_param.step_tag_id
                if step_tag_id is not None:
                    data = data[token_id == step_tag_id]
            results.append(data)
        return results
def get_tok_pooling_method(pooling_type: TokenPoolingType | str):
if pooling_type == "ALL":
return AllPool()
if pooling_type == "STEP":
return StepPool()
raise NotImplementedError(f"Unknown tokenwise pooling type: {pooling_type!r}")

View File

@@ -0,0 +1,135 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Callable, Set
from typing import TypeAlias
import torch
from vllm.config import PoolerConfig, get_current_vllm_config
from vllm.model_executor.layers.pooler import (
ClassifierFn,
PoolingParamsUpdate,
ProjectorFn,
)
from vllm.model_executor.layers.pooler.abstract import Pooler
from vllm.model_executor.layers.pooler.activations import (
PoolerActivation,
PoolerNormalize,
resolve_classifier_act_fn,
)
from vllm.model_executor.models.adapters import _load_st_projector
from vllm.tasks import POOLING_TASKS, PoolingTask
from vllm.v1.pool.metadata import PoolingMetadata
from .heads import (
TokenClassifierPoolerHead,
TokenEmbeddingPoolerHead,
TokenPoolerHead,
TokenPoolerHeadOutputItem,
)
from .methods import (
TokenPoolingMethod,
TokenPoolingMethodOutputItem,
get_tok_pooling_method,
)
# Callable form of a pooling method: maps (hidden_states, metadata) to one
# pooled item per request.
TokenPoolingFn: TypeAlias = Callable[
    [torch.Tensor, PoolingMetadata],
    list[TokenPoolingMethodOutputItem],
]
# Callable form of a pooler head: post-processes the per-request pooled items.
TokenPoolingHeadFn: TypeAlias = Callable[
    [list[TokenPoolingMethodOutputItem], PoolingMetadata],
    list[TokenPoolerHeadOutputItem],
]
# Final pooler output: one tensor per request, or None for requests whose
# chunked prefill has not finished yet.
TokenPoolerOutput: TypeAlias = list[torch.Tensor | None]
class TokenPooler(Pooler):
    """A layer that pools specific information from hidden states.

    Processing happens in two stages:

    1. ``pooling`` extracts tokens or aggregates data per request.
    2. ``head`` post-processes each request's pooled data.

    The structured result is returned as a ``TokenPoolerOutput``.
    """

    def __init__(
        self,
        pooling: TokenPoolingMethod | TokenPoolingFn,
        head: TokenPoolerHead | TokenPoolingHeadFn,
    ) -> None:
        super().__init__()
        self.pooling = pooling
        self.head = head

    def get_supported_tasks(self) -> Set[PoolingTask]:
        # Intersect the tasks of every stage that can report them; plain
        # callables place no restriction.
        supported = set(POOLING_TASKS)
        if isinstance(self.pooling, TokenPoolingMethod):
            supported &= self.pooling.get_supported_tasks()
        if isinstance(self.head, TokenPoolerHead):
            supported &= self.head.get_supported_tasks()
        return supported

    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
        result = PoolingParamsUpdate()
        if isinstance(self.pooling, TokenPoolingMethod):
            result |= self.pooling.get_pooling_updates(task)
        return result

    def forward(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> TokenPoolerOutput:
        pooled = self.pooling(hidden_states, pooling_metadata)
        return self.head(pooled, pooling_metadata)
def pooler_for_token_embed(
    pooler_config: PoolerConfig, projector: ProjectorFn | None = None
) -> TokenPooler:
    """Build a :class:`TokenPooler` for the ``token_embed`` task.

    Args:
        pooler_config: Config providing the token-wise pooling type.
        projector: Optional projector; when None, a SentenceTransformers
            projector is loaded from the model config.
    """
    pooling = get_tok_pooling_method(pooler_config.get_tok_pooling_type())
    model_config = get_current_vllm_config().model_config
    if projector is None:
        projector = _load_st_projector(model_config)
    head = TokenEmbeddingPoolerHead(
        head_dtype=model_config.head_dtype,
        projector=projector,
        activation=PoolerNormalize(),
    )
    return TokenPooler(pooling=pooling, head=head)
def pooler_for_token_classify(
    pooler_config: PoolerConfig,
    *,
    pooling: TokenPoolingMethod | TokenPoolingFn | None = None,
    classifier: ClassifierFn | None = None,
    act_fn: PoolerActivation | str | None = None,
) -> TokenPooler:
    """Build a :class:`TokenPooler` for the ``token_classify`` task.

    Args:
        pooling: Optional pooling method; when None it is derived from the
            pooler config's token-wise pooling type.
        classifier: Optional callable producing logits from hidden states.
        act_fn: Optional activation (or its name) resolved against the
            model config.
    """
    if pooling is None:
        pooling = get_tok_pooling_method(pooler_config.get_tok_pooling_type())
    vllm_config = get_current_vllm_config()
    model_config = vllm_config.model_config
    head = TokenClassifierPoolerHead(
        head_dtype=model_config.head_dtype,
        classifier=classifier,
        logit_bias=model_config.pooler_config.logit_bias,
        activation=resolve_classifier_act_fn(
            model_config, static_num_labels=False, act_fn=act_fn
        ),
    )
    return TokenPooler(pooling=pooling, head=head)