Upgrade to vllm 0.17.0 corex v4.1 overlay

This commit is contained in:
2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

View File

@@ -0,0 +1,215 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers import PretrainedConfig
class AXK1Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`AXK1Model`].
It is used to instantiate an A.X model according to the specified arguments,
defining the model architecture. Instantiating a configuration with the defaults
will yield a similar configuration to that of the A.X K1.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control
the model outputs. Read the documentation from [`PretrainedConfig`] for more
information.
Args:
vocab_size (`int`, *optional*, defaults to 163840):
Vocabulary size of the A.X K1 model. Defines the number of different
tokens that can be represented by the `input_ids` passed when calling
[`AXK1Model`]
hidden_size (`int`, *optional*, defaults to 7168):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 18432):
Dimension of the MLP representations.
moe_intermediate_size (`int`, *optional*, defaults to 2048):
Dimension of the MoE representations.
num_hidden_layers (`int`, *optional*, defaults to 61):
Number of hidden layers in the Transformer decoder.
num_nextn_predict_layers (`int`, *optional*, defaults to 1):
Number of nextn predict layers in the AXK1 Model.
num_attention_heads (`int`, *optional*, defaults to 64):
Number of attention heads for each attention layer in the Transformer
decoder.
n_shared_experts (`int`, *optional*, defaults to 1):
Number of shared experts, None means dense model.
n_routed_experts (`int`, *optional*, defaults to 192):
Number of routed experts, None means dense model.
routed_scaling_factor (`float`, *optional*, defaults to 2.5):
Scaling factor for routed experts.
topk_method (`str`, *optional*, defaults to `noaux_tc`):
Topk method used in routed gate.
n_group (`int`, *optional*, defaults to 8):
Number of groups for routed experts.
topk_group (`int`, *optional*, defaults to 4):
Number of selected groups for each token (ensuring that the selected
experts for each token come only from `topk_group` groups).
num_experts_per_tok (`int`, *optional*, defaults to 8):
Number of selected experts, None means dense model.
moe_layer_freq (`int`, *optional*, defaults to 1):
The frequency of the MoE layer: one expert layer for every
`moe_layer_freq - 1` dense layers.
first_k_dense_replace (`int`, *optional*, defaults to 1):
Number of dense layers in shallow layers
(embed->dense->dense->...->dense->moe->moe...->lm_head).
\--k dense layers--/
norm_topk_prob (`bool`, *optional*, defaults to True):
Whether to normalize the weights of the routed experts.
scoring_func (`str`, *optional*, defaults to 'sigmoid'):
Method of computing expert weights.
aux_loss_alpha (`float`, *optional*, defaults to 0.0001):
Auxiliary loss weight coefficient.
seq_aux (`bool`, *optional*, defaults to True):
Whether to compute the auxiliary loss for each individual sample.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement
Grouped Query Attention. If `num_key_value_heads=num_attention_heads`,
the model will use Multi Head Attention (MHA), if `num_key_value_heads=1`
the model will use Multi Query Attention (MQA), otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and
value head should be constructed by meanpooling all the original heads
within that group. For more details, check out
[this paper](https://arxiv.org/pdf/2305.13245.pdf).
If it is not specified, will default to `num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 131072):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions
(not used by all models). Only relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 163691):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 163691):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining.
Please refer to
[this document](https://huggingface.co/docs/transformers/parallelism)
to understand more about it. This value is necessary to ensure exact
reproducibility of the pretraining results. Please refer to
[this issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings.
Currently supports two scaling strategies: linear and dynamic.
Their scaling factor must be a float greater than 1. The expected format
is `{"type": strategy name, "factor": scaling factor}`. When using this
flag, don't update `max_position_embeddings` to the expected new maximum.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection
layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
"""
model_type = "AXK1"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size: int = 163840,
hidden_size: int = 7168,
intermediate_size: int = 18432,
moe_intermediate_size: int = 2048,
num_hidden_layers: int = 61,
num_nextn_predict_layers: int | None = 1,
num_attention_heads: int = 64,
num_key_value_heads: int = 64,
n_shared_experts: int | None = 1,
n_routed_experts: int | None = 192,
ep_size: int | None = 8,  # Ignored: expert parallel size
routed_scaling_factor: float | None = 2.5,
kv_lora_rank: int | None = 512,
q_lora_rank: int | None = 1536,
qk_rope_head_dim: int | None = 64,
v_head_dim: int | None = 128,
qk_nope_head_dim: int | None = 128,
topk_method: str | None = "noaux_tc",
n_group: int | None = 8,
topk_group: int | None = 4,
num_experts_per_tok: int | None = 8,
moe_layer_freq: int | None = 1,
first_k_dense_replace: int = 1,
norm_topk_prob: bool = True,
scoring_func: str | None = "sigmoid",
aux_loss_alpha: float | None = 0.0001,
seq_aux: bool | None = True,
hidden_act: str | None = "silu",
max_position_embeddings: int | None = 131072,
initializer_range: float | None = 0.02,
rms_norm_eps: float = 1e-6,
use_cache: bool | None = True,
pad_token_id: int | None = None,
bos_token_id: int | None = 163691,
eos_token_id: int | None = 163691,
pretraining_tp: int | None = 1,
tie_word_embeddings: bool | None = False,
rope_theta: float | None = 10000.0,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any] | None = None,
attention_bias: bool | None = False,
attention_dropout: float | None = 0.0,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.moe_intermediate_size = moe_intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_nextn_predict_layers = num_nextn_predict_layers
self.num_attention_heads = num_attention_heads
self.n_shared_experts = n_shared_experts
self.n_routed_experts = n_routed_experts
self.ep_size = ep_size
self.routed_scaling_factor = routed_scaling_factor
self.kv_lora_rank = kv_lora_rank
self.q_lora_rank = q_lora_rank
self.qk_rope_head_dim = qk_rope_head_dim
self.v_head_dim = v_head_dim
self.qk_nope_head_dim = qk_nope_head_dim
self.topk_method = topk_method
self.n_group = n_group
self.topk_group = topk_group
self.num_experts_per_tok = num_experts_per_tok
self.moe_layer_freq = moe_layer_freq
self.first_k_dense_replace = first_k_dense_replace
self.norm_topk_prob = norm_topk_prob
self.scoring_func = scoring_func
self.aux_loss_alpha = aux_loss_alpha
self.seq_aux = seq_aux
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.rope_parameters = rope_parameters
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
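A minimal usage sketch (assuming `AXK1Config` is importable from the `vllm.transformers_utils.configs.AXK1` module registered below; the overrides are illustrative, not values from a real checkpoint):

from vllm.transformers_utils.configs.AXK1 import AXK1Config

# Defaults approximate the A.X K1 architecture; override only what differs.
config = AXK1Config(
    num_hidden_layers=4,  # small value purely for a quick smoke test
    rope_scaling={"type": "linear", "factor": 2.0},
)
assert config.model_type == "AXK1"
assert config.kv_lora_rank == 512  # MLA-style head dims are kept by default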

View File

@@ -16,6 +16,7 @@ import importlib
_CLASS_TO_MODULE: dict[str, str] = {
"AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
"AXK1Config": "vllm.transformers_utils.configs.AXK1",
"BagelConfig": "vllm.transformers_utils.configs.bagel",
"ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
"ColModernVBertConfig": "vllm.transformers_utils.configs.colmodernvbert",
@@ -70,6 +71,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
__all__ = [
"AfmoeConfig",
"AXK1Config",
"BagelConfig",
"ChatGLMConfig",
"ColModernVBertConfig",

View File

@@ -0,0 +1,53 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Config definitions for ExtractHiddenStatesModel, to be used with
the extract_hidden_states spec decoding method."""
import os
from transformers import PretrainedConfig
class ExtractHiddenStatesConfig(PretrainedConfig):
model_type = "extract_hidden_states"
def __init__(
self,
model: PretrainedConfig | dict | None = None,
method: str | None = "extract_hidden_states",
**kwargs,
):
assert method == "extract_hidden_states"
if isinstance(model, dict):
model_dict = model
elif isinstance(model, PretrainedConfig):
model_dict = model.to_dict()
else:
model_dict = {}
# Combine: model_dict first, then kwargs override
combined = {**model_dict, **kwargs}
# Remove architectures from the base, we'll set it explicitly
combined = {k: v for k, v in combined.items() if k != "architectures"}
combined["architectures"] = ["ExtractHiddenStatesModel"]
super().__init__(**combined)
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: str | os.PathLike,
**kwargs,
) -> "ExtractHiddenStatesConfig":
config_dict, kwargs = cls.get_config_dict(
pretrained_model_name_or_path, **kwargs
)
return cls.from_dict(config_dict, **kwargs)
def to_json_string(self, use_diff: bool = True) -> str:
# we override use_diff to False as initializing
# ExtractHiddenStatesConfig with default arguments is not supported
del use_diff
return super().to_json_string(use_diff=False)
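A hedged sketch of how this wrapper composes a base model config (the field values are placeholders, not taken from a real checkpoint):

from transformers import PretrainedConfig

# Any PretrainedConfig (or plain dict) can serve as the wrapped model config.
base = PretrainedConfig(hidden_size=1024, num_hidden_layers=8)

cfg = ExtractHiddenStatesConfig(model=base, method="extract_hidden_states")
# Fields from the base carry over, extra kwargs would override them, and
# `architectures` is always forced to ["ExtractHiddenStatesModel"].
assert cfg.hidden_size == 1024
assert cfg.architectures == ["ExtractHiddenStatesModel"]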

View File

@@ -0,0 +1,49 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from transformers import ParakeetEncoderConfig, PretrainedConfig
class ParakeetConfig(ParakeetEncoderConfig):
llm_hidden_size: int
projection_hidden_size: int
projection_bias: bool
projection_eps: float = 1e-5
sampling_rate: int
@staticmethod
def from_hf_config(
config: PretrainedConfig, *, llm_hidden_size: int, max_model_len: int
) -> "ParakeetConfig":
assert isinstance(config, PretrainedConfig)
return ParakeetConfig(
**config.to_dict(),
scale_input=False,
attention_bias=False,
llm_hidden_size=llm_hidden_size,
max_position_embeddings=max_model_len
+ 1, # + 1 because it seems like max_model_len+1 can be passed
)
@dataclass(kw_only=True, frozen=True)
class ExtractorConfig:
feature_size: int
sampling_rate: int
subsampling_factor: int
subsampling_conv_kernel_size: int
subsampling_conv_stride: int
clip_duration_s: int = 30
clip_min_duration_s: float = 0.1
@staticmethod
def from_hf_config(config: PretrainedConfig) -> "ExtractorConfig":
assert isinstance(config, PretrainedConfig)
return ExtractorConfig(
feature_size=config.num_mel_bins,
sampling_rate=config.sampling_rate,
subsampling_factor=config.subsampling_factor,
subsampling_conv_kernel_size=config.subsampling_conv_kernel_size,
subsampling_conv_stride=config.subsampling_conv_stride,
)
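A small illustrative sketch of `ExtractorConfig.from_hf_config` (the encoder field values below are made up for the example, not taken from a real Parakeet checkpoint):

from transformers import PretrainedConfig

# Stand-in for a Parakeet encoder config; a real one supplies these fields itself.
enc = PretrainedConfig(
    num_mel_bins=128,
    sampling_rate=16000,
    subsampling_factor=8,
    subsampling_conv_kernel_size=3,
    subsampling_conv_stride=2,
)
extractor_cfg = ExtractorConfig.from_hf_config(enc)
assert extractor_cfg.feature_size == 128
assert extractor_cfg.clip_duration_s == 30  # default maximum clip length in seconds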

View File

@@ -233,6 +233,7 @@ class ModelArchConfigConvertorBase:
if not hasattr(self.hf_text_config, "model_type"):
return False
elif self.hf_text_config.model_type in (
"AXK1",
"deepseek_v2",
"deepseek_v3",
"deepseek_v32",
@@ -245,6 +246,7 @@ class ModelArchConfigConvertorBase:
"longcat_flash",
"pangu_ultra_moe",
"pangu_ultra_moe_mtp",
"bailing_hybrid",
):
return self.hf_text_config.kv_lora_rank is not None
elif self.hf_text_config.model_type == "eagle":
@@ -252,7 +254,13 @@ class ModelArchConfigConvertorBase:
# underlying architecture
return (
self.hf_text_config.model.model_type
in ("deepseek_v2", "deepseek_v3", "deepseek_v32", "deepseek_mtp")
in (
"AXK1",
"deepseek_v2",
"deepseek_v3",
"deepseek_v32",
"deepseek_mtp",
)
and self.hf_text_config.kv_lora_rank is not None
)
return False

View File

@@ -111,29 +111,6 @@ def _get_processor_factory_fn(processor_cls: type | tuple[type, ...]):
return processor_cls
@lru_cache
def _collect_dynamic_keys_from_processing_kwargs(kwargs_cls: type) -> set[str]:
dynamic_kwargs: set[str] = set()
if kwargs_cls is None:
return dynamic_kwargs
# get kwargs annotations in processor
# merge text_kwargs / images_kwargs / videos_kwargs / audio_kwargs
kwargs_type_annotations = get_type_hints(kwargs_cls)
for kw_type in ("text_kwargs", "images_kwargs", "videos_kwargs", "audio_kwargs"):
if kw_type in kwargs_type_annotations:
# Use __annotations__ instead of get_type_hints() to avoid
# NameError from unresolved forward references (e.g.
# PILImageResampling). We only need key names, not types.
kw_cls = kwargs_type_annotations[kw_type]
kw_annotations: dict[str, Any] = {}
for base in reversed(kw_cls.__mro__):
kw_annotations.update(getattr(base, "__annotations__", {}))
for kw_name in kw_annotations:
dynamic_kwargs.add(kw_name)
dynamic_kwargs |= {"text_kwargs", "images_kwargs", "videos_kwargs", "audio_kwargs"}
return dynamic_kwargs
def _merge_mm_kwargs(
model_config: "ModelConfig",
processor_cls: type | tuple[type, ...],
@@ -224,38 +201,63 @@ cached_get_processor = lru_cache(get_processor)
@lru_cache
def get_processor_kwargs_from_processor(processor: _P) -> set[str]:
def get_processor_kwargs_type(
processor: ProcessorMixin,
) -> type[processing_utils.ProcessingKwargs]:
try:
# get kwargs annotations in processor
call_kwargs = inspect.signature(type(processor).__call__).parameters.get(
"kwargs"
)
call_params = inspect.signature(type(processor).__call__).parameters
call_kwargs = call_params.get("kwargs")
call_kwargs_annotations = call_kwargs.annotation if call_kwargs else None
# if the processor has explicit kwargs annotation, use it
if call_kwargs_annotations not in (None, inspect._empty):
# get_type_hints will parse all type annotations at runtime,
# and if an annotation refers to a type or
name that hasn't been imported or defined, it will raise an error.
# So we use __annotations__ to get the raw annotations directly.
return _collect_dynamic_keys_from_processing_kwargs(
get_args(call_kwargs_annotations)[0]
)
# otherwise, try to get from ProcessingKwargs
else:
module_name = type(processor).__module__
mod = importlib.import_module(module_name)
# find *ProcessingKwargs in the module
processor_kwargs: set[str] = set()
for name, obj in vars(mod).items():
if name.endswith("ProcessingKwargs"):
processor_kwargs = (
processor_kwargs
| _collect_dynamic_keys_from_processing_kwargs(obj)
)
return processor_kwargs
return get_args(call_kwargs_annotations)[0]
# otherwise, try to get from ProcessorKwargs
module_name = type(processor).__module__
mod = importlib.import_module(module_name)
for name, obj in vars(mod).items():
if name.endswith("ProcessorKwargs"):
return obj
except Exception:
logger.exception("Failed to collect processor kwargs")
return set()
return processing_utils.ProcessingKwargs
@lru_cache
def get_processor_kwargs_keys(
kwargs_cls: type[processing_utils.ProcessingKwargs],
) -> set[str]:
dynamic_kwargs: set[str] = set()
modality_kwargs = {"text_kwargs", "images_kwargs", "videos_kwargs", "audio_kwargs"}
try:
# get kwargs annotations in processor
# merge text_kwargs / images_kwargs / videos_kwargs / audio_kwargs
kwargs_type_annotations = get_type_hints(kwargs_cls)
for kw_type in modality_kwargs:
if kw_type in kwargs_type_annotations:
# Use __annotations__ instead of get_type_hints() to avoid
# NameError from unresolved forward references (e.g.
# PILImageResampling). We only need key names, not types.
kw_cls = kwargs_type_annotations[kw_type]
kw_annotations: dict[str, Any] = {}
for base in reversed(kw_cls.__mro__):
kw_annotations.update(getattr(base, "__annotations__", {}))
for kw_name in kw_annotations:
dynamic_kwargs.add(kw_name)
except Exception:
logger.exception("Failed to collect processor kwargs")
return dynamic_kwargs | modality_kwargs
def cached_get_processor_without_dynamic_kwargs(
@@ -275,7 +277,9 @@ def cached_get_processor_without_dynamic_kwargs(
)
# Step 2: use the temporary processor to collect dynamic keys
dynamic_keys = get_processor_kwargs_from_processor(processor)
dynamic_keys = get_processor_kwargs_keys(
get_processor_kwargs_type(processor) # type: ignore[arg-type]
)
# Step 3: use dynamic_keys to filter kwargs
filtered_kwargs = {k: v for k, v in kwargs.items() if k not in dynamic_keys}
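For orientation, a rough sketch of what `get_processor_kwargs_type` / `get_processor_kwargs_keys` produce for a typical HF-style kwargs class (the TypedDicts below are hypothetical stand-ins, not classes from transformers):

from typing import TypedDict

class _ImagesKwargs(TypedDict, total=False):
    do_resize: bool
    size: dict

class _MyProcessingKwargs(TypedDict, total=False):
    text_kwargs: dict
    images_kwargs: _ImagesKwargs

keys = get_processor_kwargs_keys(_MyProcessingKwargs)
# keys contains the per-modality options plus the modality buckets themselves:
# {"do_resize", "size", "text_kwargs", "images_kwargs",
#  "videos_kwargs", "audio_kwargs"}
# Step 3 above then drops exactly these keys from the processor construction
# kwargs so that they are only applied per call.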

View File

@@ -10,6 +10,9 @@ reasons:
from vllm.transformers_utils.processors.bagel import BagelProcessor
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
from vllm.transformers_utils.processors.fireredasr2_processor import (
FireRedASR2Processor,
)
from vllm.transformers_utils.processors.funasr_processor import FunASRProcessor
from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
from vllm.transformers_utils.processors.hunyuan_vl_image import HunYuanVLImageProcessor
@@ -19,6 +22,7 @@ from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
__all__ = [
"BagelProcessor",
"DeepseekVLV2Processor",
"FireRedASR2Processor",
"FunASRProcessor",
"HunYuanVLProcessor",
"HunYuanVLImageProcessor",

View File

@@ -0,0 +1,341 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import kaldi_native_fbank as knf
import numpy as np
import torch
import torch.nn.functional as F
from transformers import (
AutoFeatureExtractor,
AutoProcessor,
BatchFeature,
)
from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
from transformers.processing_utils import ProcessorMixin
from transformers.utils import TensorType
from vllm.logger import init_logger
logger = init_logger(__name__)
class CMVN:
def __init__(self, dim, means, inverse_std_variences):
self.dim, self.means, self.inverse_std_variences = (
dim,
np.array(means),
np.array(inverse_std_variences),
)
def __call__(self, x):
assert x.shape[-1] == self.dim, "CMVN dim mismatch"
out = x - self.means
out = out * self.inverse_std_variences
return out
class KaldifeatFbank:
def __init__(self, num_mel_bins=80, frame_length=25, frame_shift=10, dither=1.0):
self.dither = dither
opts = knf.FbankOptions()
opts.frame_opts.dither = dither
opts.mel_opts.num_bins = num_mel_bins
opts.frame_opts.snip_edges = True
opts.mel_opts.debug_mel = False
self.opts = opts
def __call__(self, sample_rate, wav_np, is_train=False):
dither = self.dither if is_train else 0.0
self.opts.frame_opts.dither = dither
fbank = knf.OnlineFbank(self.opts)
fbank.accept_waveform(sample_rate, wav_np.tolist())
feat = []
for i in range(fbank.num_frames_ready):
feat.append(fbank.get_frame(i))
if len(feat) == 0:
print("Check data, len(feat) == 0", wav_np, flush=True)
return np.zeros((0, self.opts.mel_opts.num_bins))
feat = np.vstack(feat)
return feat
class FireRedASR2FeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a FireRedASR2 feature extractor.
This feature extractor inherits from
[`~feature_extraction_sequence_utils.SequenceFeatureExtractor`], which contains
most of the main methods. Users should refer to this superclass for more
information regarding those methods.
This class extracts Kaldi-style mel filter-bank (fbank) features from raw
speech using `kaldi_native_fbank`, followed by CMVN normalization.
Args:
feature_size (`int`, *optional*, defaults to 80):
The feature dimension of the extracted features.
sampling_rate (`int`, *optional*, defaults to 16000):
The sampling rate at which the audio files should be digitalized
expressed in hertz (Hz).
chunk_length (`int`, *optional*, defaults to 30):
The maximum number of chunks of `sampling_rate` samples used to
trim and pad longer or shorter audio sequences.
padding_value (`float`, *optional*, defaults to 0.0):
Padding value used to pad the audio. Should correspond to silences.
dither (`float`, *optional*, defaults to 0.0):
Adds dithering. In other words, adds a small Gaussian noise to each frame.
E.g. use 0.0001 to add dithering with a normal distribution centered
around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range
of raw_speech). The value 0.0 means no dithering.
Dithering has a similar effect to `spectrogram(mel_floor=...)`. It reduces
the high log_mel_fbank values for signals with hard-zero sections,
when a VAD cutoff is present in the signal.
"""
model_input_names = ["input_features"]
def __init__(
self,
feature_size=80,
sampling_rate=16000,
chunk_length=30,
padding_value=0.0,
return_attention_mask=False,
dim=80,
means=None,
inverse_std_variences=None,
num_mel_bins=80,
frame_length=25,
frame_shift=10,
dither=0.0,
max_length=3000,
downsample_rate=2,
left_context=3,
right_context=3,
**kwargs,
):
super().__init__(
feature_size=feature_size,
sampling_rate=sampling_rate,
padding_value=padding_value,
return_attention_mask=return_attention_mask,
**kwargs,
)
self.chunk_length = chunk_length
self.max_length = max_length
self.dim = dim
self.means = means
self.inverse_std_variences = inverse_std_variences
self.num_mel_bins = num_mel_bins
self.frame_length = frame_length
self.frame_shift = frame_shift
self.dither = dither
self.sampling_rate = sampling_rate
self.downsample_rate = downsample_rate
self.context = left_context + 1 + right_context
def __call__(
self,
raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
truncation: bool = True,
pad_to_multiple_of: int | None = None,
return_tensors: str | TensorType | None = None,
return_attention_mask: bool | None = None,
padding: str | None = "max_length",
max_length: int | None = None,
sampling_rate: int | None = None,
do_normalize: bool | None = None,
**kwargs,
) -> BatchFeature:
if sampling_rate != self.sampling_rate:
raise ValueError(
f"The model corresponding to this feature extractor: "
f"{self.__class__.__name__} was trained using a sampling "
f"rate of {self.sampling_rate}. Please make sure that the "
f"provided `raw_speech` input was sampled with "
f"{self.sampling_rate} and not {sampling_rate}."
)
def padding_position_is_0(padded_input, input_lengths):
N, T = padded_input.size()[:2]
mask = torch.ones((N, T)).to(padded_input.device)
for i in range(N):
mask[i, input_lengths[i] :] = 0
mask = mask.unsqueeze(dim=1)
return mask.to(torch.uint8)
# initialize the CMVN and Fbank objects
self.cmvn = CMVN(self.dim, self.means, self.inverse_std_variences)
self.fbank = KaldifeatFbank(
num_mel_bins=self.num_mel_bins,
frame_length=self.frame_length,
frame_shift=self.frame_shift,
dither=self.dither,
)
feats = []
speech_lengths = []
fake_token_lengths = []
for speech in raw_speech:
"""
We must multiply by 32768 here because FireRedASR2 loads audio data
using kaldiio.load_mat, while vLLM loads audio data using librosa.
"""
speech = speech * 32768
fbank = self.fbank(sampling_rate, speech)
fbank = self.cmvn(fbank)
fbank = torch.from_numpy(fbank).float()
length = fbank.size(0)
feats.append(fbank)
speech_lengths.append(length)
padded_input2 = fbank
padded_input2 = F.pad(
padded_input2, (0, 0, 0, self.context - 1), "constant", 0.0
)
src_mask = padding_position_is_0(
padded_input2[None, :, :], torch.tensor([length], dtype=torch.int32)
)
x_mask = src_mask
mask = x_mask[:, :, :-2:2][:, :, :-2:2]
input_lengths = mask[:, -1, :].sum(dim=-1)
input_lengths = input_lengths // self.downsample_rate
fake_token_len = torch.clamp(input_lengths, min=1)
fake_token_lengths.append(fake_token_len)
feats = torch.stack(feats, dim=0)
batched_speech = self.pad(
BatchFeature({"input_features": feats}),
padding=padding,
max_length=max_length if max_length else self.max_length,
truncation=truncation,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask or do_normalize,
)
if return_tensors is not None:
batched_speech = batched_speech.convert_to_tensors(return_tensors)
batched_speech["speech_lengths"] = torch.tensor(speech_lengths)
batched_speech["fake_token_lengths"] = torch.concat(fake_token_lengths)
return batched_speech
class FireRedASR2Processor(ProcessorMixin):
r"""
Constructs a FireRedASR2 processor which wraps a FireRedASR2 feature extractor and
a FireRedASR2 tokenizer into a single processor.
[`FireRedASR2Processor`] offers all the functionalities of
[`FireRedASR2FeatureExtractor`] and [`Qwen2Tokenizer`]. See the
[`~FireRedASR2Processor.__call__`] and [`~FireRedASR2Processor.decode`] for more
information.
Args:
feature_extractor (`FireRedASR2FeatureExtractor`): An instance of
[`FireRedASR2FeatureExtractor`].
The feature extractor is a required input.
tokenizer (`Qwen2Tokenizer`):
An instance of [`Qwen2Tokenizer`]. The tokenizer is a required
input.
"""
feature_extractor_class = "FireRedASR2FeatureExtractor"
tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
def __init__(
self,
feature_extractor,
tokenizer,
audio_token="<|AUDIO|>",
):
super().__init__(feature_extractor, tokenizer)
self.current_processor = self.feature_extractor
self._in_target_context_manager = False
self.audio_token = (
tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token
)
self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
return self.tokenizer.get_decoder_prompt_ids(
task=task, language=language, no_timestamps=no_timestamps
)
def __call__(self, *args, **kwargs):
"""
Forwards the `audio` argument to FireRedASR2FeatureExtractor's
[`~FireRedASR2FeatureExtractor.__call__`] and the `text` argument to
[`~Qwen2Tokenizer.__call__`]. Please refer to the docstring of the
above two methods for more information.
"""
if self._in_target_context_manager:
return self.current_processor(*args, **kwargs)
audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if len(args) > 0:
audio = args[0]
args = args[1:]
if text is None:
raise ValueError("You need to specify `text` input to process.")
elif isinstance(text, str):
text = [text]
elif not isinstance(text, list) or not all(isinstance(t, str) for t in text):
raise ValueError(
"Invalid input text. Please provide a string, or a list of strings"
)
if audio is not None:
# ensure we have as many audios as audio tokens
num_audio_tokens = sum(sample.count(self.audio_token) for sample in text)
num_audios = 1 if type(audio) is np.ndarray else len(audio)
if num_audio_tokens != num_audios:
raise ValueError(
f"Found {num_audio_tokens} {self.audio_token} token{'s' if num_audio_tokens > 1 else ''} in provided text but received {num_audios} audio{'s' if num_audios > 1 else ''}" # noqa: E501
)
inputs = self.feature_extractor(
audio, *args, sampling_rate=sampling_rate, **kwargs
)
expanded_text = []
audio_index = 0  # one fake_token_lengths entry per audio in the batch
for sample in text:
replace_str = []
while self.audio_token in sample:
num_audio_tokens = int(inputs["fake_token_lengths"][audio_index].item())
audio_index += 1
expanded_audio_token = self.audio_token * num_audio_tokens
replace_str.append(expanded_audio_token)
sample = sample.replace(self.audio_token, "<placeholder>", 1)
while "<placeholder>" in sample:
sample = sample.replace("<placeholder>", replace_str.pop(0), 1)
expanded_text.append(sample)
text = expanded_text
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif audio is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def get_prompt_ids(self, text: str, return_tensors="np"):
return self.tokenizer.get_prompt_ids(text, return_tensors=return_tensors)
AutoFeatureExtractor.register(
"FireRedASR2FeatureExtractor", FireRedASR2FeatureExtractor
)
AutoProcessor.register("FireRedASR2Processor", FireRedASR2Processor)
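For orientation, a rough sketch of the frame-to-token length arithmetic that the feature extractor's mask slicing performs (an approximation under the default 10 ms frame shift and downsample_rate=2, not a replacement for the mask-based computation above):

import math

def approx_fake_token_len(num_frames: int, downsample_rate: int = 2) -> int:
    # Two stride-2 subsamplings followed by an integer division, mirroring
    # mask[:, :, :-2:2][:, :, :-2:2] and the // downsample_rate step.
    after_subsampling = math.ceil(math.ceil(num_frames / 2) / 2)
    return max(after_subsampling // downsample_rate, 1)

# ~10 s of 16 kHz audio -> roughly 1000 fbank frames (10 ms shift)
# -> roughly 125 <|AUDIO|> placeholder tokens in the expanded prompt.
print(approx_fake_token_len(1000))  # 125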

View File

@@ -285,7 +285,7 @@ def get_hf_file_to_dict(
EntryNotFoundError,
LocalEntryNotFoundError,
) as e:
logger.debug("File or repository not found in hf_hub_download", e)
logger.debug("File or repository not found in hf_hub_download:", exc_info=e)
return None
except HfHubHTTPError as e:
logger.warning(