# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations

from transformers import CONFIG_MAPPING, PretrainedConfig

# NOTE: Temporary shim for FunAudioChat checkpoints.
# These checkpoints use `model_type="funaudiochat"`, which is not currently
# recognized by released Transformers, and the public checkpoint does not
# provide an `auto_map` to enable `trust_remote_code=True`.
# Remove this file once Transformers adds native support (or the checkpoint
# provides an `auto_map`) and vLLM can rely on `AutoConfig.from_pretrained()`.


class FunAudioChatAudioEncoderConfig(PretrainedConfig):
    model_type = "funaudiochat_audio_encoder"

    def __init__(
        self,
        _attn_implementation: str | None = None,
        num_mel_bins: int = 128,
        encoder_layers: int = 32,
        encoder_attention_heads: int = 20,
        encoder_ffn_dim: int = 5120,
        d_model: int = 1280,
        dropout: float = 0.0,
        attention_dropout: float = 0.0,
        activation_function: str = "gelu",
        activation_dropout: float = 0.0,
        scale_embedding: bool = False,
        initializer_range: float = 0.02,
        max_source_positions: int = 1500,
        n_window: int = 100,
        output_dim: int = 3584,
        bos_token_id: int | None = None,
        codebook_size: int | None = None,
        continuous_features_mode: str = "replace",
        crq_transformer_config: dict | None = None,
        eos_token_id: int | None = None,
        group_size: int = 5,
        enable_audio_invert_tower: bool = True,
        pad_token_id: int | None = None,
        **kwargs,
    ) -> None:
        # Accept the attention implementation either as a named argument or
        # via kwargs (as HF serializes it), preferring the kwargs value.
        attn_impl = kwargs.pop("_attn_implementation", None) or _attn_implementation
        super().__init__(**kwargs)
        # Match HF default for attention implementation selection.
        self._attn_implementation = attn_impl or "sdpa"
        self.num_mel_bins = num_mel_bins
        self.d_model = d_model
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_function = activation_function
        self.activation_dropout = activation_dropout
        self.num_hidden_layers = encoder_layers
        self.initializer_range = initializer_range
        self.scale_embedding = scale_embedding
        self.max_source_positions = max_source_positions
        self.n_window = n_window
        self.output_dim = output_dim
        self.bos_token_id = bos_token_id
        self.codebook_size = codebook_size
        self.continuous_features_mode = continuous_features_mode
        self.crq_transformer_config = crq_transformer_config
        self.eos_token_id = eos_token_id
        self.group_size = group_size
        self.enable_audio_invert_tower = enable_audio_invert_tower
        self.pad_token_id = pad_token_id


class FunAudioChatConfig(PretrainedConfig):
    model_type = "funaudiochat"
    attribute_map = {
        "audio_token_id": "audio_token_index",
    }

    def __init__(
        self,
        audio_config: PretrainedConfig | dict | None = None,
        text_config: PretrainedConfig | dict | None = None,
        audio_token_index: int = 151646,
        ignore_index: int = -100,
        hidden_size: int | None = None,
        **kwargs,
    ) -> None:
        self.audio_token_index = audio_token_index
        self.ignore_index = ignore_index

        if isinstance(audio_config, dict):
            audio_config.setdefault(
                "model_type", FunAudioChatAudioEncoderConfig.model_type
            )
            audio_config = FunAudioChatAudioEncoderConfig(**audio_config)
        elif audio_config is None:
            audio_config = FunAudioChatAudioEncoderConfig()
        self.audio_config = audio_config

        if isinstance(text_config, dict):
            # Default to qwen2 for backwards compatibility; recent
            # FunAudioChat checkpoints use qwen3 in practice.
            text_config.setdefault("model_type", "qwen2")
            text_cls = CONFIG_MAPPING[text_config["model_type"]]
            text_config = text_cls(**text_config)
        elif text_config is None:
            text_config = CONFIG_MAPPING["qwen2"]()
        self.text_config = text_config

        self.hidden_size = (
            int(self.text_config.hidden_size)
            if hidden_size is None
            else int(hidden_size)
        )
        super().__init__(**kwargs)
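

# Illustrative wiring (a minimal sketch, not part of the shim itself): a
# caller would register these classes with `AutoConfig` so that checkpoints
# carrying `model_type="funaudiochat"` resolve to this shim without
# `trust_remote_code=True`. `AutoConfig.register` is the standard Transformers
# hook for custom model types; exactly how vLLM invokes it is an assumption
# here, as is the dict round-trip check below.
if __name__ == "__main__":
    from transformers import AutoConfig

    AutoConfig.register(
        FunAudioChatAudioEncoderConfig.model_type, FunAudioChatAudioEncoderConfig
    )
    AutoConfig.register(FunAudioChatConfig.model_type, FunAudioChatConfig)

    # Sanity-check that a config survives dict round-tripping, as it would
    # when re-read from a serialized `config.json`.
    cfg = FunAudioChatConfig(audio_config={"output_dim": 3584})
    restored = FunAudioChatConfig.from_dict(cfg.to_dict())
    assert restored.audio_config.output_dim == 3584
    assert restored.text_config.model_type == "qwen2"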