# Adapted from transformers 5.2.0 for compatibility with transformers 4.55.3 + torch 2.1.0 # Stubs layer_type_validation and RopeParameters which do not exist in 4.55.3 from typing import Optional, List from ...configuration_utils import PretrainedConfig as PreTrainedConfig # --- Local stubs for APIs not present in transformers 4.55.3 --- # Always use these definitions; do NOT import from the older transformers # as same-named functions there have incompatible signatures. def layer_type_validation(layer_types, num_hidden_layers=None, attention=True): allowed = {"full_attention", "linear_attention"} if not all(lt in allowed for lt in layer_types): raise ValueError(f"layer_types entries must be in {allowed}, got {layer_types}") if num_hidden_layers is not None and num_hidden_layers != len(layer_types): raise ValueError( f"num_hidden_layers ({num_hidden_layers}) != len(layer_types) ({len(layer_types)})" ) try: from typing import TypedDict class RopeParameters(TypedDict, total=False): rope_theta: float rope_type: str partial_rotary_factor: float factor: float except Exception: RopeParameters = dict # --- End stubs --- class Qwen3_5TextConfig(PreTrainedConfig): r""" Configuration for the text backbone of Qwen3.5 / Qwen3.6-27B models. model_type is "qwen3_5_text" (used internally by the nested config). """ model_type = "qwen3_5_text" keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, vocab_size=248320, hidden_size=4096, intermediate_size=12288, num_hidden_layers=32, num_attention_heads=16, num_key_value_heads=4, hidden_act="silu", max_position_embeddings=32768, initializer_range=0.02, rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, rope_parameters=None, attention_bias=False, attention_dropout=0.0, head_dim=256, linear_conv_kernel_dim=4, linear_key_head_dim=128, linear_value_head_dim=128, linear_num_key_heads=16, linear_num_value_heads=32, layer_types=None, pad_token_id=None, bos_token_id=None, eos_token_id=None, **kwargs, ): self.pad_token_id = pad_token_id self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id self.tie_word_embeddings = tie_word_embeddings self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.head_dim = head_dim self.rope_parameters = rope_parameters kwargs.setdefault("partial_rotary_factor", 0.25) self.layer_types = layer_types if self.layer_types is None: interval_pattern = kwargs.get("full_attention_interval", 4) self.layer_types = [ "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention" for i in range(self.num_hidden_layers) ] layer_type_validation(self.layer_types, self.num_hidden_layers) self.linear_conv_kernel_dim = linear_conv_kernel_dim self.linear_key_head_dim = linear_key_head_dim self.linear_value_head_dim = linear_value_head_dim self.linear_num_key_heads = linear_num_key_heads self.linear_num_value_heads = linear_num_value_heads super().__init__(**kwargs) class Qwen3_5VisionConfig(PreTrainedConfig): model_type = "qwen3_5_vision" def __init__( self, depth=27, hidden_size=1152, hidden_act="gelu_pytorch_tanh", intermediate_size=4304, num_heads=16, in_channels=3, patch_size=16, spatial_merge_size=2, temporal_patch_size=2, out_hidden_size=3584, num_position_embeddings=2304, initializer_range=0.02, **kwargs, ): super().__init__(**kwargs) self.depth = depth self.hidden_size = hidden_size self.hidden_act = hidden_act self.intermediate_size = intermediate_size self.num_heads = num_heads self.in_channels = in_channels self.patch_size = patch_size self.spatial_merge_size = spatial_merge_size self.temporal_patch_size = temporal_patch_size self.out_hidden_size = out_hidden_size self.num_position_embeddings = num_position_embeddings self.initializer_range = initializer_range class Qwen3_5Config(PreTrainedConfig): r""" Top-level configuration for Qwen3.5 / Qwen3.6-27B. model_type = "qwen3_5" matches the model card / config.json. Wraps Qwen3_5TextConfig (and optionally Qwen3_5VisionConfig for multimodal use). For vLLM text-only inference only text_config is consumed. """ model_type = "qwen3_5" keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, text_config=None, vision_config=None, image_token_id=248056, video_token_id=248057, vision_start_token_id=248053, vision_end_token_id=248054, tie_word_embeddings=False, **kwargs, ): if isinstance(text_config, dict): self.text_config = Qwen3_5TextConfig(**text_config) elif text_config is None: self.text_config = Qwen3_5TextConfig() else: self.text_config = text_config if isinstance(vision_config, dict): self.vision_config = Qwen3_5VisionConfig(**vision_config) elif vision_config is None: self.vision_config = Qwen3_5VisionConfig() else: self.vision_config = vision_config self.image_token_id = image_token_id self.video_token_id = video_token_id self.vision_start_token_id = vision_start_token_id self.vision_end_token_id = vision_end_token_id self.tie_word_embeddings = tie_word_embeddings super().__init__(**kwargs) __all__ = ["Qwen3_5Config", "Qwen3_5TextConfig", "Qwen3_5VisionConfig"]