Qwen3.6-27B iluvatar bi-v100 adaptation
This commit is contained in:
3
qwen3_6_scripts/qwen3_5/__init__.py
Normal file
3
qwen3_6_scripts/qwen3_5/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .configuration_qwen3_5 import Qwen3_5Config, Qwen3_5TextConfig, Qwen3_5VisionConfig
|
||||
|
||||
__all__ = ["Qwen3_5Config", "Qwen3_5TextConfig", "Qwen3_5VisionConfig"]
|
||||
188
qwen3_6_scripts/qwen3_5/configuration_qwen3_5.py
Normal file
188
qwen3_6_scripts/qwen3_5/configuration_qwen3_5.py
Normal file
@@ -0,0 +1,188 @@
|
||||
# Adapted from transformers 5.2.0 for compatibility with transformers 4.55.3 + torch 2.1.0
|
||||
# Stubs layer_type_validation and RopeParameters which do not exist in 4.55.3
|
||||
|
||||
from typing import Optional, List
|
||||
|
||||
from ...configuration_utils import PretrainedConfig as PreTrainedConfig
|
||||
|
||||
# --- Local stubs for APIs not present in transformers 4.55.3 ---
|
||||
# Always use these definitions; do NOT import from the older transformers
|
||||
# as same-named functions there have incompatible signatures.
|
||||
|
||||
def layer_type_validation(layer_types, num_hidden_layers=None, attention=True):
    """Validate a per-layer list of attention kinds.

    Local stand-in for the transformers helper of the same name (see the
    stub notes above this function).

    Args:
        layer_types: Iterable of layer-kind strings. Each entry must be
            ``"full_attention"`` or ``"linear_attention"``.
        num_hidden_layers: When not ``None``, must equal ``len(layer_types)``.
        attention: Accepted for signature compatibility; not used here.

    Raises:
        ValueError: If an entry is not an allowed kind, or if the length of
            ``layer_types`` does not match ``num_hidden_layers``.
    """
    allowed = {"full_attention", "linear_attention"}

    # Reject on the first entry that is not a known layer kind.
    for kind in layer_types:
        if kind not in allowed:
            raise ValueError(f"layer_types entries must be in {allowed}, got {layer_types}")

    # The length check is optional; skip it when no layer count was supplied.
    if num_hidden_layers is None:
        return

    if len(layer_types) != num_hidden_layers:
        raise ValueError(
            f"num_hidden_layers ({num_hidden_layers}) != len(layer_types) ({len(layer_types)})"
        )
|
||||
|
||||
# RopeParameters stub: a TypedDict describing the RoPE settings dict.
# Only the import can legitimately fail (interpreters without
# typing.TypedDict), so only ImportError triggers the plain-dict fallback;
# any other error while defining the class is a real bug and should surface.
try:
    from typing import TypedDict
except ImportError:
    RopeParameters = dict
else:

    class RopeParameters(TypedDict, total=False):
        """Keys of a RoPE parameter dict; total=False makes every key optional."""

        rope_theta: float
        rope_type: str
        partial_rotary_factor: float
        factor: float
|
||||
|
||||
# --- End stubs ---
|
||||
|
||||
|
||||
class Qwen3_5TextConfig(PreTrainedConfig):
    r"""
    Configuration for the text backbone of Qwen3.5 / Qwen3.6-27B models.

    model_type is "qwen3_5_text" (used internally by the nested config).

    Each decoder layer is either "full_attention" or "linear_attention". When
    `layer_types` is omitted it is generated from `num_hidden_layers` and the
    optional `full_attention_interval` kwarg (default 4): every
    `full_attention_interval`-th layer is "full_attention" and all other
    layers are "linear_attention".

    Args:
        vocab_size, hidden_size, intermediate_size, num_hidden_layers,
        num_attention_heads, num_key_value_heads, hidden_act,
        max_position_embeddings, initializer_range, rms_norm_eps, use_cache,
        attention_bias, attention_dropout, head_dim, linear_conv_kernel_dim,
        linear_key_head_dim, linear_value_head_dim, linear_num_key_heads,
        linear_num_value_heads:
            Stored unchanged as attributes of the same name.
        rope_parameters: Stored unchanged (may be `None`).
        layer_types: Optional list of per-layer kinds; generated as described
            above when `None`, then validated.
        pad_token_id, bos_token_id, eos_token_id, tie_word_embeddings:
            Forwarded to the base-class constructor.
        **kwargs: Forwarded to the base-class constructor.
            `partial_rotary_factor` is defaulted to 0.25 if absent.

    Raises:
        ValueError: If `layer_types` contains an unknown entry or its length
            differs from `num_hidden_layers`.
    """

    model_type = "qwen3_5_text"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=248320,
        hidden_size=4096,
        intermediate_size=12288,
        num_hidden_layers=32,
        num_attention_heads=16,
        num_key_value_heads=4,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_parameters=None,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=256,
        linear_conv_kernel_dim=4,
        linear_key_head_dim=128,
        linear_value_head_dim=128,
        linear_num_key_heads=16,
        linear_num_value_heads=32,
        layer_types=None,
        pad_token_id=None,
        bos_token_id=None,
        eos_token_id=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.head_dim = head_dim
        self.rope_parameters = rope_parameters
        # Default travels through kwargs so the base class receives it with
        # the other extra arguments.
        kwargs.setdefault("partial_rotary_factor", 0.25)

        self.layer_types = layer_types
        if self.layer_types is None:
            # (i + 1) % interval is 0 on every interval-th layer, which makes
            # that layer full attention; all others are linear attention.
            interval_pattern = kwargs.get("full_attention_interval", 4)
            self.layer_types = [
                "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types, self.num_hidden_layers)

        self.linear_conv_kernel_dim = linear_conv_kernel_dim
        self.linear_key_head_dim = linear_key_head_dim
        self.linear_value_head_dim = linear_value_head_dim
        self.linear_num_key_heads = linear_num_key_heads
        self.linear_num_value_heads = linear_num_value_heads

        # The transformers 4.x PretrainedConfig.__init__ assigns these four
        # attributes itself from kwargs (defaults: None for the token ids,
        # True for tie_word_embeddings). Setting them on self before calling
        # super() would be silently overwritten, so hand them to the base
        # constructor instead.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
||||
|
||||
|
||||
class Qwen3_5VisionConfig(PreTrainedConfig):
    r"""
    Configuration for the vision part of Qwen3.5 / Qwen3.6 models.

    model_type is "qwen3_5_vision". Every named constructor argument is stored
    unchanged as an attribute of the same name; remaining keyword arguments
    are passed to the base-class constructor.
    """

    model_type = "qwen3_5_vision"

    def __init__(
        self,
        depth=27,
        hidden_size=1152,
        hidden_act="gelu_pytorch_tanh",
        intermediate_size=4304,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        out_hidden_size=3584,
        num_position_embeddings=2304,
        initializer_range=0.02,
        **kwargs,
    ):
        # Base class first, so the values below are applied on top of
        # whatever it initialises.
        super().__init__(**kwargs)

        # Ordered mapping keeps the attribute assignment order explicit.
        vision_fields = {
            "depth": depth,
            "hidden_size": hidden_size,
            "hidden_act": hidden_act,
            "intermediate_size": intermediate_size,
            "num_heads": num_heads,
            "in_channels": in_channels,
            "patch_size": patch_size,
            "spatial_merge_size": spatial_merge_size,
            "temporal_patch_size": temporal_patch_size,
            "out_hidden_size": out_hidden_size,
            "num_position_embeddings": num_position_embeddings,
            "initializer_range": initializer_range,
        }
        for field_name, field_value in vision_fields.items():
            setattr(self, field_name, field_value)
|
||||
|
||||
|
||||
class Qwen3_5Config(PreTrainedConfig):
    r"""
    Top-level configuration for Qwen3.5 / Qwen3.6-27B.

    model_type = "qwen3_5" matches the model card / config.json.
    Wraps Qwen3_5TextConfig (and optionally Qwen3_5VisionConfig for multimodal use).
    For vLLM text-only inference only text_config is consumed.

    Args:
        text_config: A dict of Qwen3_5TextConfig arguments, an already-built
            config object, or `None` for a default Qwen3_5TextConfig.
        vision_config: A dict of Qwen3_5VisionConfig arguments, an
            already-built config object, or `None` for a default
            Qwen3_5VisionConfig.
        image_token_id, video_token_id, vision_start_token_id,
        vision_end_token_id:
            Stored unchanged as attributes of the same name.
        tie_word_embeddings: Forwarded to the base-class constructor.
        **kwargs: Forwarded to the base-class constructor.
    """

    model_type = "qwen3_5"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=248056,
        video_token_id=248057,
        vision_start_token_id=248053,
        vision_end_token_id=248054,
        tie_word_embeddings=False,
        **kwargs,
    ):
        # Each sub-config may arrive as a plain dict, be omitted, or already
        # be a config object (used as-is).
        if isinstance(text_config, dict):
            self.text_config = Qwen3_5TextConfig(**text_config)
        elif text_config is None:
            self.text_config = Qwen3_5TextConfig()
        else:
            self.text_config = text_config

        if isinstance(vision_config, dict):
            self.vision_config = Qwen3_5VisionConfig(**vision_config)
        elif vision_config is None:
            self.vision_config = Qwen3_5VisionConfig()
        else:
            self.vision_config = vision_config

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.vision_end_token_id = vision_end_token_id

        # The transformers 4.x PretrainedConfig.__init__ assigns
        # tie_word_embeddings itself from kwargs (default True). Setting it on
        # self before calling super() would be silently overwritten, so hand
        # it to the base constructor instead.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
|
||||
|
||||
|
||||
__all__ = ["Qwen3_5Config", "Qwen3_5TextConfig", "Qwen3_5VisionConfig"]
|
||||
Reference in New Issue
Block a user