initial commit for qwen3.6-moe adaptation
This commit is contained in:
@@ -29,6 +29,7 @@ cp ./paged_attn.py /usr/local/corex/lib/python3/dist-packages/vllm/attention/ops
|
|||||||
# --- transformers: Qwen3_5 tokenizer / model files --------------------------
|
# --- transformers: Qwen3_5 tokenizer / model files --------------------------
|
||||||
pip install transformers==4.55.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
|
pip install transformers==4.55.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||||
cp -r ./qwen3_5 /usr/local/lib/python3.10/site-packages/transformers/models/
|
cp -r ./qwen3_5 /usr/local/lib/python3.10/site-packages/transformers/models/
|
||||||
|
cp -r ./qwen3_5_moe /usr/local/lib/python3.10/site-packages/transformers/models/
|
||||||
python3 ./patch_transformers_qwen3_5.py
|
python3 ./patch_transformers_qwen3_5.py
|
||||||
|
|
||||||
# --- vllm model: Qwen3.6-27B (Qwen3_5 arch) --------------------------------
|
# --- vllm model: Qwen3.6-27B (Qwen3_5 arch) --------------------------------
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
"""
|
"""
|
||||||
Patches transformers 4.55.3 to register the qwen3_5 model type.
|
Patches transformers 4.55.3 to register qwen3_5 and qwen3_5_moe model types.
|
||||||
|
|
||||||
Deploy steps on the remote machine:
|
Deploy steps on the remote machine:
|
||||||
1. cp -r modified_scripts/qwen3_5 /usr/local/lib/python3.10/site-packages/transformers/models/qwen3_5
|
1. cp -r modified_scripts/qwen3_5 /usr/local/lib/python3.10/site-packages/transformers/models/qwen3_5
|
||||||
2. python3 modified_scripts/patch_transformers_qwen3_5.py
|
2. cp -r modified_scripts/qwen3_5_moe /usr/local/lib/python3.10/site-packages/transformers/models/qwen3_5_moe
|
||||||
|
3. python3 modified_scripts/patch_transformers_qwen3_5.py
|
||||||
|
|
||||||
Target: pip-installed transformers at /usr/local/lib/python3.10/site-packages/transformers/
|
Target: pip-installed transformers at /usr/local/lib/python3.10/site-packages/transformers/
|
||||||
(Not the corex pre-installed path at /usr/local/corex/lib64/python3/dist-packages/)
|
(Not the corex pre-installed path at /usr/local/corex/lib64/python3/dist-packages/)
|
||||||
@@ -40,24 +41,23 @@ def patch_file(path, replacements):
|
|||||||
def main():
|
def main():
|
||||||
print(f"=== Patching {AUTO_CONFIG} ===")
|
print(f"=== Patching {AUTO_CONFIG} ===")
|
||||||
patch_file(AUTO_CONFIG, [
|
patch_file(AUTO_CONFIG, [
|
||||||
# CONFIG_MAPPING_NAMES: insert qwen3_5 right after qwen3
|
# CONFIG_MAPPING_NAMES: insert qwen3_5 + qwen3_5_moe right after qwen3
|
||||||
(
|
(
|
||||||
'("qwen3", "Qwen3Config"),',
|
'("qwen3", "Qwen3Config"),',
|
||||||
'("qwen3", "Qwen3Config"),\n ("qwen3_5", "Qwen3_5Config"),',
|
'("qwen3", "Qwen3Config"),\n ("qwen3_5", "Qwen3_5Config"),\n ("qwen3_5_moe", "Qwen3_5MoeConfig"),',
|
||||||
),
|
),
|
||||||
# Some versions don't have trailing comma — handle that too
|
|
||||||
(
|
(
|
||||||
'("qwen3", "Qwen3Config")\n',
|
'("qwen3", "Qwen3Config")\n',
|
||||||
'("qwen3", "Qwen3Config"),\n ("qwen3_5", "Qwen3_5Config"),\n',
|
'("qwen3", "Qwen3Config"),\n ("qwen3_5", "Qwen3_5Config"),\n ("qwen3_5_moe", "Qwen3_5MoeConfig"),\n',
|
||||||
),
|
),
|
||||||
# MODEL_NAMES_MAPPING (model_type -> human readable name, used by docstring generator)
|
# MODEL_NAMES_MAPPING (model_type -> human readable name)
|
||||||
(
|
(
|
||||||
'("qwen3", "Qwen3"),',
|
'("qwen3", "Qwen3"),',
|
||||||
'("qwen3", "Qwen3"),\n ("qwen3_5", "Qwen3_5"),',
|
'("qwen3", "Qwen3"),\n ("qwen3_5", "Qwen3_5"),\n ("qwen3_5_moe", "Qwen3_5_MoE"),',
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
'("qwen3", "Qwen3")\n',
|
'("qwen3", "Qwen3")\n',
|
||||||
'("qwen3", "Qwen3"),\n ("qwen3_5", "Qwen3_5"),\n',
|
'("qwen3", "Qwen3"),\n ("qwen3_5", "Qwen3_5"),\n ("qwen3_5_moe", "Qwen3_5_MoE"),\n',
|
||||||
),
|
),
|
||||||
])
|
])
|
||||||
|
|
||||||
@@ -65,7 +65,7 @@ def main():
|
|||||||
patch_file(MODELS_INIT, [
|
patch_file(MODELS_INIT, [
|
||||||
(
|
(
|
||||||
"from .qwen3 import *\n",
|
"from .qwen3 import *\n",
|
||||||
"from .qwen3 import *\n from .qwen3_5 import *\n",
|
"from .qwen3 import *\n from .qwen3_5 import *\n from .qwen3_5_moe import *\n",
|
||||||
),
|
),
|
||||||
])
|
])
|
||||||
|
|
||||||
@@ -74,19 +74,39 @@ def main():
|
|||||||
try:
|
try:
|
||||||
import importlib.util, types
|
import importlib.util, types
|
||||||
|
|
||||||
# Quick smoke-test: import the config class directly
|
def _load_config_mod(module_name, file_path):
|
||||||
spec = importlib.util.spec_from_file_location(
|
spec = importlib.util.spec_from_file_location(module_name, file_path)
|
||||||
"configuration_qwen3_5",
|
mod = importlib.util.module_from_spec(spec)
|
||||||
|
mod.__package__ = ".".join(module_name.split(".")[:-1])
|
||||||
|
pkg = sys.modules.setdefault("transformers", types.ModuleType("transformers"))
|
||||||
|
pkg.__path__ = [TRANSFORMERS_ROOT]
|
||||||
|
cu = sys.modules.setdefault(
|
||||||
|
"transformers.configuration_utils", types.ModuleType("transformers.configuration_utils"))
|
||||||
|
class _PC:
|
||||||
|
def __init__(self, **kwargs): pass
|
||||||
|
cu.PretrainedConfig = _PC
|
||||||
|
for sub in ("transformers.models", f"transformers.models.{module_name.split('.')[-2]}"):
|
||||||
|
m = sys.modules.setdefault(sub, types.ModuleType(sub))
|
||||||
|
m.__path__ = [TRANSFORMERS_ROOT]
|
||||||
|
spec.loader.exec_module(mod)
|
||||||
|
return mod
|
||||||
|
|
||||||
|
mod27 = _load_config_mod(
|
||||||
|
"transformers.models.qwen3_5.configuration_qwen3_5",
|
||||||
f"{TRANSFORMERS_ROOT}/models/qwen3_5/configuration_qwen3_5.py",
|
f"{TRANSFORMERS_ROOT}/models/qwen3_5/configuration_qwen3_5.py",
|
||||||
)
|
)
|
||||||
mod = importlib.util.module_from_spec(spec)
|
cfg = mod27.Qwen3_5Config()
|
||||||
# Provide minimal parent package stubs so relative imports resolve
|
|
||||||
pkg = types.ModuleType("transformers")
|
|
||||||
pkg.__path__ = [TRANSFORMERS_ROOT]
|
|
||||||
sys.modules.setdefault("transformers", pkg)
|
|
||||||
spec.loader.exec_module(mod)
|
|
||||||
cfg = mod.Qwen3_5Config()
|
|
||||||
print(f" Qwen3_5Config() smoke-test OK (model_type={cfg.model_type})")
|
print(f" Qwen3_5Config() smoke-test OK (model_type={cfg.model_type})")
|
||||||
|
|
||||||
|
mod35 = _load_config_mod(
|
||||||
|
"transformers.models.qwen3_5_moe.configuration_qwen3_5_moe",
|
||||||
|
f"{TRANSFORMERS_ROOT}/models/qwen3_5_moe/configuration_qwen3_5_moe.py",
|
||||||
|
)
|
||||||
|
moe_cfg = mod35.Qwen3_5MoeConfig()
|
||||||
|
print(f" Qwen3_5MoeConfig() smoke-test OK (model_type={moe_cfg.model_type})")
|
||||||
|
t = moe_cfg.text_config
|
||||||
|
print(f" num_experts={t.num_experts}, top_k={t.num_experts_per_tok}, "
|
||||||
|
f"shared={t.shared_expert_intermediate_size}, layers={t.num_hidden_layers}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" [warn] smoke-test failed (may be fine at runtime): {e}")
|
print(f" [warn] smoke-test failed (may be fine at runtime): {e}")
|
||||||
|
|
||||||
|
|||||||
@@ -45,7 +45,8 @@ def main():
|
|||||||
' "Qwen3MoeForCausalLM": ("qwen3_moe", "Qwen3MoeForCausalLM"),',
|
' "Qwen3MoeForCausalLM": ("qwen3_moe", "Qwen3MoeForCausalLM"),',
|
||||||
' "Qwen3ForCausalLM": ("qwen3", "Qwen3ForCausalLM"),\n'
|
' "Qwen3ForCausalLM": ("qwen3", "Qwen3ForCausalLM"),\n'
|
||||||
' "Qwen3MoeForCausalLM": ("qwen3_moe", "Qwen3MoeForCausalLM"),\n'
|
' "Qwen3MoeForCausalLM": ("qwen3_moe", "Qwen3MoeForCausalLM"),\n'
|
||||||
' "Qwen3_5ForCausalLM": ("qwen3_5", "Qwen3_5ForCausalLM"),',
|
' "Qwen3_5ForCausalLM": ("qwen3_5", "Qwen3_5ForCausalLM"),\n'
|
||||||
|
' "Qwen3_5MoeForCausalLM": ("qwen3_5", "Qwen3_5MoeForCausalLM"),',
|
||||||
),
|
),
|
||||||
])
|
])
|
||||||
|
|
||||||
@@ -61,11 +62,13 @@ def main():
|
|||||||
spec.loader.exec_module(mod)
|
spec.loader.exec_module(mod)
|
||||||
cls = mod.Qwen3_5ForCausalLM
|
cls = mod.Qwen3_5ForCausalLM
|
||||||
print(f" Qwen3_5ForCausalLM found: {cls}")
|
print(f" Qwen3_5ForCausalLM found: {cls}")
|
||||||
|
cls_moe = mod.Qwen3_5MoeForCausalLM
|
||||||
|
print(f" Qwen3_5MoeForCausalLM found: {cls_moe}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" [warn] verification failed (may be OK at runtime): {e}")
|
print(f" [warn] verification failed (may be OK at runtime): {e}")
|
||||||
|
|
||||||
print("\nDone. Remember to:")
|
print("\nDone. Remember to:")
|
||||||
print(" 1. Set config.json 'architectures': ['Qwen3_5ForCausalLM']")
|
print(" 1. Set config.json 'architectures': ['Qwen3_5ForCausalLM'] or ['Qwen3_5MoEForCausalLM']")
|
||||||
print(" 2. Run patch_transformers_qwen3_5.py if not already done")
|
print(" 2. Run patch_transformers_qwen3_5.py if not already done")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -11,12 +11,15 @@ from torch import nn
|
|||||||
from vllm.attention import Attention, AttentionMetadata
|
from vllm.attention import Attention, AttentionMetadata
|
||||||
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
|
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
|
||||||
from vllm.distributed import (get_tensor_model_parallel_rank,
|
from vllm.distributed import (get_tensor_model_parallel_rank,
|
||||||
get_tensor_model_parallel_world_size)
|
get_tensor_model_parallel_world_size,
|
||||||
|
tensor_model_parallel_all_reduce)
|
||||||
from vllm.model_executor.layers.activation import SiluAndMul
|
from vllm.model_executor.layers.activation import SiluAndMul
|
||||||
from vllm.model_executor.layers.layernorm import GemmaRMSNorm
|
from vllm.model_executor.layers.layernorm import GemmaRMSNorm
|
||||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||||
MergedColumnParallelLinear,
|
MergedColumnParallelLinear,
|
||||||
|
ReplicatedLinear,
|
||||||
RowParallelLinear)
|
RowParallelLinear)
|
||||||
|
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||||
@@ -495,16 +498,35 @@ class Qwen3_5FullAttention(nn.Module):
|
|||||||
|
|
||||||
tp_size = get_tensor_model_parallel_world_size()
|
tp_size = get_tensor_model_parallel_world_size()
|
||||||
self.local_num_heads = self.num_heads // tp_size
|
self.local_num_heads = self.num_heads // tp_size
|
||||||
self.local_num_kv_heads = max(1, self.num_kv_heads // tp_size)
|
|
||||||
self.local_q_dim = self.local_num_heads * self.head_dim
|
|
||||||
self.local_kv_dim = self.local_num_kv_heads * self.head_dim
|
|
||||||
self.scaling = self.head_dim ** -0.5
|
self.scaling = self.head_dim ** -0.5
|
||||||
|
|
||||||
# q_proj includes gate: output = num_heads * head_dim * 2
|
# When num_kv_heads < tp_size we cannot shard KV further (would give
|
||||||
self.q_proj = ColumnParallelLinear(
|
# fractional heads per rank). Use ReplicatedLinear so every rank holds
|
||||||
self.hidden_size, self.num_heads * self.head_dim * 2,
|
# all KV heads; forward() then keeps only the one KV head this rank's Q heads use (local_num_kv_heads = 1).
|
||||||
bias=False, quant_config=quant_config,
|
# When num_kv_heads >= tp_size standard ColumnParallel sharding applies.
|
||||||
prefix=f"{prefix}.q_proj")
|
if tp_size > self.num_kv_heads:
|
||||||
|
# GQA-aware TP sharding: ixformer kernel only supports num_kv_heads=1
|
||||||
|
# per rank. With num_kv_heads=2 < tp_size=4 we cannot shard KV
|
||||||
|
# evenly, but we CAN assign each rank the ONE KV head that serves
|
||||||
|
# its Q heads:
|
||||||
|
# q_per_kv = num_heads // num_kv_heads (e.g. 16//2 = 8)
|
||||||
|
# Rank r uses KV head r * local_num_heads // q_per_kv
|
||||||
|
# e.g. ranks 0,1 → KV head 0; ranks 2,3 → KV head 1.
|
||||||
|
# We replicate all KV heads to every rank and select in forward().
|
||||||
|
self.proj_kv_heads = self.num_kv_heads # heads available from projection
|
||||||
|
self.local_num_kv_heads = 1 # heads after rank-local selection
|
||||||
|
self.q_per_kv_global = self.num_heads // self.num_kv_heads
|
||||||
|
self.k_proj = ReplicatedLinear(
|
||||||
|
self.hidden_size, self.num_kv_heads * self.head_dim,
|
||||||
|
bias=False, quant_config=quant_config)
|
||||||
|
self.v_proj = ReplicatedLinear(
|
||||||
|
self.hidden_size, self.num_kv_heads * self.head_dim,
|
||||||
|
bias=False, quant_config=quant_config)
|
||||||
|
else:
|
||||||
|
# Standard sharding: each rank gets num_kv_heads // tp_size heads.
|
||||||
|
self.local_num_kv_heads = self.num_kv_heads // tp_size
|
||||||
|
self.proj_kv_heads = self.local_num_kv_heads # already sharded
|
||||||
|
self.q_per_kv_global = None
|
||||||
self.k_proj = ColumnParallelLinear(
|
self.k_proj = ColumnParallelLinear(
|
||||||
self.hidden_size, self.num_kv_heads * self.head_dim,
|
self.hidden_size, self.num_kv_heads * self.head_dim,
|
||||||
bias=False, quant_config=quant_config,
|
bias=False, quant_config=quant_config,
|
||||||
@@ -513,6 +535,15 @@ class Qwen3_5FullAttention(nn.Module):
|
|||||||
self.hidden_size, self.num_kv_heads * self.head_dim,
|
self.hidden_size, self.num_kv_heads * self.head_dim,
|
||||||
bias=False, quant_config=quant_config,
|
bias=False, quant_config=quant_config,
|
||||||
prefix=f"{prefix}.v_proj")
|
prefix=f"{prefix}.v_proj")
|
||||||
|
|
||||||
|
self.local_q_dim = self.local_num_heads * self.head_dim
|
||||||
|
self.local_kv_dim = self.local_num_kv_heads * self.head_dim
|
||||||
|
|
||||||
|
# q_proj includes gate: output = num_heads * head_dim * 2
|
||||||
|
self.q_proj = ColumnParallelLinear(
|
||||||
|
self.hidden_size, self.num_heads * self.head_dim * 2,
|
||||||
|
bias=False, quant_config=quant_config,
|
||||||
|
prefix=f"{prefix}.q_proj")
|
||||||
self.o_proj = RowParallelLinear(
|
self.o_proj = RowParallelLinear(
|
||||||
self.num_heads * self.head_dim, self.hidden_size,
|
self.num_heads * self.head_dim, self.hidden_size,
|
||||||
bias=False, quant_config=quant_config,
|
bias=False, quant_config=quant_config,
|
||||||
@@ -559,18 +590,34 @@ class Qwen3_5FullAttention(nn.Module):
|
|||||||
q = qg[:, :, :self.head_dim].reshape(total_tokens, -1)
|
q = qg[:, :, :self.head_dim].reshape(total_tokens, -1)
|
||||||
gate = qg[:, :, self.head_dim:].reshape(total_tokens, -1)
|
gate = qg[:, :, self.head_dim:].reshape(total_tokens, -1)
|
||||||
|
|
||||||
k, _ = self.k_proj(hidden_states) # (total, local_kv_dim)
|
k, _ = self.k_proj(hidden_states) # (total, proj_kv_heads * head_dim)
|
||||||
v, _ = self.v_proj(hidden_states)
|
v, _ = self.v_proj(hidden_states)
|
||||||
|
|
||||||
# Per-head RMSNorm
|
# q_norm on local Q heads
|
||||||
q = self.q_norm.forward_cuda(
|
q = self.q_norm.forward_cuda(
|
||||||
q.view(total_tokens, self.local_num_heads, self.head_dim)
|
q.view(total_tokens, self.local_num_heads, self.head_dim)
|
||||||
.contiguous()).view(total_tokens, -1)
|
.contiguous()).view(total_tokens, -1)
|
||||||
|
|
||||||
|
# GQA-aware TP: select rank-local KV head BEFORE k_norm and rope so
|
||||||
|
# that ixformer kernels always see num_kv_heads=1 (same as 27B path).
|
||||||
|
# Doing k_norm/rope on 2 KV heads (proj_kv_heads=2) triggers ixformer
|
||||||
|
# paths that can produce NaN; restricting to 1 head avoids the issue.
|
||||||
|
if self.q_per_kv_global is not None:
|
||||||
|
tp_rank = get_tensor_model_parallel_rank()
|
||||||
|
kv_idx = (tp_rank * self.local_num_heads) // self.q_per_kv_global
|
||||||
|
k = (k.view(total_tokens, self.proj_kv_heads, self.head_dim)
|
||||||
|
[:, kv_idx, :].contiguous()) # (T, head_dim) — 1 head
|
||||||
|
v = (v.view(total_tokens, self.proj_kv_heads, self.head_dim)
|
||||||
|
[:, kv_idx, :].contiguous()) # (T, head_dim) — 1 head
|
||||||
|
|
||||||
|
# k_norm on the (now always 1) rank-local KV head
|
||||||
k = self.k_norm.forward_cuda(
|
k = self.k_norm.forward_cuda(
|
||||||
k.view(total_tokens, self.local_num_kv_heads, self.head_dim)
|
k.view(total_tokens, self.local_num_kv_heads, self.head_dim)
|
||||||
.contiguous()).view(total_tokens, -1)
|
.contiguous()).view(total_tokens, -1)
|
||||||
|
|
||||||
|
# rope: q=(T, local_num_heads*head_dim), k=(T, 1*head_dim) — mirrors 27B
|
||||||
q, k = self.rotary_emb(positions, q, k)
|
q, k = self.rotary_emb(positions, q, k)
|
||||||
|
|
||||||
attn_out = self.attn(q, k, v, kv_cache, attn_metadata)
|
attn_out = self.attn(q, k, v, kv_cache, attn_metadata)
|
||||||
|
|
||||||
# Multiply by sigmoid gate before output projection
|
# Multiply by sigmoid gate before output projection
|
||||||
@@ -609,10 +656,130 @@ class Qwen3_5MLP(nn.Module):
|
|||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# MoE sparse block (Qwen3.5-MoE / Qwen3.6-35B-A3B)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class Qwen3_5MoeSparseBlock(nn.Module):
    """Sparse mixture-of-experts MLP that replaces Qwen3_5MLP for
    qwen3_5_moe_text layers.

    FusedMoE is used ONLY for weight storage and loading (create_weights /
    weight_loader are pure PyTorch). Its forward kernel is bypassed because
    ixformer on BI-V100 lacks vllm_moe_topk_softmax /
    vllm_invoke_fused_moe_kernel. Routing and expert computation use a
    pure-PyTorch loop instead (see ``_pure_pytorch_experts``).

    The shared expert uses RowParallelLinear(reduce_results=False), so both the
    routed and the shared path produce partial (pre-all-reduce) outputs that
    are summed before a single all-reduce in ``forward``.

    Args:
        text_cfg: text config object; this block reads ``hidden_size``,
            ``num_experts``, ``num_experts_per_tok``,
            ``moe_intermediate_size`` and ``shared_expert_intermediate_size``.
        quant_config: optional quantization config forwarded to every
            sub-layer.
    """

    def __init__(
        self,
        text_cfg,
        quant_config: "Optional[QuantizationConfig]" = None,
    ) -> None:
        super().__init__()
        hidden_size = text_cfg.hidden_size
        self.num_experts = text_cfg.num_experts
        self.top_k = text_cfg.num_experts_per_tok

        # Router: replicated (small: num_experts outputs)
        self.gate = ReplicatedLinear(hidden_size, text_cfg.num_experts,
                                     bias=False, quant_config=quant_config)

        # FusedMoE: only used for weight storage + weight_loader.
        # Forward is bypassed — see _pure_pytorch_experts().
        self.experts = FusedMoE(
            num_experts=text_cfg.num_experts,
            top_k=text_cfg.num_experts_per_tok,
            hidden_size=hidden_size,
            intermediate_size=text_cfg.moe_intermediate_size,
            reduce_results=False,  # we do the all-reduce ourselves in forward()
            renormalize=True,
            quant_config=quant_config,
        )

        # Shared expert: defer all-reduce to combine with routed output first
        shared_size = text_cfg.shared_expert_intermediate_size
        self.shared_expert_gate_up = MergedColumnParallelLinear(
            hidden_size, [shared_size] * 2, bias=False,
            quant_config=quant_config)
        self.shared_expert_down = RowParallelLinear(
            shared_size, hidden_size, bias=False, reduce_results=False,
            quant_config=quant_config)
        self.act_fn = SiluAndMul()
        # Scalar sigmoid gate on the shared expert output:
        #   shared_out *= sigmoid(shared_expert_gate(hidden_states))
        # Without this the shared expert is always fully active -> wrong logits.
        self.shared_expert_gate = ReplicatedLinear(
            hidden_size, 1, bias=False, quant_config=quant_config)

    def _pure_pytorch_experts(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor:
        """Route tokens and run the selected experts in pure PyTorch.

        Routing is softmax over ``router_logits`` (in float32), top-k
        selection, then renormalisation of the selected weights so they sum
        to one per token.

        Expected weight layouts (as stored by FusedMoE):
            w13_weight: (num_experts, 2*inter_per_partition, hidden) [TP-sharded]
            w2_weight:  (num_experts, hidden, inter_per_partition)   [TP-sharded]

        NOTE(review): the weights are read as plain tensors, so this
        presumably only holds for unquantized checkpoints — confirm before
        enabling a quant_config.

        Args:
            hidden_states: token activations, indexed along dim 0 by token.
            router_logits: per-token router scores, one column per expert.

        Returns:
            Tensor shaped like ``hidden_states``. The result is partial
            (pre-all-reduce), the same contract as FusedMoE with
            reduce_results=False.
        """
        # Routing: softmax -> topk -> renormalise
        routing_weights = torch.softmax(router_logits.float(), dim=-1)
        topk_weights, topk_ids = torch.topk(
            routing_weights, self.top_k, dim=-1)  # (T, top_k)
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
        topk_weights = topk_weights.to(hidden_states.dtype)

        out = torch.zeros_like(hidden_states)
        w13 = self.experts.w13_weight  # (E, 2*I, H)
        w2 = self.experts.w2_weight    # (E, H, I)

        # Visit only experts that actually received a token. torch.unique
        # returns ids in ascending order, so the accumulation order matches a
        # plain range(num_experts) sweep while skipping the idle experts (and
        # the per-expert nonzero() they would otherwise cost).
        for eid in torch.unique(topk_ids).tolist():
            tok_ids, topk_pos = (topk_ids == eid).nonzero(as_tuple=True)

            tokens = hidden_states[tok_ids]  # (n, H)
            # gate + up projection (ColumnParallel shard)
            gate_up = nn.functional.linear(tokens, w13[eid])  # (n, 2*I)
            gate, up = gate_up.chunk(2, dim=-1)
            act = nn.functional.silu(gate) * up  # (n, I)
            # down projection (RowParallel shard) — result is partial.
            # linear(x, W) = x @ W.T; w2[eid] is (H, I) so the output is (n, H).
            expert_out = nn.functional.linear(act, w2[eid])

            weights = topk_weights[tok_ids, topk_pos].unsqueeze(-1)
            out.index_add_(0, tok_ids, (expert_out * weights).to(out.dtype))

        return out  # partial, all-reduce done in forward()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Combine routed-expert and gated shared-expert outputs.

        Both contributions are partial sums under tensor parallelism; they are
        added first and all-reduced once when ``self.experts.tp_size > 1``.
        """
        router_logits, _ = self.gate(hidden_states)
        routed_out = self._pure_pytorch_experts(hidden_states, router_logits)

        gate_up, _ = self.shared_expert_gate_up(hidden_states)
        shared_out = self.act_fn(gate_up)
        shared_out, _ = self.shared_expert_down(shared_out)
        # Scalar sigmoid gate on the shared expert output.
        gate_score, _ = self.shared_expert_gate(hidden_states)  # (T, 1)
        shared_out = shared_out * torch.sigmoid(gate_score)

        out = routed_out + shared_out
        if self.experts.tp_size > 1:
            out = tensor_model_parallel_all_reduce(out)
        return out
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Decoder layer (dispatches to GatedDeltaNet or Qwen3_5FullAttention)
|
# Decoder layer (dispatches to GatedDeltaNet or Qwen3_5FullAttention)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
class Qwen3_5DecoderLayer(nn.Module):
|
class Qwen3_5DecoderLayer(nn.Module):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -623,6 +790,7 @@ class Qwen3_5DecoderLayer(nn.Module):
|
|||||||
quant_config: Optional[QuantizationConfig] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
self.layer_idx = layer_idx
|
||||||
self.layer_type = layer_type
|
self.layer_type = layer_type
|
||||||
self.input_layernorm = GemmaRMSNorm(text_cfg.hidden_size,
|
self.input_layernorm = GemmaRMSNorm(text_cfg.hidden_size,
|
||||||
eps=text_cfg.rms_norm_eps)
|
eps=text_cfg.rms_norm_eps)
|
||||||
@@ -640,6 +808,9 @@ class Qwen3_5DecoderLayer(nn.Module):
|
|||||||
prefix=f"layers.{layer_idx}.self_attn",
|
prefix=f"layers.{layer_idx}.self_attn",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if getattr(text_cfg, 'model_type', '') == 'qwen3_5_moe_text':
|
||||||
|
self.mlp = Qwen3_5MoeSparseBlock(text_cfg, quant_config=quant_config)
|
||||||
|
else:
|
||||||
self.mlp = Qwen3_5MLP(
|
self.mlp = Qwen3_5MLP(
|
||||||
hidden_size=text_cfg.hidden_size,
|
hidden_size=text_cfg.hidden_size,
|
||||||
intermediate_size=text_cfg.intermediate_size,
|
intermediate_size=text_cfg.intermediate_size,
|
||||||
@@ -673,7 +844,9 @@ class Qwen3_5DecoderLayer(nn.Module):
|
|||||||
|
|
||||||
hidden_states, residual = self.post_attention_layernorm(
|
hidden_states, residual = self.post_attention_layernorm(
|
||||||
hidden_states, residual)
|
hidden_states, residual)
|
||||||
|
|
||||||
hidden_states = self.mlp(hidden_states)
|
hidden_states = self.mlp(hidden_states)
|
||||||
|
|
||||||
return hidden_states, residual
|
return hidden_states, residual
|
||||||
|
|
||||||
|
|
||||||
@@ -860,8 +1033,9 @@ class Qwen3_5ForCausalLM(nn.Module, HasInnerState, SupportsLoRA):
|
|||||||
# With chunked prefill, intermediate chunks have seq_groups=None on all
|
# With chunked prefill, intermediate chunks have seq_groups=None on all
|
||||||
# ranks; _apply_logits_processors is guarded against this in
|
# ranks; _apply_logits_processors is guarded against this in
|
||||||
# logits_processor.py (patched by patch_xformers_sdpa_seq.py).
|
# logits_processor.py (patched by patch_xformers_sdpa_seq.py).
|
||||||
return self.logits_processor(self.lm_head, hidden_states,
|
logits = self.logits_processor(self.lm_head, hidden_states,
|
||||||
sampling_metadata)
|
sampling_metadata)
|
||||||
|
return logits
|
||||||
|
|
||||||
def sample(
|
def sample(
|
||||||
self,
|
self,
|
||||||
@@ -892,12 +1066,9 @@ class Qwen3_5ForCausalLM(nn.Module, HasInnerState, SupportsLoRA):
|
|||||||
or name.startswith("model.mtp")):
|
or name.startswith("model.mtp")):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Remap checkpoint prefix → module path
|
# Prefix remapping: checkpoint may wrap under language_model
|
||||||
# Checkpoint: "model.language_model.{rest}" → our module: "model.{rest}"
|
|
||||||
# Checkpoint: "lm_head.weight" → our module: "lm_head.weight"
|
|
||||||
if name.startswith("model.language_model."):
|
if name.startswith("model.language_model."):
|
||||||
name = "model." + name[len("model.language_model."):]
|
name = "model." + name[len("model.language_model."):]
|
||||||
# lm_head is already at top level — no change needed
|
|
||||||
|
|
||||||
# Skip positional embedding caches
|
# Skip positional embedding caches
|
||||||
if "rotary_emb.inv_freq" in name:
|
if "rotary_emb.inv_freq" in name:
|
||||||
@@ -931,3 +1102,118 @@ class Qwen3_5ForCausalLM(nn.Module, HasInnerState, SupportsLoRA):
|
|||||||
weight_loader = getattr(param, "weight_loader",
|
weight_loader = getattr(param, "weight_loader",
|
||||||
default_weight_loader)
|
default_weight_loader)
|
||||||
weight_loader(param, loaded_weight)
|
weight_loader(param, loaded_weight)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Qwen3.6-35B-A3B (Qwen3_5-MoE architecture)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class Qwen3_5MoeForCausalLM(Qwen3_5ForCausalLM):
    """Qwen3.6-35B-A3B (Qwen3_5-MoE architecture).

    Inherits everything from Qwen3_5ForCausalLM; only ``load_weights`` is
    overridden, to map the MoE checkpoint layout onto Qwen3_5MoeSparseBlock's
    parameters.
    """

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Load checkpoint tensors into this model's parameters.

        Checkpoint keys handled specially:
            mlp.experts.gate_up_proj   (num_experts, 2*intermediate, hidden)
            mlp.experts.down_proj      (num_experts, hidden, intermediate)
            mlp.shared_expert.{gate,up,down}_proj.weight   [shared MLP]

        Corresponding parameters on our side:
            mlp.experts.w13_weight / mlp.experts.w2_weight   (FusedMoE storage)
            mlp.shared_expert_gate_up.weight                 (merged gate+up)
            mlp.shared_expert_down.weight

        Names that do not resolve to a parameter of this model are skipped
        silently.

        Args:
            weights: iterable of ``(checkpoint_name, tensor)`` pairs.
        """
        # (param_name, weight_name, shard_id); the first matching entry wins,
        # so the shared-expert entries must stay ahead of the generic ones.
        stacked_params_mapping = [
            ("shared_expert_gate_up", "shared_expert.gate_proj", 0),
            ("shared_expert_gate_up", "shared_expert.up_proj", 1),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        # Vision tower and MTP branches are not loaded.
        skipped_prefixes = ("model.visual", "mtp.", "model.mtp")
        wrapped_prefix = "model.language_model."

        params_dict = dict(self.named_parameters())

        for name, loaded_weight in weights:
            if name.startswith(skipped_prefixes):
                continue

            # The checkpoint may nest the text model under language_model:
            #   model.language_model.{rest} -> model.{rest}
            if name.startswith(wrapped_prefix):
                name = "model." + name[len(wrapped_prefix):]

            if "rotary_emb.inv_freq" in name:
                continue

            if ".linear_attn.conv1d.weight" in name:
                name = name.replace(".linear_attn.conv1d.weight",
                                    ".linear_attn.conv1d_weight")

            # --- Fused routed-expert weights (all experts in one tensor) ---
            if "mlp.experts.gate_up_proj" in name:
                param = params_dict.get(
                    name.replace("mlp.experts.gate_up_proj",
                                 "mlp.experts.w13_weight"))
                if param is None:
                    continue
                # Dim 1 holds the gate rows followed by the up rows.
                half = loaded_weight.shape[1] // 2
                gate_w = loaded_weight[:, :half, :].contiguous()
                up_w = loaded_weight[:, half:, :].contiguous()
                for eid, (gate_slice, up_slice) in enumerate(zip(gate_w, up_w)):
                    param.weight_loader(param, gate_slice, "w1_weight", "w1", eid)
                    param.weight_loader(param, up_slice, "w3_weight", "w3", eid)
                continue

            if "mlp.experts.down_proj" in name:
                param = params_dict.get(
                    name.replace("mlp.experts.down_proj",
                                 "mlp.experts.w2_weight"))
                if param is None:
                    continue
                for eid, expert_w in enumerate(loaded_weight):
                    param.weight_loader(param, expert_w, "w2_weight", "w2", eid)
                continue

            # --- Shared expert down_proj rename ---
            if "mlp.shared_expert.down_proj" in name:
                name = name.replace("mlp.shared_expert.down_proj",
                                    "mlp.shared_expert_down")
                if name in params_dict:
                    param = params_dict[name]
                    loader = getattr(param, "weight_loader",
                                     default_weight_loader)
                    loader(param, loaded_weight)
                continue

            # --- Stacked (merged gate/up) weights ---
            match = next(
                (entry for entry in stacked_params_mapping if entry[1] in name),
                None)
            if match is not None:
                param_name, weight_name, shard_id = match
                name = name.replace(weight_name, param_name)
                if name in params_dict:
                    param = params_dict[name]
                    param.weight_loader(param, loaded_weight, shard_id)
                continue

            # --- Everything else: load under the (possibly remapped) name ---
            if name not in params_dict:
                continue
            param = params_dict[name]
            loader = getattr(param, "weight_loader", default_weight_loader)
            loader(param, loaded_weight)
|
||||||
|
|||||||
3
qwen3_6_scripts/qwen3_5_moe/__init__.py
Normal file
3
qwen3_6_scripts/qwen3_5_moe/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
from .configuration_qwen3_5_moe import Qwen3_5MoeConfig, Qwen3_5MoeTextConfig
|
||||||
|
|
||||||
|
__all__ = ["Qwen3_5MoeConfig", "Qwen3_5MoeTextConfig"]
|
||||||
198
qwen3_6_scripts/qwen3_5_moe/configuration_qwen3_5_moe.py
Normal file
198
qwen3_6_scripts/qwen3_5_moe/configuration_qwen3_5_moe.py
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
# Adapted from transformers 5.2.0 for compatibility with transformers 4.55.3 + torch 2.1.0
|
||||||
|
# Source: transformers/models/qwen3_5_moe/configuration_qwen3_5_moe.py
|
||||||
|
# Stubs layer_type_validation and RopeParameters which do not exist in 4.55.3
|
||||||
|
# Removes ignore_keys_at_rope_validation / base_model_tp_plan / base_model_pp_plan
|
||||||
|
# which are 5.x-only and irrelevant for vLLM inference.
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from ...configuration_utils import PretrainedConfig as PreTrainedConfig
|
||||||
|
|
||||||
|
# --- Local stubs for APIs not present in transformers 4.55.3 ---
|
||||||
|
def layer_type_validation(layer_types, num_hidden_layers=None, attention=True):
    """Check that ``layer_types`` is a well-formed per-layer attention spec.

    Local stand-in for the helper of the same name that newer transformers
    releases provide.

    Args:
        layer_types: iterable of layer kind strings; every entry must be
            "full_attention" or "linear_attention".
        num_hidden_layers: when not ``None``, ``len(layer_types)`` must equal
            this value.
        attention: accepted for signature compatibility only; not used here.

    Raises:
        ValueError: on an unknown layer kind or a length mismatch.
    """
    allowed = {"full_attention", "linear_attention"}

    # Stop at the first entry that is not a recognised layer kind.
    for entry in layer_types:
        if entry not in allowed:
            raise ValueError(f"layer_types entries must be in {allowed}, got {layer_types}")

    # No expected depth supplied: nothing further to verify.
    if num_hidden_layers is None:
        return

    if len(layer_types) != num_hidden_layers:
        raise ValueError(
            f"num_hidden_layers ({num_hidden_layers}) != len(layer_types) ({len(layer_types)})"
        )
|
||||||
|
|
||||||
|
# Keep the ``try`` limited to the import so that only a genuinely missing
# ``TypedDict`` triggers the fallback; any other error should surface instead
# of silently degrading ``RopeParameters`` to a plain ``dict``.
try:
    from typing import TypedDict
except ImportError:
    # Untyped stand-in: callers can still build and read it like a mapping.
    RopeParameters = dict
else:

    class RopeParameters(TypedDict, total=False):
        """Typed mapping of RoPE settings; every key is optional (total=False)."""

        rope_theta: float
        rope_type: str
        partial_rotary_factor: float
        factor: float
|
||||||
|
|
||||||
|
# --- End stubs ---
|
||||||
|
|
||||||
|
|
||||||
|
class Qwen3_5MoeTextConfig(PreTrainedConfig):
    r"""
    Configuration for the text backbone of Qwen3.5-MoE / Qwen3.6-35B-A3B models.
    model_type is "qwen3_5_moe_text" (used internally by the nested config).

    Every constructor argument is stored as an attribute of the same name.
    Notes on the arguments that get special handling:

    - ``layer_types``: per-layer attention kind, each entry "full_attention" or
      "linear_attention". When ``None`` it is derived from ``num_hidden_layers``
      and the optional ``full_attention_interval`` kwarg (default 4): layers
      whose 1-based index is a multiple of the interval are "full_attention",
      all others are "linear_attention".
    - ``partial_rotary_factor``: set to 0.25 in ``kwargs`` when the caller did
      not supply it, then handed to the base class with the other kwargs.
    - ``pad_token_id`` / ``bos_token_id`` / ``eos_token_id`` /
      ``tie_word_embeddings``: forwarded to the base-class constructor rather
      than assigned here (see the comment at the ``super().__init__`` call).

    Raises:
        ValueError: if ``layer_types`` contains an unknown entry or its length
            differs from ``num_hidden_layers`` (raised by
            ``layer_type_validation``).
    """

    model_type = "qwen3_5_moe_text"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=248320,
        hidden_size=2048,
        num_hidden_layers=40,
        num_attention_heads=16,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_parameters=None,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=256,
        linear_conv_kernel_dim=4,
        linear_key_head_dim=128,
        linear_value_head_dim=128,
        linear_num_key_heads=16,
        linear_num_value_heads=32,
        moe_intermediate_size=512,
        shared_expert_intermediate_size=512,
        num_experts_per_tok=8,
        num_experts=256,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        layer_types=None,
        pad_token_id=None,
        bos_token_id=None,
        eos_token_id=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.head_dim = head_dim
        self.rope_parameters = rope_parameters
        kwargs.setdefault("partial_rotary_factor", 0.25)

        self.layer_types = layer_types
        if self.layer_types is None:
            # Every `interval_pattern`-th layer (1-based) uses full attention;
            # the remaining layers use linear attention.
            interval_pattern = kwargs.get("full_attention_interval", 4)
            self.layer_types = [
                "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types, self.num_hidden_layers)

        self.linear_conv_kernel_dim = linear_conv_kernel_dim
        self.linear_key_head_dim = linear_key_head_dim
        self.linear_value_head_dim = linear_value_head_dim
        self.linear_num_key_heads = linear_num_key_heads
        self.linear_num_value_heads = linear_num_value_heads
        self.moe_intermediate_size = moe_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef

        # The transformers 4.x base constructor pops these four keys from its
        # kwargs and assigns its own defaults (None / None / None / True) when
        # they are absent. Assigning them on `self` beforehand is therefore
        # overwritten -- notably tie_word_embeddings would silently become
        # True -- so they must be passed through explicitly.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
||||||
|
|
||||||
|
|
||||||
|
class Qwen3_5MoeVisionConfig(PreTrainedConfig):
    """Vision-side hyper-parameters for Qwen3.5-MoE.

    After the base config is initialised from ``kwargs``, each named
    constructor argument is stored unchanged as an attribute of the same name.
    """

    model_type = "qwen3_5_moe"

    def __init__(
        self,
        depth=27,
        hidden_size=1152,
        hidden_act="gelu_pytorch_tanh",
        intermediate_size=4304,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        out_hidden_size=3584,
        num_position_embeddings=2304,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Attribute name -> value, listed in the order they are assigned.
        vision_fields = {
            "depth": depth,
            "hidden_size": hidden_size,
            "hidden_act": hidden_act,
            "intermediate_size": intermediate_size,
            "num_heads": num_heads,
            "in_channels": in_channels,
            "patch_size": patch_size,
            "spatial_merge_size": spatial_merge_size,
            "temporal_patch_size": temporal_patch_size,
            "out_hidden_size": out_hidden_size,
            "num_position_embeddings": num_position_embeddings,
            "initializer_range": initializer_range,
        }
        for field_name, field_value in vision_fields.items():
            setattr(self, field_name, field_value)
|
||||||
|
|
||||||
|
|
||||||
|
class Qwen3_5MoeConfig(PreTrainedConfig):
    r"""
    Top-level configuration for Qwen3.5-MoE / Qwen3.6-35B-A3B.
    model_type = "qwen3_5_moe" matches the model card / config.json.
    Wraps Qwen3_5MoeTextConfig (and optionally Qwen3_5MoeVisionConfig).
    For vLLM text-only inference only text_config is consumed.

    ``text_config`` and ``vision_config`` each accept a dict (expanded into
    the matching sub-config class), ``None`` (a default-constructed
    sub-config is used), or an already-built config object (stored as given).
    The four ``*_token_id`` arguments are stored as attributes of the same
    name. ``tie_word_embeddings`` is forwarded to the base-class constructor
    (see the comment at the ``super().__init__`` call).
    """

    model_type = "qwen3_5_moe"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=248056,
        video_token_id=248057,
        vision_start_token_id=248053,
        vision_end_token_id=248054,
        tie_word_embeddings=False,
        **kwargs,
    ):
        if isinstance(text_config, dict):
            self.text_config = Qwen3_5MoeTextConfig(**text_config)
        elif text_config is None:
            self.text_config = Qwen3_5MoeTextConfig()
        else:
            self.text_config = text_config

        if isinstance(vision_config, dict):
            self.vision_config = Qwen3_5MoeVisionConfig(**vision_config)
        elif vision_config is None:
            self.vision_config = Qwen3_5MoeVisionConfig()
        else:
            self.vision_config = vision_config

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.vision_end_token_id = vision_end_token_id

        # The transformers 4.x base constructor pops "tie_word_embeddings"
        # from its kwargs and defaults it to True when absent, which would
        # overwrite a value assigned on `self` beforehand. Forward it instead
        # so the caller's (or config.json's) setting is the one that sticks.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Qwen3_5MoeConfig", "Qwen3_5MoeTextConfig"]
|
||||||
Reference in New Issue
Block a user