sync from b7516

This commit is contained in:
2026-01-16 11:16:14 +08:00
parent f4ae4cc7da
commit 6ee41dd9e3
380 changed files with 18435 additions and 38806 deletions

View File

@@ -123,40 +123,6 @@ class TensorNameMap:
MODEL_TENSOR.CONV1D: (
"backbone.embed", # roberta
),
MODEL_TENSOR.V_MM_EMBEDDING: (
"model.embed_vision.embedding", # gemma3n
),
MODEL_TENSOR.V_MM_HARD_EMB_NORM: (
"model.embed_vision.hard_embedding_norm", # gemma3n
),
MODEL_TENSOR.V_MM_INP_PROJ: (
"model.embed_vision.embedding_projection", # gemma3n
),
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
"model.embed_vision.soft_embedding_norm", # gemma3n
),
MODEL_TENSOR.V_ENC_CONV_STEM: (
"model.vision_tower.timm_model.conv_stem.conv", # gemma3n
),
MODEL_TENSOR.V_ENC_CONV_STEM_NORM: (
"model.vision_tower.timm_model.conv_stem.bn", # gemma3n
),
MODEL_TENSOR.V_ENC_MSFA_EXP: (
"model.vision_tower.timm_model.msfa.ffn.pw_exp.conv", # gemma3n
),
MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: (
"model.vision_tower.timm_model.msfa.ffn.pw_exp.bn", # gemma3n
),
MODEL_TENSOR.V_ENC_MSFA_PROJ: (
"model.vision_tower.timm_model.msfa.ffn.pw_proj.conv", # gemma3n
),
MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: (
"model.vision_tower.timm_model.msfa.ffn.pw_proj.bn", # gemma3n
),
MODEL_TENSOR.V_ENC_MSFA_NORM: (
"model.vision_tower.timm_model.msfa.norm", # gemma3n
),
}
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@@ -354,7 +320,6 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_SINKS: (
"model.layers.{bid}.self_attn.sinks", # openai-moe
"model.layers.{bid}.self_attn.attention_sink_bias", # mimov2
),
MODEL_TENSOR.ATTN_GATE: (
@@ -436,8 +401,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.expert_bias", # afmoe
"model.layers.{bid}.feed_forward.expert_bias", # lfm2moe
"model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2
"backbone.layers.{bid}.mixer.gate.e_score_correction", # nemotron-h-moe
"model.layers.{bid}.mlp.e_score_correction", # exaone-moe
"backbone.layers.{bid}.mixer.gate.e_score_correction" # nemotron-h-moe
),
# Feed-forward up
@@ -630,7 +594,6 @@ class TensorNameMap:
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
"transformer.layers.{bid}.attn.q_norm", # openelm
"model.layers.layers.{bid}.mixer.q", # plamo2
"model.layers.layers.{bid}.mixer.q_norm", # plamo3
"layers.{bid}.self_attn.q_norm", # qwen3-embedding
"model.layers.{bid}.attention.query_layernorm", # apertus
),
@@ -646,7 +609,6 @@ class TensorNameMap:
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
"transformer.layers.{bid}.attn.k_norm", # openelm
"model.layers.layers.{bid}.mixer.k", # plamo2
"model.layers.layers.{bid}.mixer.k_norm", # plamo3
"layers.{bid}.self_attn.k_norm", # qwen3-embedding
"model.layers.{bid}.attention.key_layernorm", # apertus
),
@@ -1256,7 +1218,6 @@ class TensorNameMap:
MODEL_TENSOR.V_MMPROJ: (
"multi_modal_projector.linear_{bid}",
"visual.merger.mlp.{bid}", # qwen2vl
"merger.mlp.{bid}",
),
MODEL_TENSOR.V_MMPROJ_FC: (
@@ -1294,7 +1255,6 @@ class TensorNameMap:
"visual.patch_embed.proj", # qwen2vl
"vision_tower.patch_embed.proj", # kimi-vl
"model.vision.patch_embedding.proj", # cogvlm
"siglip2.vision_model.embeddings.patch_embedding",
),
MODEL_TENSOR.V_ENC_EMBD_NORM: (
@@ -1328,7 +1288,6 @@ class TensorNameMap:
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
"vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
"siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
),
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@@ -1346,7 +1305,6 @@ class TensorNameMap:
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
"vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
"siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
),
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@@ -1364,7 +1322,6 @@ class TensorNameMap:
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
"visual.blocks.{bid}.attn.v", # qwen2vl, generated
"vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
"siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
),
MODEL_TENSOR.V_ENC_INPUT_NORM: (
@@ -1379,7 +1336,6 @@ class TensorNameMap:
"visual.blocks.{bid}.norm1", # qwen2vl
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
),
MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1395,7 +1351,6 @@ class TensorNameMap:
"visual.blocks.{bid}.attn.proj", # qwen2vl
"vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
"model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
),
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1410,7 +1365,6 @@ class TensorNameMap:
"visual.blocks.{bid}.norm2", # qwen2vl
"vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
),
MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1426,7 +1380,6 @@ class TensorNameMap:
"visual.blocks.{bid}.mlp.linear_fc1", # qwen3vl
"vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
),
MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1448,7 +1401,6 @@ class TensorNameMap:
"visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl
"vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
),
MODEL_TENSOR.V_LAYER_SCALE_1: (
@@ -1475,7 +1427,6 @@ class TensorNameMap:
"visual.merger.ln_q", # qwen2vl
"vision_tower.encoder.final_layernorm", # kimi-vl
"visual.post_layernorm", # glm4v
"siglip2.vision_model.post_layernorm",
),
MODEL_TENSOR.V_MM_POST_NORM: (
@@ -1492,7 +1443,6 @@ class TensorNameMap:
"multi_modal_projector.pre_norm",
"pre_mm_projector_norm",
"model.vision.linear_proj.norm1", # cogvlm
"merger.ln_q",
),
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
@@ -1610,11 +1560,6 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_CONV1D: (
"audio_tower.conv{bid}", # ultravox
"conformer.pre_encode.conv.{bid}", # lfm2
"model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
),
MODEL_TENSOR.A_ENC_CONV1D_NORM: (
"model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n
),
MODEL_TENSOR.A_PRE_NORM: (),
@@ -1627,64 +1572,40 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_ATTN_Q: (
"audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
"conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
),
MODEL_TENSOR.A_ENC_ATTN_K: (
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
"conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
),
MODEL_TENSOR.A_ENC_ATTN_V: (
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
"conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
),
MODEL_TENSOR.A_ENC_PER_DIM_SCALE: (
"conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n
),
MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: (
"conformer.layers.{bid}.norm", # gemma3n
),
MODEL_TENSOR.A_ENC_INPUT_NORM: (
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
"conformer.layers.{bid}.norm_self_att", # lfm2
"conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
),
MODEL_TENSOR.A_ENC_OUTPUT: (
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
"conformer.layers.{bid}.attention.post", # gemma3n
),
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
"conformer.layers.{bid}.norm_out", # lfm2
"conformer.layers.{bid}.attention.post_norm", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_NORM: (
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
"conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
"conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_SCALE: (
"conformer.layers.{bid}.ffw_layer_start.post_layer_scale", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_UP: (
"audio_tower.layers.{bid}.fc1", # ultravox
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_GATE: (),
@@ -1692,35 +1613,22 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_FFN_DOWN: (
"audio_tower.layers.{bid}.fc2", # ultravox
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_UP_1: (
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_NORM_1: (
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
"conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
"conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_SCALE_1: (
"conformer.layers.{bid}.ffw_layer_end.post_layer_scale", # gemma3n
),
MODEL_TENSOR.A_ENC_LINEAR_POS: (
"conformer.layers.{bid}.self_attn.linear_pos", # lfm2
"conformer.layers.{bid}.attention.attn.relative_position_embedding.pos_proj", # gemma3n
),
MODEL_TENSOR.A_ENC_POS_BIAS_U: (
@@ -1733,7 +1641,6 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_OUT: (
"conformer.pre_encode.out", # lfm2
"model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n
),
# note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
@@ -1759,43 +1666,25 @@ class TensorNameMap:
MODEL_TENSOR.A_ENC_CONV_DW: (
"conformer.layers.{bid}.conv.depthwise_conv", # lfm2
"conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
),
MODEL_TENSOR.A_ENC_CONV_NORM: (
"conformer.layers.{bid}.conv.batch_norm", # lfm2
"conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
),
MODEL_TENSOR.A_ENC_CONV_PW1: (
"conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
"conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
),
MODEL_TENSOR.A_ENC_CONV_PW2: (
"conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
"conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
),
MODEL_TENSOR.A_ENC_NORM_CONV: (
"conformer.layers.{bid}.norm_conv", # lfm2
"conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
),
MODEL_TENSOR.A_MM_EMBEDDING: (
"model.embed_audio.embedding", # gemma3n
),
MODEL_TENSOR.A_MM_HARD_EMB_NORM: (
"model.embed_audio.hard_embedding_norm", # gemma3n
),
MODEL_TENSOR.A_MM_INP_PROJ: (
"model.embed_audio.embedding_projection", # gemma3n
),
MODEL_TENSOR.A_MM_SOFT_EMB_NORM: (
"model.embed_audio.soft_embedding_norm", # gemma3n
),
# NextN/MTP tensors
# NextN/MTP tensors for GLM4_MOE
MODEL_TENSOR.NEXTN_EH_PROJ: (
"model.layers.{bid}.eh_proj",
),