sync from b7516
@@ -123,40 +123,6 @@ class TensorNameMap:
         MODEL_TENSOR.CONV1D: (
             "backbone.embed", # roberta
         ),
-
-        MODEL_TENSOR.V_MM_EMBEDDING: (
-            "model.embed_vision.embedding", # gemma3n
-        ),
-        MODEL_TENSOR.V_MM_HARD_EMB_NORM: (
-            "model.embed_vision.hard_embedding_norm", # gemma3n
-        ),
-        MODEL_TENSOR.V_MM_INP_PROJ: (
-            "model.embed_vision.embedding_projection", # gemma3n
-        ),
-        MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
-            "model.embed_vision.soft_embedding_norm", # gemma3n
-        ),
-        MODEL_TENSOR.V_ENC_CONV_STEM: (
-            "model.vision_tower.timm_model.conv_stem.conv", # gemma3n
-        ),
-        MODEL_TENSOR.V_ENC_CONV_STEM_NORM: (
-            "model.vision_tower.timm_model.conv_stem.bn", # gemma3n
-        ),
-        MODEL_TENSOR.V_ENC_MSFA_EXP: (
-            "model.vision_tower.timm_model.msfa.ffn.pw_exp.conv", # gemma3n
-        ),
-        MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: (
-            "model.vision_tower.timm_model.msfa.ffn.pw_exp.bn", # gemma3n
-        ),
-        MODEL_TENSOR.V_ENC_MSFA_PROJ: (
-            "model.vision_tower.timm_model.msfa.ffn.pw_proj.conv", # gemma3n
-        ),
-        MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: (
-            "model.vision_tower.timm_model.msfa.ffn.pw_proj.bn", # gemma3n
-        ),
-        MODEL_TENSOR.V_ENC_MSFA_NORM: (
-            "model.vision_tower.timm_model.msfa.norm", # gemma3n
-        ),
     }
 
     block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@@ -354,7 +320,6 @@ class TensorNameMap:
 
         MODEL_TENSOR.ATTN_SINKS: (
             "model.layers.{bid}.self_attn.sinks", # openai-moe
-            "model.layers.{bid}.self_attn.attention_sink_bias", # mimov2
         ),
 
         MODEL_TENSOR.ATTN_GATE: (
@@ -436,8 +401,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.expert_bias", # afmoe
             "model.layers.{bid}.feed_forward.expert_bias", # lfm2moe
             "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2
-            "backbone.layers.{bid}.mixer.gate.e_score_correction", # nemotron-h-moe
-            "model.layers.{bid}.mlp.e_score_correction", # exaone-moe
+            "backbone.layers.{bid}.mixer.gate.e_score_correction" # nemotron-h-moe
         ),
 
         # Feed-forward up
@@ -630,7 +594,6 @@ class TensorNameMap:
             "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
             "transformer.layers.{bid}.attn.q_norm", # openelm
             "model.layers.layers.{bid}.mixer.q", # plamo2
-            "model.layers.layers.{bid}.mixer.q_norm", # plamo3
             "layers.{bid}.self_attn.q_norm", # qwen3-embedding
             "model.layers.{bid}.attention.query_layernorm", # apertus
         ),
@@ -646,7 +609,6 @@ class TensorNameMap:
             "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
             "transformer.layers.{bid}.attn.k_norm", # openelm
             "model.layers.layers.{bid}.mixer.k", # plamo2
-            "model.layers.layers.{bid}.mixer.k_norm", # plamo3
             "layers.{bid}.self_attn.k_norm", # qwen3-embedding
             "model.layers.{bid}.attention.key_layernorm", # apertus
         ),
@@ -1256,7 +1218,6 @@ class TensorNameMap:
         MODEL_TENSOR.V_MMPROJ: (
             "multi_modal_projector.linear_{bid}",
             "visual.merger.mlp.{bid}", # qwen2vl
-            "merger.mlp.{bid}",
         ),
 
         MODEL_TENSOR.V_MMPROJ_FC: (
@@ -1294,7 +1255,6 @@ class TensorNameMap:
             "visual.patch_embed.proj", # qwen2vl
             "vision_tower.patch_embed.proj", # kimi-vl
             "model.vision.patch_embedding.proj", # cogvlm
-            "siglip2.vision_model.embeddings.patch_embedding",
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_NORM: (
@@ -1328,7 +1288,6 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
             "visual.blocks.{bid}.attn.q", # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
-            "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@@ -1346,7 +1305,6 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
             "visual.blocks.{bid}.attn.k", # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
-            "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@@ -1364,7 +1322,6 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
             "visual.blocks.{bid}.attn.v", # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
-            "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
         ),
 
         MODEL_TENSOR.V_ENC_INPUT_NORM: (
@@ -1379,7 +1336,6 @@ class TensorNameMap:
             "visual.blocks.{bid}.norm1", # qwen2vl
             "vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
             "model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
-            "siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1395,7 +1351,6 @@ class TensorNameMap:
             "visual.blocks.{bid}.attn.proj", # qwen2vl
             "vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
             "model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
-            "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
         ),
 
         MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1410,7 +1365,6 @@ class TensorNameMap:
             "visual.blocks.{bid}.norm2", # qwen2vl
             "vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
             "model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
-            "siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
         ),
 
         MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1426,7 +1380,6 @@ class TensorNameMap:
             "visual.blocks.{bid}.mlp.linear_fc1", # qwen3vl
             "vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
             "model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
-            "siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
         ),
 
         MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1448,7 +1401,6 @@ class TensorNameMap:
             "visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl
             "vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
             "model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
-            "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
         ),
 
         MODEL_TENSOR.V_LAYER_SCALE_1: (
@@ -1475,7 +1427,6 @@ class TensorNameMap:
             "visual.merger.ln_q", # qwen2vl
             "vision_tower.encoder.final_layernorm", # kimi-vl
             "visual.post_layernorm", # glm4v
-            "siglip2.vision_model.post_layernorm",
         ),
 
         MODEL_TENSOR.V_MM_POST_NORM: (
@@ -1492,7 +1443,6 @@ class TensorNameMap:
             "multi_modal_projector.pre_norm",
             "pre_mm_projector_norm",
             "model.vision.linear_proj.norm1", # cogvlm
-            "merger.ln_q",
         ),
 
         MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
@@ -1610,11 +1560,6 @@ class TensorNameMap:
         MODEL_TENSOR.A_ENC_CONV1D: (
             "audio_tower.conv{bid}", # ultravox
             "conformer.pre_encode.conv.{bid}", # lfm2
-            "model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
         ),
-
-        MODEL_TENSOR.A_ENC_CONV1D_NORM: (
-            "model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n
-        ),
 
         MODEL_TENSOR.A_PRE_NORM: (),
@@ -1627,64 +1572,40 @@ class TensorNameMap:
         MODEL_TENSOR.A_ENC_ATTN_Q: (
             "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
             "conformer.layers.{bid}.self_attn.linear_q", # lfm2
-            "conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
         ),
 
         MODEL_TENSOR.A_ENC_ATTN_K: (
             "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
             "conformer.layers.{bid}.self_attn.linear_k", # lfm2
-            "conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
         ),
 
         MODEL_TENSOR.A_ENC_ATTN_V: (
             "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
             "conformer.layers.{bid}.self_attn.linear_v", # lfm2
-            "conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
         ),
-
-        MODEL_TENSOR.A_ENC_PER_DIM_SCALE: (
-            "conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n
-        ),
-
-        MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: (
-            "conformer.layers.{bid}.norm", # gemma3n
-        ),
 
         MODEL_TENSOR.A_ENC_INPUT_NORM: (
             "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
             "conformer.layers.{bid}.norm_self_att", # lfm2
-            "conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
         ),
 
         MODEL_TENSOR.A_ENC_OUTPUT: (
             "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
             "conformer.layers.{bid}.self_attn.linear_out", # lfm2
-            "conformer.layers.{bid}.attention.post", # gemma3n
         ),
 
         MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
             "audio_tower.layers.{bid}.final_layer_norm", # ultravox
             "conformer.layers.{bid}.norm_out", # lfm2
-            "conformer.layers.{bid}.attention.post_norm", # gemma3n
         ),
 
         MODEL_TENSOR.A_ENC_FFN_NORM: (
             "conformer.layers.{bid}.norm_feed_forward1", # lfm2
-            "conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
         ),
-
-        MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
-            "conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n
-        ),
-
-        MODEL_TENSOR.A_ENC_FFN_SCALE: (
-            "conformer.layers.{bid}.ffw_layer_start.post_layer_scale", # gemma3n
-        ),
 
         MODEL_TENSOR.A_ENC_FFN_UP: (
             "audio_tower.layers.{bid}.fc1", # ultravox
             "conformer.layers.{bid}.feed_forward1.linear1", # lfm2
-            "conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
         ),
 
         MODEL_TENSOR.A_ENC_FFN_GATE: (),
@@ -1692,35 +1613,22 @@ class TensorNameMap:
         MODEL_TENSOR.A_ENC_FFN_DOWN: (
             "audio_tower.layers.{bid}.fc2", # ultravox
             "conformer.layers.{bid}.feed_forward1.linear2", # lfm2
-            "conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
         ),
 
         MODEL_TENSOR.A_ENC_FFN_UP_1: (
             "conformer.layers.{bid}.feed_forward2.linear1", # lfm2
-            "conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
         ),
 
         MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
             "conformer.layers.{bid}.feed_forward2.linear2", # lfm2
-            "conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
         ),
 
         MODEL_TENSOR.A_ENC_FFN_NORM_1: (
             "conformer.layers.{bid}.norm_feed_forward2", # lfm2
-            "conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
         ),
-
-        MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
-            "conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n
-        ),
-
-        MODEL_TENSOR.A_ENC_FFN_SCALE_1: (
-            "conformer.layers.{bid}.ffw_layer_end.post_layer_scale", # gemma3n
-        ),
 
         MODEL_TENSOR.A_ENC_LINEAR_POS: (
             "conformer.layers.{bid}.self_attn.linear_pos", # lfm2
-            "conformer.layers.{bid}.attention.attn.relative_position_embedding.pos_proj", # gemma3n
         ),
 
         MODEL_TENSOR.A_ENC_POS_BIAS_U: (
@@ -1733,7 +1641,6 @@ class TensorNameMap:
 
         MODEL_TENSOR.A_ENC_OUT: (
             "conformer.pre_encode.out", # lfm2
-            "model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n
         ),
 
         # note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
@@ -1759,43 +1666,25 @@ class TensorNameMap:
 
         MODEL_TENSOR.A_ENC_CONV_DW: (
             "conformer.layers.{bid}.conv.depthwise_conv", # lfm2
-            "conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
         ),
 
         MODEL_TENSOR.A_ENC_CONV_NORM: (
             "conformer.layers.{bid}.conv.batch_norm", # lfm2
-            "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
         ),
 
         MODEL_TENSOR.A_ENC_CONV_PW1: (
             "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
-            "conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
         ),
 
         MODEL_TENSOR.A_ENC_CONV_PW2: (
             "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
-            "conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
         ),
 
         MODEL_TENSOR.A_ENC_NORM_CONV: (
             "conformer.layers.{bid}.norm_conv", # lfm2
-            "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
         ),
-
-        MODEL_TENSOR.A_MM_EMBEDDING: (
-            "model.embed_audio.embedding", # gemma3n
-        ),
-        MODEL_TENSOR.A_MM_HARD_EMB_NORM: (
-            "model.embed_audio.hard_embedding_norm", # gemma3n
-        ),
-        MODEL_TENSOR.A_MM_INP_PROJ: (
-            "model.embed_audio.embedding_projection", # gemma3n
-        ),
-        MODEL_TENSOR.A_MM_SOFT_EMB_NORM: (
-            "model.embed_audio.soft_embedding_norm", # gemma3n
-        ),
 
-        # NextN/MTP tensors
+        # NextN/MTP tensors for GLM4_MOE
         MODEL_TENSOR.NEXTN_EH_PROJ: (
             "model.layers.{bid}.eh_proj",
         ),
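
Note on how these mappings are consumed: entries in block_mappings_cfg are keyed by MODEL_TENSOR and list candidate source-model tensor names containing a {bid} block-index placeholder; during conversion, gguf-py expands each candidate per block and matches it against the checkpoint's tensor names. Below is a minimal sketch of that expansion, assuming a simplified stand-in for TensorNameMap (ModelTensor, BLOCK_MAPPINGS, and build_lookup are illustrative names, not the real gguf-py API):

from enum import Enum, auto

class ModelTensor(Enum):
    ATTN_SINKS = auto()

# Candidate source names per tensor type, mirroring block_mappings_cfg above.
BLOCK_MAPPINGS: dict[ModelTensor, tuple[str, ...]] = {
    ModelTensor.ATTN_SINKS: (
        "model.layers.{bid}.self_attn.sinks",  # openai-moe
    ),
}

def build_lookup(n_blocks: int) -> dict[str, ModelTensor]:
    # Expand every {bid} template once per block index, so lookups during
    # conversion are a plain dict access.
    lookup: dict[str, ModelTensor] = {}
    for tensor, templates in BLOCK_MAPPINGS.items():
        for template in templates:
            for bid in range(n_blocks):
                lookup[template.format(bid=bid)] = tensor
    return lookup

lookup = build_lookup(n_blocks=2)
assert lookup["model.layers.1.self_attn.sinks"] is ModelTensor.ATTN_SINKS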