[MM][Bugfix] Update hf_config to hf_text_config (#5319)

### What this PR does / why we need it?

Following https://github.com/vllm-project/vllm-ascend/pull/5205, update `hf_config` to `hf_text_config`. Find more details at https://github.com/vllm-project/vllm-ascend/pull/5205#issuecomment-3675417534 and https://github.com/vllm-project/vllm-ascend/pull/5205#issuecomment-3677920872.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main: 5fbfa8d9ef

Signed-off-by: shen-shanshan <467638484@qq.com>
2026-01-06 16:41:39 +08:00
parent 293b2275df
commit b94d589769
23 changed files with 44 additions and 43 deletions
--- a/tests/e2e/multicard/test_aclgraph_capture_replay.py
+++ b/tests/e2e/multicard/test_aclgraph_capture_replay.py
@@ -114,7 +114,7 @@ def _run_worker_process(

        # Expose model config to the main test process
        counters["hidden_layers"].value = (
-            llm.llm_engine.model_config.hf_config.num_hidden_layers)
+            llm.llm_engine.model_config.hf_text_config.num_hidden_layers)

        llm.generate(local_prompts,
                     SamplingParams(max_tokens=max_tokens, temperature=0.0))
--- a/tests/ut/core/test_scheduler_dynamic_batch.py
+++ b/tests/ut/core/test_scheduler_dynamic_batch.py
@@ -130,8 +130,8 @@ class TestSchedulerDynamicBatch(TestBase):
        )
        model_config.pooler_config = MagicMock()
        model_config.multimodal_config = MagicMock()
-        model_config.hf_config = MagicMock()
-        model_config.hf_config.is_encoder_decoder = False
+        model_config.hf_text_config = MagicMock()
+        model_config.hf_text_config.is_encoder_decoder = False
        # Cache config, optionally force APC
        kwargs_cache: Dict[str,
                           Any] = ({} if ENABLE_PREFIX_CACHING is None else {
--- a/tests/ut/ops/test_mla.py
+++ b/tests/ut/ops/test_mla.py
@@ -87,7 +87,7 @@ class TestAscendMultiHeadLatentAttention(TestBase):
            mock_tp_size.return_value = 2
            mock_ascend_config.return_value.enable_shared_expert_dp = True
            mock_vllm_config = MagicMock(spec=VllmConfig)
-            mock_vllm_config.model_config.hf_config = MagicMock(
+            mock_vllm_config.model_config.hf_text_config = MagicMock(
                num_hidden_layers=32, first_k_dense_replace=True)
            mock_get_vllm_config.return_value = mock_vllm_config
            mock_vllm_config.compilation_config = CompilationConfig()
@@ -122,7 +122,7 @@ class TestAscendMultiHeadLatentAttention(TestBase):
        mock_tp_size.return_value = 1
        mock_ascend_config.return_value.enable_shared_expert_dp = False
        mock_vllm_config = MagicMock(spec=VllmConfig)
-        mock_vllm_config.model_config.hf_config = MagicMock(
+        mock_vllm_config.model_config.hf_text_config = MagicMock(
            num_hidden_layers=32, first_k_dense_replace=False)
        mock_get_vllm_config.return_value = mock_vllm_config
        mock_vllm_config.compilation_config = CompilationConfig()
--- a/tests/ut/ops/test_rotary_embedding.py
+++ b/tests/ut/ops/test_rotary_embedding.py
@@ -115,7 +115,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
        model_config = ModelConfig(MODEL,
                                   tokenizer=MODEL,
                                   max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
        vllm_config.model_config = model_config
        with set_ascend_forward_context(None, vllm_config):
            result_q, result_k = self.layer.forward(self.positions, self.query,
@@ -141,7 +141,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
        model_config = ModelConfig(MODEL,
                                   tokenizer=MODEL,
                                   max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
        vllm_config.model_config = model_config
        with set_ascend_forward_context(None, vllm_config):
            result_q, result_k = self.layer.forward(self.positions,
@@ -164,7 +164,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
            model_config = ModelConfig(MODEL,
                                       tokenizer=MODEL,
                                       max_model_len=MAX_NUM_BATCHED_TOKEND)
-            model_config.hf_config = PretrainedConfig()
+            model_config.hf_text_config = PretrainedConfig()
            vllm_config.model_config = model_config
            with set_ascend_forward_context(None, vllm_config):
                self.layer.forward(self.positions, self.query, self.key,
@@ -184,7 +184,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
        model_config = ModelConfig(MODEL,
                                   tokenizer=MODEL,
                                   max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
        vllm_config.model_config = model_config
        with set_ascend_forward_context(None, vllm_config):
            result_q, result_k = self.layer.forward(
@@ -213,7 +213,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
        model_config = ModelConfig(MODEL,
                                   tokenizer=MODEL,
                                   max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
        vllm_config.model_config = model_config
        with set_ascend_forward_context(None, vllm_config):
            result_q, result_k = self.layer.forward(self.positions, self.query,
@@ -404,7 +404,7 @@ class TestAscendMRotaryEmbedding(unittest.TestCase):
        model_config = ModelConfig(MODEL_VL,
                                   tokenizer=MODEL_VL,
                                   max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
        vllm_config.model_config = model_config
        return vllm_config

--- a/tests/ut/quantization/test_quant_config.py
+++ b/tests/ut/quantization/test_quant_config.py
@@ -79,7 +79,7 @@ class TestAscendQuantConfig(TestBase):

    def test_get_quant_method_for_linear(self):
        mock_config = MagicMock()
-        mock_config.model_config.hf_config.model_type = None
+        mock_config.model_config.hf_text_config.model_type = None
        linear_layer = MagicMock(spec=LinearBase)
        # Test skipped layer
        with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
@@ -103,7 +103,7 @@ class TestAscendQuantConfig(TestBase):
    def test_get_quant_method_for_attention(self):
        attention_layer = MagicMock(spec=Attention)
        mock_config = MagicMock()
-        mock_config.model_config.hf_config.model_type = None
+        mock_config.model_config.hf_text_config.model_type = None
        with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
            patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
                   return_value=MagicMock()) as mock_ascend_kvcache:
@@ -117,7 +117,7 @@ class TestAscendQuantConfig(TestBase):
        fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
        fused_moe_layer.moe_config = MagicMock(spec=FusedMoEConfig)
        mock_config = MagicMock()
-        mock_config.model_config.hf_config.model_type = None
+        mock_config.model_config.hf_text_config.model_type = None

        # Test skipped layer
        with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=True), \
--- a/tests/ut/spec_decode/test_mtp_proposer.py
+++ b/tests/ut/spec_decode/test_mtp_proposer.py
@@ -41,7 +41,7 @@ class TestMtpProposer:
        config.model_config.dtype = torch.float16
        config.model_config.max_model_len = 2048
        config.model_config.uses_mrope = False
-        config.model_config.hf_config = None
+        config.model_config.hf_text_config = None

        config.load_config = None

--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -96,7 +96,7 @@ class AscendConfig:
                try:
                    # only support Qwen model now
                    # TODO: use a more robust method to get kv_head_num
-                    num_kv_head = vllm_config.model_config.hf_config.num_key_value_heads
+                    num_kv_head = vllm_config.model_config.hf_text_config.num_key_value_heads
                    self.num_head_replica = prefill_tp_size // num_kv_head if prefill_tp_size >= num_kv_head else 1
                    prefill_tp_size = min(prefill_tp_size, num_kv_head)
                    decode_tp_size = min(decode_tp_size, num_kv_head)
@@ -126,7 +126,7 @@ class AscendConfig:

        self.enable_kv_nz = additional_config.get("enable_kv_nz", False)
        if self.enable_kv_nz:
-            use_sparse = hasattr(vllm_config.model_config.hf_config,
+            use_sparse = hasattr(vllm_config.model_config.hf_text_config,
                                 "index_topk")
            if not vllm_config.model_config.is_deepseek_mla or use_sparse:
                raise RuntimeError(
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -224,8 +224,8 @@ def select_moe_comm_method(num_tokens: int,
    mc2_tokens_capacity = get_mc2_tokens_capacity()
    soc_version = get_ascend_device_type()
    quant_type = getattr(
-        vllm_config.model_config.hf_config, 'moe_quantize',
-        getattr(vllm_config.model_config.hf_config, 'quantize', None))
+        vllm_config.model_config.hf_text_config, 'moe_quantize',
+        getattr(vllm_config.model_config.hf_text_config, 'quantize', None))

    if not vllm_config.parallel_config.enable_expert_parallel or get_ep_group(
    ).world_size == 1:
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -149,7 +149,7 @@ class AscendSFAMetadataBuilder(MLACommonMetadataBuilder[AscendSFAMetadata]):

        self.rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
        self.enable_sfa_cp = enable_sp() and \
-            hasattr(self.model_config.hf_config, "index_topk")
+            hasattr(self.model_config.hf_text_config, "index_topk")

        assert not (
            self.enable_sfa_cp
@@ -963,7 +963,7 @@ class AscendSFAImpl(MLAAttentionImpl):
        # Dispose tensor from the original o_proj
        dispose_layer(self.o_proj)
        # Construct the new o_proj using ReplicatedLinear
-        config = vllm_config.model_config.hf_config
+        config = vllm_config.model_config.hf_text_config
        new_o_proj = ReplicatedLinear(config.num_attention_heads *
                                      config.v_head_dim,
                                      config.hidden_size,
--- a/vllm_ascend/distributed/kvpool/pool_worker.py
+++ b/vllm_ascend/distributed/kvpool/pool_worker.py
@@ -96,7 +96,7 @@ class KVPoolWorker:

        partitions = None
        if self.kv_role == "kv_consumer" and self.consumer_is_to_put:
-            num_hidden_layers = model_config.hf_config.num_hidden_layers
+            num_hidden_layers = model_config.hf_text_config.num_hidden_layers
            partition_list_str = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
                "prefill_pp_layer_partition", None)
            prefill_pp_size = int(
--- a/vllm_ascend/distributed/mooncake_connector.py
+++ b/vllm_ascend/distributed/mooncake_connector.py
@@ -345,7 +345,7 @@ class KVCacheRecvingThread(threading.Thread):
        self.vllm_config = vllm_config
        self.model_config = self.vllm_config.model_config
        self.block_size = self.vllm_config.cache_config.block_size
-        self.num_layers = self.model_config.hf_config.num_hidden_layers
+        self.num_layers = self.model_config.hf_text_config.num_hidden_layers
        self.pp_layer_indices = {
            rank:
            get_prefill_pp_indices(self.num_layers, rank,
--- a/vllm_ascend/distributed/parallel_state.py
+++ b/vllm_ascend/distributed/parallel_state.py
@@ -167,7 +167,7 @@ def init_ascend_model_parallel(parallel_config: ParallelConfig, ):

    global _SHARED_WEIGHT
    # TODO: Check if the model is Deepseek V3.2 with enabled SFA CP and activated shared weights. It will then be normalized within the PCP parameters. -- clrs97
-    is_ds_v32 = hasattr(vllm_config.model_config.hf_config, "index_topk")
+    is_ds_v32 = hasattr(vllm_config.model_config.hf_text_config, "index_topk")
    if enable_sp() and is_ds_v32 and _SHARED_WEIGHT is None:
        _SHARED_WEIGHT = _create_shared_weight_group("CP_shared_weight")
    # TODO: Extract and unify the logic across different communication group.
--- a/vllm_ascend/eplb/utils.py
+++ b/vllm_ascend/eplb/utils.py
@@ -69,7 +69,7 @@ def model_register(model, model_config):
    model.get_all_moe_loads = types.MethodType(get_all_moe_loads, model)
    model.clear_all_moe_loads = types.MethodType(clear_all_moe_loads, model)

-    config = model_config.hf_config
+    config = model_config.hf_text_config

    if config.model_type == "qwen3_moe":
        model.num_moe_layers = config.num_hidden_layers
--- a/vllm_ascend/ops/linear_op.py
+++ b/vllm_ascend/ops/linear_op.py
@@ -697,7 +697,7 @@ def is_moe_layer(prefix: str) -> bool:
    def get_moe_params():
        from vllm.config import get_current_vllm_config
        vllm_config = get_current_vllm_config()
-        config = vllm_config.model_config.hf_config
+        config = vllm_config.model_config.hf_text_config
        n_routed_experts = getattr(config, 'n_routed_experts', 0)
        first_k_dense_replace = getattr(config, 'first_k_dense_replace',
                                        float('inf'))
--- a/vllm_ascend/ops/mla.py
+++ b/vllm_ascend/ops/mla.py
@@ -91,7 +91,7 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.prefix = prefix
-        hf_config = get_current_vllm_config().model_config.hf_config
+        hf_config = get_current_vllm_config().model_config.hf_text_config
        self.enable_shared_expert_dp = get_ascend_config(
        ).enable_shared_expert_dp
        self.tp_size = get_tensor_model_parallel_world_size()
--- a/vllm_ascend/ops/shared_weight_layer.py
+++ b/vllm_ascend/ops/shared_weight_layer.py
@@ -247,6 +247,6 @@ def reach_layer_for_shared_weight_series(layer: LinearBase):


 def is_hidden_layer(vllm_config, layer: LinearBase) -> bool:
-    num_hidden_layers = vllm_config.model_config.hf_config.num_hidden_layers
+    num_hidden_layers = vllm_config.model_config.hf_text_config.num_hidden_layers
    layer_idx = extract_layer_index(layer.prefix)
    return layer_idx < num_hidden_layers
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -174,7 +174,8 @@ class NPUPlatform(Platform):
                         ) if not isinstance(ascend_compilation_config, dict)
                    else ascend_compilation_config)

-        elif model_config and hasattr(model_config.hf_config, "index_topk"):
+        elif model_config and hasattr(model_config.hf_text_config,
+                                      "index_topk"):
            vllm_config.cache_config.cache_dtype = str(
                model_config.dtype).replace("torch.", "")
        if model_config is None:
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -116,7 +116,7 @@ class AscendQuantConfig(QuantizationConfig):
    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional["QuantizeMethodBase"]:
        vllm_config = get_current_vllm_config()
-        model_type = vllm_config.model_config.hf_config.model_type
+        model_type = vllm_config.model_config.hf_text_config.model_type
        if model_type in packed_modules_model_mapping:
            self.packed_modules_mapping = packed_modules_model_mapping[
                model_type]
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -90,7 +90,7 @@ class EagleProposer(VllmEagleProposer):
            self.runner.max_num_tokens * self.pcp_size * self.dcp_size +
            self.pcp_size * self.dcp_size * self.runner.max_num_reqs)

-        self.use_sparse = hasattr(vllm_config.model_config.hf_config,
+        self.use_sparse = hasattr(vllm_config.model_config.hf_text_config,
                                  "index_topk")

    def _get_eagle3_use_aux_hidden_state_from_config(self) -> bool:
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -468,7 +468,7 @@ def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
    # on special shapes.
    # TODO(Angazenn): we will remove this once _npu_paged_attention is fully
    # replaced by npu_fused_infer_attention_score which does not contain such bugs.
-    if vllm_config.model_config and vllm_config.model_config.hf_config.model_type == "qwen3_moe" \
+    if vllm_config.model_config and vllm_config.model_config.hf_text_config.model_type == "qwen3_moe" \
        and vllm_config.parallel_config.tensor_parallel_size == 1 \
        and vllm_config.parallel_config.data_parallel_size > 1 :

@@ -503,7 +503,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
        )

        return
-    hf_config = vllm_config.model_config.hf_config
+    hf_config = vllm_config.model_config.hf_text_config
    if hasattr(hf_config, 'num_hidden_layers'):
        num_hidden_layers = hf_config.num_hidden_layers
    else:
@@ -826,7 +826,7 @@ def is_moe_model(vllm_config: VllmConfig):
    """Checks if the model is a MoE model by config"""
    global _IS_MOE_MODEL
    if _IS_MOE_MODEL is None:
-        model_configs = vllm_config.model_config.hf_config.to_dict()
+        model_configs = vllm_config.model_config.hf_text_config.to_dict()
        _IS_MOE_MODEL = _is_contain_expert(model_configs)
    return _IS_MOE_MODEL

@@ -842,7 +842,7 @@ def speculative_enable_dispatch_gmm_combine_decode(
    if speculative_method in ["eagle", "eagle3"]:
        return False
    if speculative_method == "mtp":
-        mtp_quant_type = getattr(vllm_config.model_config.hf_config,
+        mtp_quant_type = getattr(vllm_config.model_config.hf_text_config,
                                 "mtp_quantize", None)
        return mtp_quant_type == "w8a8_dynamic"
    return False
@@ -875,7 +875,7 @@ def has_rope(vllm_config: VllmConfig):
    """Checks if the model uses rope."""
    global _HAS_ROPE
    if _HAS_ROPE is None and vllm_config and vllm_config.model_config:
-        hf_config = vllm_config.model_config.hf_config.to_dict()
+        hf_config = vllm_config.model_config.hf_text_config.to_dict()
        _HAS_ROPE = "rope_parameters" in hf_config
    return _HAS_ROPE

@@ -1091,7 +1091,7 @@ def refresh_block_size(vllm_config):
        return

    # TODO(MengqingCao): Remove the model_type check, after resolving the hidden error in get_kv_cache_groups.
-    if not model_config.hf_config.model_type == "qwen3_next" and cache_config.block_size != 128:
+    if not model_config.hf_text_config.model_type == "qwen3_next" and cache_config.block_size != 128:
        if cache_config.enable_prefix_caching or scheduler_config.enable_chunked_prefill:
            logger.info(
                "Block size is set to 128 if prefix cache or chunked prefill is enabled."
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -253,7 +253,7 @@ class NPUModelRunner(GPUModelRunner):
        self.is_multimodal_model = self.model_config.is_multimodal_model
        self.block_size = vllm_config.cache_config.block_size
        # Set up Attention
-        self.use_sparse = hasattr(self.vllm_config.model_config.hf_config,
+        self.use_sparse = hasattr(self.vllm_config.model_config.hf_text_config,
                                  "index_topk")
        self.attn_backend = get_attn_backend(
            0,
@@ -2398,7 +2398,7 @@ class NPUModelRunner(GPUModelRunner):
            kv_caches[layer_name] = kv_caches[target_layer_name]

        from vllm.v1.worker.utils import bind_kv_cache
-        num_attn_module = 2 if self.model_config.hf_config.model_type == "longcat_flash" else 1
+        num_attn_module = 2 if self.model_config.hf_text_config.model_type == "longcat_flash" else 1
        bind_kv_cache(kv_caches,
                      self.compilation_config.static_forward_context,
                      self.kv_caches, num_attn_module)
@@ -2932,7 +2932,7 @@ class NPUModelRunner(GPUModelRunner):
        mamba_layers = get_layers_from_vllm_config(self.vllm_config, MambaBase)
        if len(mamba_layers) > 0:
            if (self.vllm_config.speculative_config is not None
-                    and self.vllm_config.model_config.hf_config.model_type
+                    and self.vllm_config.model_config.hf_text_config.model_type
                    not in ["qwen3_next"]):
                raise NotImplementedError(
                    "Mamba with speculative decoding is not supported yet.")
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -173,7 +173,7 @@ class NPUWorker(WorkerBase):
        allocator = CaMemAllocator.get_instance()
        allocator.wake_up(tags=tags)

-        hidden_size = self.vllm_config.model_config.hf_config.hidden_size
+        hidden_size = self.vllm_config.model_config.hf_text_config.hidden_size
        model = self.model_runner.model
        if tags is None or "weights" in tags:
            for name, param in model.named_parameters():
--- a/vllm_ascend/xlite/xlite.py
+++ b/vllm_ascend/xlite/xlite.py
@@ -61,7 +61,7 @@ class LlamaXliteModel(XliteModel):
        xlite_model.embed = params_dict.get(model_prefix +
                                            "model.embed_tokens.weight")
        xlite_model.norm = params_dict.get(model_prefix + "model.norm.weight")
-        if vllm_config.model_config.hf_config.tie_word_embeddings:
+        if vllm_config.model_config.hf_text_config.tie_word_embeddings:
            xlite_model.head = xlite_model.embed
        else:
            xlite_model.head = params_dict.get(model_prefix + "lm_head.weight")
@@ -118,7 +118,7 @@ class LlamaXliteModel(XliteModel):
        return (xlite_model, freq_cis, config.hidden_size, dtype)

    def _build_model_config(self, vllm_config: VllmConfig) -> ModelConfig:
-        hf_config = vllm_config.model_config.hf_config
+        hf_config = vllm_config.model_config.hf_text_config
        if hasattr(hf_config, "text_config"):
            hf_config = hf_config.text_config
        config = ModelConfig()