From b94d5897691bb4f7cb49dca57e580f7bf4127cae Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Tue, 6 Jan 2026 16:41:39 +0800
Subject: [PATCH] [MM][Bugfix] Update `hf_config` to `hf_text_config` (#5319)

### What this PR does / why we need it?
Following https://github.com/vllm-project/vllm-ascend/pull/5205, update the remaining `hf_config` accesses to `hf_text_config`, so that text-config attributes (e.g. `num_hidden_layers`, `model_type`) resolve correctly for multimodal models whose composite config nests them. See https://github.com/vllm-project/vllm-ascend/pull/5205#issuecomment-3675417534 and https://github.com/vllm-project/vllm-ascend/pull/5205#issuecomment-3677920872 for more details; an illustrative sketch of the `hf_config` vs. `hf_text_config` distinction is appended after the diff.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
- vLLM version: release/v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/5fbfa8d9ef15948599631baeb91e8220b2ee9bcc

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 tests/e2e/multicard/test_aclgraph_capture_replay.py |  2 +-
 tests/ut/core/test_scheduler_dynamic_batch.py       |  4 ++--
 tests/ut/ops/test_mla.py                            |  4 ++--
 tests/ut/ops/test_rotary_embedding.py               | 12 ++++++------
 tests/ut/quantization/test_quant_config.py          |  6 +++---
 tests/ut/spec_decode/test_mtp_proposer.py           |  2 +-
 vllm_ascend/ascend_config.py                        |  4 ++--
 vllm_ascend/ascend_forward_context.py               |  4 ++--
 vllm_ascend/attention/sfa_v1.py                     |  4 ++--
 vllm_ascend/distributed/kvpool/pool_worker.py       |  2 +-
 vllm_ascend/distributed/mooncake_connector.py       |  2 +-
 vllm_ascend/distributed/parallel_state.py           |  2 +-
 vllm_ascend/eplb/utils.py                           |  2 +-
 vllm_ascend/ops/linear_op.py                        |  2 +-
 vllm_ascend/ops/mla.py                              |  2 +-
 vllm_ascend/ops/shared_weight_layer.py              |  2 +-
 vllm_ascend/platform.py                             |  3 ++-
 vllm_ascend/quantization/quant_config.py            |  2 +-
 vllm_ascend/spec_decode/eagle_proposer.py           |  2 +-
 vllm_ascend/utils.py                                | 12 ++++++------
 vllm_ascend/worker/model_runner_v1.py               |  6 +++---
 vllm_ascend/worker/worker.py                        |  2 +-
 vllm_ascend/xlite/xlite.py                          |  4 ++--
 23 files changed, 44 insertions(+), 43 deletions(-)

diff --git a/tests/e2e/multicard/test_aclgraph_capture_replay.py b/tests/e2e/multicard/test_aclgraph_capture_replay.py
index c06f1a07..c4195bae 100644
--- a/tests/e2e/multicard/test_aclgraph_capture_replay.py
+++ b/tests/e2e/multicard/test_aclgraph_capture_replay.py
@@ -114,7 +114,7 @@ def _run_worker_process(
 
     # Expose model config to the main test process
     counters["hidden_layers"].value = (
-        llm.llm_engine.model_config.hf_config.num_hidden_layers)
+        llm.llm_engine.model_config.hf_text_config.num_hidden_layers)
 
     llm.generate(local_prompts,
                  SamplingParams(max_tokens=max_tokens, temperature=0.0))
diff --git a/tests/ut/core/test_scheduler_dynamic_batch.py b/tests/ut/core/test_scheduler_dynamic_batch.py
index 8f38c18f..8d52e35b 100644
--- a/tests/ut/core/test_scheduler_dynamic_batch.py
+++ b/tests/ut/core/test_scheduler_dynamic_batch.py
@@ -130,8 +130,8 @@ class TestSchedulerDynamicBatch(TestBase):
         )
         model_config.pooler_config = MagicMock()
         model_config.multimodal_config = MagicMock()
-        model_config.hf_config = MagicMock()
-        model_config.hf_config.is_encoder_decoder = False
+        model_config.hf_text_config = MagicMock()
+        model_config.hf_text_config.is_encoder_decoder = False
 
         # Cache config, optionally force APC
         kwargs_cache: Dict[str, Any] = ({} if ENABLE_PREFIX_CACHING is None else {
diff --git a/tests/ut/ops/test_mla.py b/tests/ut/ops/test_mla.py
index 28363450..d4501145 100644
--- a/tests/ut/ops/test_mla.py
+++ b/tests/ut/ops/test_mla.py
@@ -87,7 +87,7 @@ class TestAscendMultiHeadLatentAttention(TestBase):
         mock_tp_size.return_value = 2
         mock_ascend_config.return_value.enable_shared_expert_dp = True
         mock_vllm_config = MagicMock(spec=VllmConfig)
-        mock_vllm_config.model_config.hf_config = MagicMock(
+        mock_vllm_config.model_config.hf_text_config = MagicMock(
             num_hidden_layers=32, first_k_dense_replace=True)
         mock_get_vllm_config.return_value = mock_vllm_config
         mock_vllm_config.compilation_config = CompilationConfig()
@@ -122,7 +122,7 @@ class TestAscendMultiHeadLatentAttention(TestBase):
         mock_tp_size.return_value = 1
         mock_ascend_config.return_value.enable_shared_expert_dp = False
         mock_vllm_config = MagicMock(spec=VllmConfig)
-        mock_vllm_config.model_config.hf_config = MagicMock(
+        mock_vllm_config.model_config.hf_text_config = MagicMock(
             num_hidden_layers=32, first_k_dense_replace=False)
         mock_get_vllm_config.return_value = mock_vllm_config
         mock_vllm_config.compilation_config = CompilationConfig()
diff --git a/tests/ut/ops/test_rotary_embedding.py b/tests/ut/ops/test_rotary_embedding.py
index 569b70ab..567c15d9 100644
--- a/tests/ut/ops/test_rotary_embedding.py
+++ b/tests/ut/ops/test_rotary_embedding.py
@@ -115,7 +115,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
         model_config = ModelConfig(MODEL,
                                    tokenizer=MODEL,
                                    max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
         vllm_config.model_config = model_config
         with set_ascend_forward_context(None, vllm_config):
             result_q, result_k = self.layer.forward(self.positions, self.query,
@@ -141,7 +141,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
         model_config = ModelConfig(MODEL,
                                    tokenizer=MODEL,
                                    max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
         vllm_config.model_config = model_config
         with set_ascend_forward_context(None, vllm_config):
             result_q, result_k = self.layer.forward(self.positions,
@@ -164,7 +164,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
         model_config = ModelConfig(MODEL,
                                    tokenizer=MODEL,
                                    max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
         vllm_config.model_config = model_config
         with set_ascend_forward_context(None, vllm_config):
             self.layer.forward(self.positions, self.query, self.key,
@@ -184,7 +184,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
         model_config = ModelConfig(MODEL,
                                    tokenizer=MODEL,
                                    max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
         vllm_config.model_config = model_config
         with set_ascend_forward_context(None, vllm_config):
             result_q, result_k = self.layer.forward(
@@ -213,7 +213,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
         model_config = ModelConfig(MODEL,
                                    tokenizer=MODEL,
                                    max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
         vllm_config.model_config = model_config
         with set_ascend_forward_context(None, vllm_config):
             result_q, result_k = self.layer.forward(self.positions, self.query,
@@ -404,7 +404,7 @@ class TestAscendMRotaryEmbedding(unittest.TestCase):
         model_config = ModelConfig(MODEL_VL,
                                    tokenizer=MODEL_VL,
                                    max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
         vllm_config.model_config = model_config
 
         return vllm_config
diff --git a/tests/ut/quantization/test_quant_config.py b/tests/ut/quantization/test_quant_config.py
index 2ba313cb..f75f8042 100644
--- a/tests/ut/quantization/test_quant_config.py
+++ b/tests/ut/quantization/test_quant_config.py
@@ -79,7 +79,7 @@ class TestAscendQuantConfig(TestBase):
 
     def test_get_quant_method_for_linear(self):
         mock_config = MagicMock()
-        mock_config.model_config.hf_config.model_type = None
+        mock_config.model_config.hf_text_config.model_type = None
         linear_layer = MagicMock(spec=LinearBase)
         # Test skipped layer
         with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
@@ -103,7 +103,7 @@ class TestAscendQuantConfig(TestBase):
     def test_get_quant_method_for_attention(self):
         attention_layer = MagicMock(spec=Attention)
         mock_config = MagicMock()
-        mock_config.model_config.hf_config.model_type = None
+        mock_config.model_config.hf_text_config.model_type = None
         with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
             patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
                 return_value=MagicMock()) as mock_ascend_kvcache:
@@ -117,7 +117,7 @@ class TestAscendQuantConfig(TestBase):
         fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
         fused_moe_layer.moe_config = MagicMock(spec=FusedMoEConfig)
         mock_config = MagicMock()
-        mock_config.model_config.hf_config.model_type = None
+        mock_config.model_config.hf_text_config.model_type = None
 
         # Test skipped layer
         with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=True), \
diff --git a/tests/ut/spec_decode/test_mtp_proposer.py b/tests/ut/spec_decode/test_mtp_proposer.py
index d6915cfb..703c1597 100644
--- a/tests/ut/spec_decode/test_mtp_proposer.py
+++ b/tests/ut/spec_decode/test_mtp_proposer.py
@@ -41,7 +41,7 @@ class TestMtpProposer:
         config.model_config.dtype = torch.float16
         config.model_config.max_model_len = 2048
         config.model_config.uses_mrope = False
-        config.model_config.hf_config = None
+        config.model_config.hf_text_config = None
 
         config.load_config = None
 
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
index fec3ade8..ad53e687 100644
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -96,7 +96,7 @@ class AscendConfig:
             try:
                 # only support Qwen model now
                 # TODO: use a more robust method to get kv_head_num
-                num_kv_head = vllm_config.model_config.hf_config.num_key_value_heads
+                num_kv_head = vllm_config.model_config.hf_text_config.num_key_value_heads
                 self.num_head_replica = prefill_tp_size // num_kv_head if prefill_tp_size >= num_kv_head else 1
                 prefill_tp_size = min(prefill_tp_size, num_kv_head)
                 decode_tp_size = min(decode_tp_size, num_kv_head)
@@ -126,7 +126,7 @@ class AscendConfig:
 
         self.enable_kv_nz = additional_config.get("enable_kv_nz", False)
         if self.enable_kv_nz:
-            use_sparse = hasattr(vllm_config.model_config.hf_config,
+            use_sparse = hasattr(vllm_config.model_config.hf_text_config,
                                  "index_topk")
             if not vllm_config.model_config.is_deepseek_mla or use_sparse:
                 raise RuntimeError(
diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py
index 6baa199b..be528453 100644
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -224,8 +224,8 @@ def select_moe_comm_method(num_tokens: int,
     mc2_tokens_capacity = get_mc2_tokens_capacity()
     soc_version = get_ascend_device_type()
     quant_type = getattr(
-        vllm_config.model_config.hf_config, 'moe_quantize',
-        getattr(vllm_config.model_config.hf_config, 'quantize', None))
+        vllm_config.model_config.hf_text_config, 'moe_quantize',
+        getattr(vllm_config.model_config.hf_text_config, 'quantize', None))
 
     if not vllm_config.parallel_config.enable_expert_parallel or get_ep_group(
     ).world_size == 1:
diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py
index 12ac00bc..119eef56 100644
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -149,7 +149,7 @@ class AscendSFAMetadataBuilder(MLACommonMetadataBuilder[AscendSFAMetadata]):
         self.rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
 
         self.enable_sfa_cp = enable_sp() and \
-            hasattr(self.model_config.hf_config, "index_topk")
+            hasattr(self.model_config.hf_text_config, "index_topk")
 
         assert not (
             self.enable_sfa_cp
@@ -963,7 +963,7 @@ class AscendSFAImpl(MLAAttentionImpl):
         # Dispose tensor from the original o_proj
         dispose_layer(self.o_proj)
         # Construct the new o_proj using ReplicatedLinear
-        config = vllm_config.model_config.hf_config
+        config = vllm_config.model_config.hf_text_config
         new_o_proj = ReplicatedLinear(config.num_attention_heads *
                                       config.v_head_dim,
                                       config.hidden_size,
diff --git a/vllm_ascend/distributed/kvpool/pool_worker.py b/vllm_ascend/distributed/kvpool/pool_worker.py
index 8a5e6718..863ee2bc 100644
--- a/vllm_ascend/distributed/kvpool/pool_worker.py
+++ b/vllm_ascend/distributed/kvpool/pool_worker.py
@@ -96,7 +96,7 @@ class KVPoolWorker:
 
         partitions = None
         if self.kv_role == "kv_consumer" and self.consumer_is_to_put:
-            num_hidden_layers = model_config.hf_config.num_hidden_layers
+            num_hidden_layers = model_config.hf_text_config.num_hidden_layers
             partition_list_str = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
                 "prefill_pp_layer_partition", None)
             prefill_pp_size = int(
diff --git a/vllm_ascend/distributed/mooncake_connector.py b/vllm_ascend/distributed/mooncake_connector.py
index 2b0fe92a..1d3619ab 100644
--- a/vllm_ascend/distributed/mooncake_connector.py
+++ b/vllm_ascend/distributed/mooncake_connector.py
@@ -345,7 +345,7 @@ class KVCacheRecvingThread(threading.Thread):
         self.vllm_config = vllm_config
         self.model_config = self.vllm_config.model_config
         self.block_size = self.vllm_config.cache_config.block_size
-        self.num_layers = self.model_config.hf_config.num_hidden_layers
+        self.num_layers = self.model_config.hf_text_config.num_hidden_layers
 
         self.pp_layer_indices = {
             rank: get_prefill_pp_indices(self.num_layers, rank,
diff --git a/vllm_ascend/distributed/parallel_state.py b/vllm_ascend/distributed/parallel_state.py
index e886a311..4d50cec0 100644
--- a/vllm_ascend/distributed/parallel_state.py
+++ b/vllm_ascend/distributed/parallel_state.py
@@ -167,7 +167,7 @@ def init_ascend_model_parallel(parallel_config: ParallelConfig, ):
     global _SHARED_WEIGHT
     # TODO: Check if the model is Deepseek V3.2 with enabled SFA CP and activated shared weights. It will then be normalized within the PCP parameters. -- clrs97
-    is_ds_v32 = hasattr(vllm_config.model_config.hf_config, "index_topk")
+    is_ds_v32 = hasattr(vllm_config.model_config.hf_text_config, "index_topk")
     if enable_sp() and is_ds_v32 and _SHARED_WEIGHT is None:
         _SHARED_WEIGHT = _create_shared_weight_group("CP_shared_weight")
 
     # TODO: Extract and unify the logic across different communication group.
diff --git a/vllm_ascend/eplb/utils.py b/vllm_ascend/eplb/utils.py
index 7099c25f..6f703f10 100644
--- a/vllm_ascend/eplb/utils.py
+++ b/vllm_ascend/eplb/utils.py
@@ -69,7 +69,7 @@ def model_register(model, model_config):
     model.get_all_moe_loads = types.MethodType(get_all_moe_loads, model)
     model.clear_all_moe_loads = types.MethodType(clear_all_moe_loads, model)
 
-    config = model_config.hf_config
+    config = model_config.hf_text_config
 
     if config.model_type == "qwen3_moe":
         model.num_moe_layers = config.num_hidden_layers
diff --git a/vllm_ascend/ops/linear_op.py b/vllm_ascend/ops/linear_op.py
index 674dab54..53130e67 100644
--- a/vllm_ascend/ops/linear_op.py
+++ b/vllm_ascend/ops/linear_op.py
@@ -697,7 +697,7 @@ def is_moe_layer(prefix: str) -> bool:
 def get_moe_params():
     from vllm.config import get_current_vllm_config
     vllm_config = get_current_vllm_config()
-    config = vllm_config.model_config.hf_config
+    config = vllm_config.model_config.hf_text_config
     n_routed_experts = getattr(config, 'n_routed_experts', 0)
     first_k_dense_replace = getattr(config, 'first_k_dense_replace',
                                     float('inf'))
diff --git a/vllm_ascend/ops/mla.py b/vllm_ascend/ops/mla.py
index 1c952aa6..111b9cdc 100644
--- a/vllm_ascend/ops/mla.py
+++ b/vllm_ascend/ops/mla.py
@@ -91,7 +91,7 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
         self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
         self.v_head_dim = v_head_dim
         self.prefix = prefix
-        hf_config = get_current_vllm_config().model_config.hf_config
+        hf_config = get_current_vllm_config().model_config.hf_text_config
         self.enable_shared_expert_dp = get_ascend_config(
         ).enable_shared_expert_dp
         self.tp_size = get_tensor_model_parallel_world_size()
diff --git a/vllm_ascend/ops/shared_weight_layer.py b/vllm_ascend/ops/shared_weight_layer.py
index 48a5179f..1dc2e88d 100644
--- a/vllm_ascend/ops/shared_weight_layer.py
+++ b/vllm_ascend/ops/shared_weight_layer.py
@@ -247,6 +247,6 @@ def reach_layer_for_shared_weight_series(layer: LinearBase):
 
 
 def is_hidden_layer(vllm_config, layer: LinearBase) -> bool:
-    num_hidden_layers = vllm_config.model_config.hf_config.num_hidden_layers
+    num_hidden_layers = vllm_config.model_config.hf_text_config.num_hidden_layers
     layer_idx = extract_layer_index(layer.prefix)
     return layer_idx < num_hidden_layers
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 51bc5e66..4a60d3d5 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -174,7 +174,8 @@ class NPUPlatform(Platform):
                 ) if not isinstance(ascend_compilation_config,
                                     dict) else ascend_compilation_config)
 
-        elif model_config and hasattr(model_config.hf_config, "index_topk"):
+        elif model_config and hasattr(model_config.hf_text_config,
+                                      "index_topk"):
             vllm_config.cache_config.cache_dtype = str(
                 model_config.dtype).replace("torch.", "")
         if model_config is None:
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index 49a1a5ba..f6a98241 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -116,7 +116,7 @@ class AscendQuantConfig(QuantizationConfig):
     def get_quant_method(self, layer: torch.nn.Module,
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         vllm_config = get_current_vllm_config()
-        model_type = vllm_config.model_config.hf_config.model_type
+        model_type = vllm_config.model_config.hf_text_config.model_type
         if model_type in packed_modules_model_mapping:
             self.packed_modules_mapping = packed_modules_model_mapping[
                 model_type]
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 625908cd..4fbf8532 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -90,7 +90,7 @@ class EagleProposer(VllmEagleProposer):
             self.runner.max_num_tokens * self.pcp_size * self.dcp_size +
             self.pcp_size * self.dcp_size * self.runner.max_num_reqs)
 
-        self.use_sparse = hasattr(vllm_config.model_config.hf_config,
+        self.use_sparse = hasattr(vllm_config.model_config.hf_text_config,
                                   "index_topk")
 
     def _get_eagle3_use_aux_hidden_state_from_config(self) -> bool:
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index bbe63625..d9d92754 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -468,7 +468,7 @@ def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
     # on special shapes.
     # TODO(Angazenn): we will remove this once _npu_paged_attention is fully
     # replaced by npu_fused_infer_attention_score which does not contain such bugs.
-    if vllm_config.model_config and vllm_config.model_config.hf_config.model_type == "qwen3_moe" \
+    if vllm_config.model_config and vllm_config.model_config.hf_text_config.model_type == "qwen3_moe" \
         and vllm_config.parallel_config.tensor_parallel_size == 1 \
         and vllm_config.parallel_config.data_parallel_size > 1 :
 
@@ -503,7 +503,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
         )
         return
 
-    hf_config = vllm_config.model_config.hf_config
+    hf_config = vllm_config.model_config.hf_text_config
     if hasattr(hf_config, 'num_hidden_layers'):
         num_hidden_layers = hf_config.num_hidden_layers
     else:
@@ -826,7 +826,7 @@ def is_moe_model(vllm_config: VllmConfig):
     """Checks if the model is a MoE model by config"""
     global _IS_MOE_MODEL
     if _IS_MOE_MODEL is None:
-        model_configs = vllm_config.model_config.hf_config.to_dict()
+        model_configs = vllm_config.model_config.hf_text_config.to_dict()
         _IS_MOE_MODEL = _is_contain_expert(model_configs)
     return _IS_MOE_MODEL
 
@@ -842,7 +842,7 @@ def speculative_enable_dispatch_gmm_combine_decode(
     if speculative_method in ["eagle", "eagle3"]:
         return False
     if speculative_method == "mtp":
-        mtp_quant_type = getattr(vllm_config.model_config.hf_config,
+        mtp_quant_type = getattr(vllm_config.model_config.hf_text_config,
                                  "mtp_quantize", None)
         return mtp_quant_type == "w8a8_dynamic"
     return False
@@ -875,7 +875,7 @@ def has_rope(vllm_config: VllmConfig):
     """Checks if the model uses rope."""
     global _HAS_ROPE
     if _HAS_ROPE is None and vllm_config and vllm_config.model_config:
-        hf_config = vllm_config.model_config.hf_config.to_dict()
+        hf_config = vllm_config.model_config.hf_text_config.to_dict()
         _HAS_ROPE = "rope_parameters" in hf_config
     return _HAS_ROPE
 
@@ -1091,7 +1091,7 @@ def refresh_block_size(vllm_config):
         return
 
     # TODO(MengqingCao): Remove the model_type check, after resolving the hidden error in get_kv_cache_groups.
-    if not model_config.hf_config.model_type == "qwen3_next" and cache_config.block_size != 128:
+    if not model_config.hf_text_config.model_type == "qwen3_next" and cache_config.block_size != 128:
         if cache_config.enable_prefix_caching or scheduler_config.enable_chunked_prefill:
             logger.info(
                 "Block size is set to 128 if prefix cache or chunked prefill is enabled."
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 47f66d13..0eab7345 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -253,7 +253,7 @@ class NPUModelRunner(GPUModelRunner):
         self.is_multimodal_model = self.model_config.is_multimodal_model
         self.block_size = vllm_config.cache_config.block_size
         # Set up Attention
-        self.use_sparse = hasattr(self.vllm_config.model_config.hf_config,
+        self.use_sparse = hasattr(self.vllm_config.model_config.hf_text_config,
                                   "index_topk")
         self.attn_backend = get_attn_backend(
             0,
@@ -2398,7 +2398,7 @@ class NPUModelRunner(GPUModelRunner):
                 kv_caches[layer_name] = kv_caches[target_layer_name]
 
         from vllm.v1.worker.utils import bind_kv_cache
-        num_attn_module = 2 if self.model_config.hf_config.model_type == "longcat_flash" else 1
+        num_attn_module = 2 if self.model_config.hf_text_config.model_type == "longcat_flash" else 1
         bind_kv_cache(kv_caches,
                       self.compilation_config.static_forward_context,
                       self.kv_caches, num_attn_module)
@@ -2932,7 +2932,7 @@ class NPUModelRunner(GPUModelRunner):
         mamba_layers = get_layers_from_vllm_config(self.vllm_config, MambaBase)
         if len(mamba_layers) > 0:
             if (self.vllm_config.speculative_config is not None
-                    and self.vllm_config.model_config.hf_config.model_type
+                    and self.vllm_config.model_config.hf_text_config.model_type
                     not in ["qwen3_next"]):
                 raise NotImplementedError(
                     "Mamba with speculative decoding is not supported yet.")
diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py
index 0b290a57..ea69c3f8 100644
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -173,7 +173,7 @@ class NPUWorker(WorkerBase):
             allocator = CaMemAllocator.get_instance()
             allocator.wake_up(tags=tags)
 
-        hidden_size = self.vllm_config.model_config.hf_config.hidden_size
+        hidden_size = self.vllm_config.model_config.hf_text_config.hidden_size
         model = self.model_runner.model
         if tags is None or "weights" in tags:
            for name, param in model.named_parameters():
diff --git a/vllm_ascend/xlite/xlite.py b/vllm_ascend/xlite/xlite.py
index 00f6b542..e6c7437a 100644
--- a/vllm_ascend/xlite/xlite.py
+++ b/vllm_ascend/xlite/xlite.py
@@ -61,7 +61,7 @@ class LlamaXliteModel(XliteModel):
         xlite_model.embed = params_dict.get(model_prefix +
                                             "model.embed_tokens.weight")
         xlite_model.norm = params_dict.get(model_prefix + "model.norm.weight")
-        if vllm_config.model_config.hf_config.tie_word_embeddings:
+        if vllm_config.model_config.hf_text_config.tie_word_embeddings:
             xlite_model.head = xlite_model.embed
         else:
             xlite_model.head = params_dict.get(model_prefix + "lm_head.weight")
@@ -118,7 +118,7 @@ class LlamaXliteModel(XliteModel):
         return (xlite_model, freq_cis, config.hidden_size, dtype)
 
     def _build_model_config(self, vllm_config: VllmConfig) -> ModelConfig:
-        hf_config = vllm_config.model_config.hf_config
+        hf_config = vllm_config.model_config.hf_text_config
         if hasattr(hf_config, "text_config"):
             hf_config = hf_config.text_config
         config = ModelConfig()
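
### Appendix: `hf_config` vs. `hf_text_config` (illustrative sketch, not part of the patch)

The sketch below shows the distinction this patch relies on. For a multimodal model, `hf_config` is the composite config and per-layer text fields such as `num_hidden_layers` live on its nested text config; `transformers`' `PretrainedConfig.get_text_config()` resolves to that nested config (or to the config itself for text-only models), and vLLM's `model_config.hf_text_config` builds on the same resolution. The model name and printed values are assumptions chosen for illustration.

```python
# Minimal sketch: composite multimodal configs nest their text fields, so
# reading them off the top-level config can fail, while the resolved text
# config always carries them.
from transformers import AutoConfig

# Illustrative choice of a composite VLM config; other VLMs behave similarly.
config = AutoConfig.from_pretrained("llava-hf/llava-1.5-7b-hf")

# The composite config keeps text fields under config.text_config.
print(hasattr(config, "num_hidden_layers"))  # False on the composite config
print(config.text_config.num_hidden_layers)  # e.g. 32 for the Llama-7B text tower

# get_text_config() returns the nested text config for composite models and
# the config itself for text-only models.
print(config.get_text_config().num_hidden_layers)  # same value as above
```

For a text-only model both attributes resolve to the same object, which is why this rename is a no-op for pure LLMs and only changes behavior for multimodal configs.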