From b94d5897691bb4f7cb49dca57e580f7bf4127cae Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Tue, 6 Jan 2026 16:41:39 +0800
Subject: [PATCH] [MM][Bugfix] Update `hf_config` to `hf_text_config` (#5319)

### What this PR does / why we need it?
Following https://github.com/vllm-project/vllm-ascend/pull/5205, update the remaining `hf_config` accesses to `hf_text_config`, so that text-config attributes (e.g. `num_hidden_layers`, `model_type`) resolve correctly for multimodal models whose composite config nests them. See https://github.com/vllm-project/vllm-ascend/pull/5205#issuecomment-3675417534 and https://github.com/vllm-project/vllm-ascend/pull/5205#issuecomment-3677920872 for more details; an illustrative sketch of the `hf_config` vs. `hf_text_config` distinction is appended after the diff.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
- vLLM version: release/v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/5fbfa8d9ef15948599631baeb91e8220b2ee9bcc

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 tests/e2e/multicard/test_aclgraph_capture_replay.py |  2 +-
 tests/ut/core/test_scheduler_dynamic_batch.py       |  4 ++--
 tests/ut/ops/test_mla.py                            |  4 ++--
 tests/ut/ops/test_rotary_embedding.py               | 12 ++++++------
 tests/ut/quantization/test_quant_config.py          |  6 +++---
 tests/ut/spec_decode/test_mtp_proposer.py           |  2 +-
 vllm_ascend/ascend_config.py                        |  4 ++--
 vllm_ascend/ascend_forward_context.py               |  4 ++--
 vllm_ascend/attention/sfa_v1.py                     |  4 ++--
 vllm_ascend/distributed/kvpool/pool_worker.py       |  2 +-
 vllm_ascend/distributed/mooncake_connector.py       |  2 +-
 vllm_ascend/distributed/parallel_state.py           |  2 +-
 vllm_ascend/eplb/utils.py                           |  2 +-
 vllm_ascend/ops/linear_op.py                        |  2 +-
 vllm_ascend/ops/mla.py                              |  2 +-
 vllm_ascend/ops/shared_weight_layer.py              |  2 +-
 vllm_ascend/platform.py                             |  3 ++-
 vllm_ascend/quantization/quant_config.py            |  2 +-
 vllm_ascend/spec_decode/eagle_proposer.py           |  2 +-
 vllm_ascend/utils.py                                | 12 ++++++------
 vllm_ascend/worker/model_runner_v1.py               |  6 +++---
 vllm_ascend/worker/worker.py                        |  2 +-
 vllm_ascend/xlite/xlite.py                          |  4 ++--
 23 files changed, 44 insertions(+), 43 deletions(-)

diff --git a/tests/e2e/multicard/test_aclgraph_capture_replay.py b/tests/e2e/multicard/test_aclgraph_capture_replay.py
index c06f1a07..c4195bae 100644
--- a/tests/e2e/multicard/test_aclgraph_capture_replay.py
+++ b/tests/e2e/multicard/test_aclgraph_capture_replay.py
@@ -114,7 +114,7 @@ def _run_worker_process(
 
     # Expose model config to the main test process
     counters["hidden_layers"].value = (
-        llm.llm_engine.model_config.hf_config.num_hidden_layers)
+        llm.llm_engine.model_config.hf_text_config.num_hidden_layers)
 
     llm.generate(local_prompts,
                  SamplingParams(max_tokens=max_tokens, temperature=0.0))
diff --git a/tests/ut/core/test_scheduler_dynamic_batch.py b/tests/ut/core/test_scheduler_dynamic_batch.py
index 8f38c18f..8d52e35b 100644
--- a/tests/ut/core/test_scheduler_dynamic_batch.py
+++ b/tests/ut/core/test_scheduler_dynamic_batch.py
@@ -130,8 +130,8 @@ class TestSchedulerDynamicBatch(TestBase):
         )
         model_config.pooler_config = MagicMock()
         model_config.multimodal_config = MagicMock()
-        model_config.hf_config = MagicMock()
-        model_config.hf_config.is_encoder_decoder = False
+        model_config.hf_text_config = MagicMock()
+        model_config.hf_text_config.is_encoder_decoder = False
 
         # Cache config, optionally force APC
         kwargs_cache: Dict[str, Any] = ({} if ENABLE_PREFIX_CACHING is None else {
diff --git a/tests/ut/ops/test_mla.py b/tests/ut/ops/test_mla.py
index 28363450..d4501145 100644
--- a/tests/ut/ops/test_mla.py
+++ b/tests/ut/ops/test_mla.py
@@ -87,7 +87,7 @@ class TestAscendMultiHeadLatentAttention(TestBase):
         mock_tp_size.return_value = 2
         mock_ascend_config.return_value.enable_shared_expert_dp = True
         mock_vllm_config = MagicMock(spec=VllmConfig)
-        mock_vllm_config.model_config.hf_config = MagicMock(
+        mock_vllm_config.model_config.hf_text_config = MagicMock(
             num_hidden_layers=32, first_k_dense_replace=True)
         mock_get_vllm_config.return_value = mock_vllm_config
         mock_vllm_config.compilation_config = CompilationConfig()
@@ -122,7 +122,7 @@ class TestAscendMultiHeadLatentAttention(TestBase):
         mock_tp_size.return_value = 1
         mock_ascend_config.return_value.enable_shared_expert_dp = False
         mock_vllm_config = MagicMock(spec=VllmConfig)
-        mock_vllm_config.model_config.hf_config = MagicMock(
+        mock_vllm_config.model_config.hf_text_config = MagicMock(
             num_hidden_layers=32, first_k_dense_replace=False)
         mock_get_vllm_config.return_value = mock_vllm_config
         mock_vllm_config.compilation_config = CompilationConfig()
diff --git a/tests/ut/ops/test_rotary_embedding.py b/tests/ut/ops/test_rotary_embedding.py
index 569b70ab..567c15d9 100644
--- a/tests/ut/ops/test_rotary_embedding.py
+++ b/tests/ut/ops/test_rotary_embedding.py
@@ -115,7 +115,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
         model_config = ModelConfig(MODEL,
                                    tokenizer=MODEL,
                                    max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
         vllm_config.model_config = model_config
         with set_ascend_forward_context(None, vllm_config):
             result_q, result_k = self.layer.forward(self.positions, self.query,
@@ -141,7 +141,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
         model_config = ModelConfig(MODEL,
                                    tokenizer=MODEL,
                                    max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
         vllm_config.model_config = model_config
         with set_ascend_forward_context(None, vllm_config):
             result_q, result_k = self.layer.forward(self.positions,
@@ -164,7 +164,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
         model_config = ModelConfig(MODEL,
                                    tokenizer=MODEL,
                                    max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
         vllm_config.model_config = model_config
         with set_ascend_forward_context(None, vllm_config):
             self.layer.forward(self.positions, self.query, self.key,
@@ -184,7 +184,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
         model_config = ModelConfig(MODEL,
                                    tokenizer=MODEL,
                                    max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
         vllm_config.model_config = model_config
         with set_ascend_forward_context(None, vllm_config):
             result_q, result_k = self.layer.forward(
@@ -213,7 +213,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
         model_config = ModelConfig(MODEL,
                                    tokenizer=MODEL,
                                    max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
         vllm_config.model_config = model_config
         with set_ascend_forward_context(None, vllm_config):
             result_q, result_k = self.layer.forward(self.positions, self.query,
@@ -404,7 +404,7 @@ class TestAscendMRotaryEmbedding(unittest.TestCase):
         model_config = ModelConfig(MODEL_VL,
                                    tokenizer=MODEL_VL,
                                    max_model_len=MAX_NUM_BATCHED_TOKEND)
-        model_config.hf_config = PretrainedConfig()
+        model_config.hf_text_config = PretrainedConfig()
         vllm_config.model_config = model_config
 
         return vllm_config
diff --git a/tests/ut/quantization/test_quant_config.py b/tests/ut/quantization/test_quant_config.py
index 2ba313cb..f75f8042 100644
--- a/tests/ut/quantization/test_quant_config.py
+++ b/tests/ut/quantization/test_quant_config.py
@@ -79,7 +79,7 @@ class TestAscendQuantConfig(TestBase):
 
     def test_get_quant_method_for_linear(self):
         mock_config = MagicMock()
-        mock_config.model_config.hf_config.model_type = None
+        mock_config.model_config.hf_text_config.model_type = None
         linear_layer = MagicMock(spec=LinearBase)
         # Test skipped layer
         with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
@@ -103,7 +103,7 @@ class TestAscendQuantConfig(TestBase):
     def test_get_quant_method_for_attention(self):
         attention_layer = MagicMock(spec=Attention)
         mock_config = MagicMock()
-        mock_config.model_config.hf_config.model_type = None
+        mock_config.model_config.hf_text_config.model_type = None
         with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
             patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
                 return_value=MagicMock()) as mock_ascend_kvcache:
@@ -117,7 +117,7 @@ class TestAscendQuantConfig(TestBase):
         fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
         fused_moe_layer.moe_config = MagicMock(spec=FusedMoEConfig)
         mock_config = MagicMock()
-        mock_config.model_config.hf_config.model_type = None
+        mock_config.model_config.hf_text_config.model_type = None
 
         # Test skipped layer
         with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=True), \
diff --git a/tests/ut/spec_decode/test_mtp_proposer.py b/tests/ut/spec_decode/test_mtp_proposer.py
index d6915cfb..703c1597 100644
--- a/tests/ut/spec_decode/test_mtp_proposer.py
+++ b/tests/ut/spec_decode/test_mtp_proposer.py
@@ -41,7 +41,7 @@ class TestMtpProposer:
         config.model_config.dtype = torch.float16
         config.model_config.max_model_len = 2048
         config.model_config.uses_mrope = False
-        config.model_config.hf_config = None
+        config.model_config.hf_text_config = None
 
         config.load_config = None
 
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
index fec3ade8..ad53e687 100644
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -96,7 +96,7 @@ class AscendConfig:
             try:
                 # only support Qwen model now
                 # TODO: use a more robust method to get kv_head_num
-                num_kv_head = vllm_config.model_config.hf_config.num_key_value_heads
+                num_kv_head = vllm_config.model_config.hf_text_config.num_key_value_heads
                 self.num_head_replica = prefill_tp_size // num_kv_head if prefill_tp_size >= num_kv_head else 1
                 prefill_tp_size = min(prefill_tp_size, num_kv_head)
                 decode_tp_size = min(decode_tp_size, num_kv_head)
@@ -126,7 +126,7 @@ class AscendConfig:
 
         self.enable_kv_nz = additional_config.get("enable_kv_nz", False)
         if self.enable_kv_nz:
-            use_sparse = hasattr(vllm_config.model_config.hf_config,
+            use_sparse = hasattr(vllm_config.model_config.hf_text_config,
                                  "index_topk")
             if not vllm_config.model_config.is_deepseek_mla or use_sparse:
                 raise RuntimeError(
diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py
index 6baa199b..be528453 100644
--- a/vllm_ascend/ascend_forward_context.py
+++ b/vllm_ascend/ascend_forward_context.py
@@ -224,8 +224,8 @@ def select_moe_comm_method(num_tokens: int,
     mc2_tokens_capacity = get_mc2_tokens_capacity()
     soc_version = get_ascend_device_type()
     quant_type = getattr(
-        vllm_config.model_config.hf_config, 'moe_quantize',
-        getattr(vllm_config.model_config.hf_config, 'quantize', None))
+        vllm_config.model_config.hf_text_config, 'moe_quantize',
+        getattr(vllm_config.model_config.hf_text_config, 'quantize', None))
 
     if not vllm_config.parallel_config.enable_expert_parallel or get_ep_group(
     ).world_size == 1:
diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py
index 12ac00bc..119eef56 100644
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -149,7 +149,7 @@ class AscendSFAMetadataBuilder(MLACommonMetadataBuilder[AscendSFAMetadata]):
         self.rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
 
         self.enable_sfa_cp = enable_sp() and \
-            hasattr(self.model_config.hf_config, "index_topk")
+            hasattr(self.model_config.hf_text_config, "index_topk")
 
         assert not (
             self.enable_sfa_cp
@@ -963,7 +963,7 @@ class AscendSFAImpl(MLAAttentionImpl):
         # Dispose tensor from the original o_proj
         dispose_layer(self.o_proj)
         # Construct the new o_proj using ReplicatedLinear
-        config = vllm_config.model_config.hf_config
+        config = vllm_config.model_config.hf_text_config
         new_o_proj = ReplicatedLinear(config.num_attention_heads *
                                       config.v_head_dim,
                                       config.hidden_size,
diff --git a/vllm_ascend/distributed/kvpool/pool_worker.py b/vllm_ascend/distributed/kvpool/pool_worker.py
index 8a5e6718..863ee2bc 100644
--- a/vllm_ascend/distributed/kvpool/pool_worker.py
+++ b/vllm_ascend/distributed/kvpool/pool_worker.py
@@ -96,7 +96,7 @@ class KVPoolWorker:
 
         partitions = None
         if self.kv_role == "kv_consumer" and self.consumer_is_to_put:
-            num_hidden_layers = model_config.hf_config.num_hidden_layers
+            num_hidden_layers = model_config.hf_text_config.num_hidden_layers
             partition_list_str = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
                 "prefill_pp_layer_partition", None)
             prefill_pp_size = int(
diff --git a/vllm_ascend/distributed/mooncake_connector.py b/vllm_ascend/distributed/mooncake_connector.py
index 2b0fe92a..1d3619ab 100644
--- a/vllm_ascend/distributed/mooncake_connector.py
+++ b/vllm_ascend/distributed/mooncake_connector.py
@@ -345,7 +345,7 @@ class KVCacheRecvingThread(threading.Thread):
         self.vllm_config = vllm_config
         self.model_config = self.vllm_config.model_config
         self.block_size = self.vllm_config.cache_config.block_size
-        self.num_layers = self.model_config.hf_config.num_hidden_layers
+        self.num_layers = self.model_config.hf_text_config.num_hidden_layers
 
         self.pp_layer_indices = {
             rank: get_prefill_pp_indices(self.num_layers, rank,
diff --git a/vllm_ascend/distributed/parallel_state.py b/vllm_ascend/distributed/parallel_state.py
index e886a311..4d50cec0 100644
--- a/vllm_ascend/distributed/parallel_state.py
+++ b/vllm_ascend/distributed/parallel_state.py
@@ -167,7 +167,7 @@ def init_ascend_model_parallel(parallel_config: ParallelConfig, ):
     global _SHARED_WEIGHT
     # TODO: Check if the model is Deepseek V3.2 with enabled SFA CP and activated shared weights. It will then be normalized within the PCP parameters. -- clrs97
-    is_ds_v32 = hasattr(vllm_config.model_config.hf_config, "index_topk")
+    is_ds_v32 = hasattr(vllm_config.model_config.hf_text_config, "index_topk")
     if enable_sp() and is_ds_v32 and _SHARED_WEIGHT is None:
         _SHARED_WEIGHT = _create_shared_weight_group("CP_shared_weight")
 
     # TODO: Extract and unify the logic across different communication group.
diff --git a/vllm_ascend/eplb/utils.py b/vllm_ascend/eplb/utils.py
index 7099c25f..6f703f10 100644
--- a/vllm_ascend/eplb/utils.py
+++ b/vllm_ascend/eplb/utils.py
@@ -69,7 +69,7 @@ def model_register(model, model_config):
     model.get_all_moe_loads = types.MethodType(get_all_moe_loads, model)
     model.clear_all_moe_loads = types.MethodType(clear_all_moe_loads, model)
 
-    config = model_config.hf_config
+    config = model_config.hf_text_config
 
     if config.model_type == "qwen3_moe":
         model.num_moe_layers = config.num_hidden_layers
diff --git a/vllm_ascend/ops/linear_op.py b/vllm_ascend/ops/linear_op.py
index 674dab54..53130e67 100644
--- a/vllm_ascend/ops/linear_op.py
+++ b/vllm_ascend/ops/linear_op.py
@@ -697,7 +697,7 @@ def is_moe_layer(prefix: str) -> bool:
 def get_moe_params():
     from vllm.config import get_current_vllm_config
     vllm_config = get_current_vllm_config()
-    config = vllm_config.model_config.hf_config
+    config = vllm_config.model_config.hf_text_config
     n_routed_experts = getattr(config, 'n_routed_experts', 0)
     first_k_dense_replace = getattr(config, 'first_k_dense_replace',
                                     float('inf'))
diff --git a/vllm_ascend/ops/mla.py b/vllm_ascend/ops/mla.py
index 1c952aa6..111b9cdc 100644
--- a/vllm_ascend/ops/mla.py
+++ b/vllm_ascend/ops/mla.py
@@ -91,7 +91,7 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
         self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
         self.v_head_dim = v_head_dim
         self.prefix = prefix
-        hf_config = get_current_vllm_config().model_config.hf_config
+        hf_config = get_current_vllm_config().model_config.hf_text_config
         self.enable_shared_expert_dp = get_ascend_config(
         ).enable_shared_expert_dp
         self.tp_size = get_tensor_model_parallel_world_size()
diff --git a/vllm_ascend/ops/shared_weight_layer.py b/vllm_ascend/ops/shared_weight_layer.py
index 48a5179f..1dc2e88d 100644
--- a/vllm_ascend/ops/shared_weight_layer.py
+++ b/vllm_ascend/ops/shared_weight_layer.py
@@ -247,6 +247,6 @@ def reach_layer_for_shared_weight_series(layer: LinearBase):
 
 
 def is_hidden_layer(vllm_config, layer: LinearBase) -> bool:
-    num_hidden_layers = vllm_config.model_config.hf_config.num_hidden_layers
+    num_hidden_layers = vllm_config.model_config.hf_text_config.num_hidden_layers
     layer_idx = extract_layer_index(layer.prefix)
     return layer_idx < num_hidden_layers
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 51bc5e66..4a60d3d5 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -174,7 +174,8 @@ class NPUPlatform(Platform):
                 ) if not isinstance(ascend_compilation_config,
                                     dict) else ascend_compilation_config)
 
-        elif model_config and hasattr(model_config.hf_config, "index_topk"):
+        elif model_config and hasattr(model_config.hf_text_config,
+                                      "index_topk"):
             vllm_config.cache_config.cache_dtype = str(
                 model_config.dtype).replace("torch.", "")
         if model_config is None:
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index 49a1a5ba..f6a98241 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -116,7 +116,7 @@ class AscendQuantConfig(QuantizationConfig):
     def get_quant_method(self, layer: torch.nn.Module,
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         vllm_config = get_current_vllm_config()
-        model_type = vllm_config.model_config.hf_config.model_type
+        model_type = vllm_config.model_config.hf_text_config.model_type
         if model_type in packed_modules_model_mapping:
             self.packed_modules_mapping = packed_modules_model_mapping[
                 model_type]
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 625908cd..4fbf8532 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -90,7 +90,7 @@ class EagleProposer(VllmEagleProposer):
             self.runner.max_num_tokens * self.pcp_size * self.dcp_size +
             self.pcp_size * self.dcp_size * self.runner.max_num_reqs)
 
-        self.use_sparse = hasattr(vllm_config.model_config.hf_config,
+        self.use_sparse = hasattr(vllm_config.model_config.hf_text_config,
                                   "index_topk")
 
     def _get_eagle3_use_aux_hidden_state_from_config(self) -> bool:
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index bbe63625..d9d92754 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -468,7 +468,7 @@ def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
     # on special shapes.
     # TODO(Angazenn): we will remove this once _npu_paged_attention is fully
     # replaced by npu_fused_infer_attention_score which does not contain such bugs.
-    if vllm_config.model_config and vllm_config.model_config.hf_config.model_type == "qwen3_moe" \
+    if vllm_config.model_config and vllm_config.model_config.hf_text_config.model_type == "qwen3_moe" \
         and vllm_config.parallel_config.tensor_parallel_size == 1 \
         and vllm_config.parallel_config.data_parallel_size > 1 :
 
@@ -503,7 +503,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
         )
         return
 
-    hf_config = vllm_config.model_config.hf_config
+    hf_config = vllm_config.model_config.hf_text_config
     if hasattr(hf_config, 'num_hidden_layers'):
         num_hidden_layers = hf_config.num_hidden_layers
     else:
@@ -826,7 +826,7 @@ def is_moe_model(vllm_config: VllmConfig):
     """Checks if the model is a MoE model by config"""
     global _IS_MOE_MODEL
     if _IS_MOE_MODEL is None:
-        model_configs = vllm_config.model_config.hf_config.to_dict()
+        model_configs = vllm_config.model_config.hf_text_config.to_dict()
         _IS_MOE_MODEL = _is_contain_expert(model_configs)
     return _IS_MOE_MODEL
 
@@ -842,7 +842,7 @@ def speculative_enable_dispatch_gmm_combine_decode(
     if speculative_method in ["eagle", "eagle3"]:
         return False
     if speculative_method == "mtp":
-        mtp_quant_type = getattr(vllm_config.model_config.hf_config,
+        mtp_quant_type = getattr(vllm_config.model_config.hf_text_config,
                                  "mtp_quantize", None)
         return mtp_quant_type == "w8a8_dynamic"
     return False
@@ -875,7 +875,7 @@ def has_rope(vllm_config: VllmConfig):
     """Checks if the model uses rope."""
     global _HAS_ROPE
     if _HAS_ROPE is None and vllm_config and vllm_config.model_config:
-        hf_config = vllm_config.model_config.hf_config.to_dict()
+        hf_config = vllm_config.model_config.hf_text_config.to_dict()
         _HAS_ROPE = "rope_parameters" in hf_config
     return _HAS_ROPE
 
@@ -1091,7 +1091,7 @@ def refresh_block_size(vllm_config):
         return
 
     # TODO(MengqingCao): Remove the model_type check, after resolving the hidden error in get_kv_cache_groups.
-    if not model_config.hf_config.model_type == "qwen3_next" and cache_config.block_size != 128:
+    if not model_config.hf_text_config.model_type == "qwen3_next" and cache_config.block_size != 128:
         if cache_config.enable_prefix_caching or scheduler_config.enable_chunked_prefill:
             logger.info(
                 "Block size is set to 128 if prefix cache or chunked prefill is enabled."
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 47f66d13..0eab7345 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -253,7 +253,7 @@ class NPUModelRunner(GPUModelRunner):
         self.is_multimodal_model = self.model_config.is_multimodal_model
         self.block_size = vllm_config.cache_config.block_size
         # Set up Attention
-        self.use_sparse = hasattr(self.vllm_config.model_config.hf_config,
+        self.use_sparse = hasattr(self.vllm_config.model_config.hf_text_config,
                                   "index_topk")
         self.attn_backend = get_attn_backend(
             0,
@@ -2398,7 +2398,7 @@ class NPUModelRunner(GPUModelRunner):
                 kv_caches[layer_name] = kv_caches[target_layer_name]
 
         from vllm.v1.worker.utils import bind_kv_cache
-        num_attn_module = 2 if self.model_config.hf_config.model_type == "longcat_flash" else 1
+        num_attn_module = 2 if self.model_config.hf_text_config.model_type == "longcat_flash" else 1
         bind_kv_cache(kv_caches,
                       self.compilation_config.static_forward_context,
                       self.kv_caches, num_attn_module)
@@ -2932,7 +2932,7 @@ class NPUModelRunner(GPUModelRunner):
         mamba_layers = get_layers_from_vllm_config(self.vllm_config, MambaBase)
         if len(mamba_layers) > 0:
             if (self.vllm_config.speculative_config is not None
-                    and self.vllm_config.model_config.hf_config.model_type
+                    and self.vllm_config.model_config.hf_text_config.model_type
                     not in ["qwen3_next"]):
                 raise NotImplementedError(
                     "Mamba with speculative decoding is not supported yet.")
diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py
index 0b290a57..ea69c3f8 100644
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -173,7 +173,7 @@ class NPUWorker(WorkerBase):
             allocator = CaMemAllocator.get_instance()
             allocator.wake_up(tags=tags)
 
-        hidden_size = self.vllm_config.model_config.hf_config.hidden_size
+        hidden_size = self.vllm_config.model_config.hf_text_config.hidden_size
         model = self.model_runner.model
         if tags is None or "weights" in tags:
            for name, param in model.named_parameters():
diff --git a/vllm_ascend/xlite/xlite.py b/vllm_ascend/xlite/xlite.py
index 00f6b542..e6c7437a 100644
--- a/vllm_ascend/xlite/xlite.py
+++ b/vllm_ascend/xlite/xlite.py
@@ -61,7 +61,7 @@ class LlamaXliteModel(XliteModel):
         xlite_model.embed = params_dict.get(model_prefix +
                                             "model.embed_tokens.weight")
         xlite_model.norm = params_dict.get(model_prefix + "model.norm.weight")
-        if vllm_config.model_config.hf_config.tie_word_embeddings:
+        if vllm_config.model_config.hf_text_config.tie_word_embeddings:
             xlite_model.head = xlite_model.embed
         else:
             xlite_model.head = params_dict.get(model_prefix + "lm_head.weight")
@@ -118,7 +118,7 @@ class LlamaXliteModel(XliteModel):
         return (xlite_model, freq_cis, config.hidden_size, dtype)
 
     def _build_model_config(self, vllm_config: VllmConfig) -> ModelConfig:
-        hf_config = vllm_config.model_config.hf_config
+        hf_config = vllm_config.model_config.hf_text_config
         if hasattr(hf_config, "text_config"):
             hf_config = hf_config.text_config
         config = ModelConfig()
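
### Appendix: `hf_config` vs. `hf_text_config` (illustrative sketch, not part of the patch)

The sketch below shows the distinction this patch relies on. For a multimodal model, `hf_config` is the composite config and per-layer text fields such as `num_hidden_layers` live on its nested text config; `transformers`' `PretrainedConfig.get_text_config()` resolves to that nested config (or to the config itself for text-only models), and vLLM's `model_config.hf_text_config` builds on the same resolution. The model name and printed values are assumptions chosen for illustration.

```python
# Minimal sketch: composite multimodal configs nest their text fields, so
# reading them off the top-level config can fail, while the resolved text
# config always carries them.
from transformers import AutoConfig

# Illustrative choice of a composite VLM config; other VLMs behave similarly.
config = AutoConfig.from_pretrained("llava-hf/llava-1.5-7b-hf")

# The composite config keeps text fields under config.text_config.
print(hasattr(config, "num_hidden_layers"))  # False on the composite config
print(config.text_config.num_hidden_layers)  # e.g. 32 for the Llama-7B text tower

# get_text_config() returns the nested text config for composite models and
# the config itself for text-only models.
print(config.get_text_config().num_hidden_layers)  # same value as above
```

For a text-only model both attributes resolve to the same object, which is why this rename is a no-op for pure LLMs and only changes behavior for multimodal configs.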