[MM][Bugfix] Update hf_config to hf_text_config (#5319)
### What this PR does / why we need it?
Following https://github.com/vllm-project/vllm-ascend/pull/5205, replace
accesses to `model_config.hf_config` with `model_config.hf_text_config`
in both the runtime code and the unit tests.
For more details, see
https://github.com/vllm-project/vllm-ascend/pull/5205#issuecomment-3675417534
and
https://github.com/vllm-project/vllm-ascend/pull/5205#issuecomment-3677920872.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: release/v0.13.0
- vLLM main:
5fbfa8d9ef
Signed-off-by: shen-shanshan <467638484@qq.com>
This commit is contained in:
@@ -114,7 +114,7 @@ def _run_worker_process(
|
||||
|
||||
# Expose model config to the main test process
|
||||
counters["hidden_layers"].value = (
|
||||
llm.llm_engine.model_config.hf_config.num_hidden_layers)
|
||||
llm.llm_engine.model_config.hf_text_config.num_hidden_layers)
|
||||
|
||||
llm.generate(local_prompts,
|
||||
SamplingParams(max_tokens=max_tokens, temperature=0.0))
|
||||
|
||||
@@ -130,8 +130,8 @@ class TestSchedulerDynamicBatch(TestBase):
|
||||
)
|
||||
model_config.pooler_config = MagicMock()
|
||||
model_config.multimodal_config = MagicMock()
|
||||
model_config.hf_config = MagicMock()
|
||||
model_config.hf_config.is_encoder_decoder = False
|
||||
model_config.hf_text_config = MagicMock()
|
||||
model_config.hf_text_config.is_encoder_decoder = False
|
||||
# Cache config, optionally force APC
|
||||
kwargs_cache: Dict[str,
|
||||
Any] = ({} if ENABLE_PREFIX_CACHING is None else {
|
||||
|
||||
@@ -87,7 +87,7 @@ class TestAscendMultiHeadLatentAttention(TestBase):
|
||||
mock_tp_size.return_value = 2
|
||||
mock_ascend_config.return_value.enable_shared_expert_dp = True
|
||||
mock_vllm_config = MagicMock(spec=VllmConfig)
|
||||
mock_vllm_config.model_config.hf_config = MagicMock(
|
||||
mock_vllm_config.model_config.hf_text_config = MagicMock(
|
||||
num_hidden_layers=32, first_k_dense_replace=True)
|
||||
mock_get_vllm_config.return_value = mock_vllm_config
|
||||
mock_vllm_config.compilation_config = CompilationConfig()
|
||||
@@ -122,7 +122,7 @@ class TestAscendMultiHeadLatentAttention(TestBase):
|
||||
mock_tp_size.return_value = 1
|
||||
mock_ascend_config.return_value.enable_shared_expert_dp = False
|
||||
mock_vllm_config = MagicMock(spec=VllmConfig)
|
||||
mock_vllm_config.model_config.hf_config = MagicMock(
|
||||
mock_vllm_config.model_config.hf_text_config = MagicMock(
|
||||
num_hidden_layers=32, first_k_dense_replace=False)
|
||||
mock_get_vllm_config.return_value = mock_vllm_config
|
||||
mock_vllm_config.compilation_config = CompilationConfig()
|
||||
|
||||
@@ -115,7 +115,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
|
||||
model_config = ModelConfig(MODEL,
|
||||
tokenizer=MODEL,
|
||||
max_model_len=MAX_NUM_BATCHED_TOKEND)
|
||||
model_config.hf_config = PretrainedConfig()
|
||||
model_config.hf_text_config = PretrainedConfig()
|
||||
vllm_config.model_config = model_config
|
||||
with set_ascend_forward_context(None, vllm_config):
|
||||
result_q, result_k = self.layer.forward(self.positions, self.query,
|
||||
@@ -141,7 +141,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
|
||||
model_config = ModelConfig(MODEL,
|
||||
tokenizer=MODEL,
|
||||
max_model_len=MAX_NUM_BATCHED_TOKEND)
|
||||
model_config.hf_config = PretrainedConfig()
|
||||
model_config.hf_text_config = PretrainedConfig()
|
||||
vllm_config.model_config = model_config
|
||||
with set_ascend_forward_context(None, vllm_config):
|
||||
result_q, result_k = self.layer.forward(self.positions,
|
||||
@@ -164,7 +164,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
|
||||
model_config = ModelConfig(MODEL,
|
||||
tokenizer=MODEL,
|
||||
max_model_len=MAX_NUM_BATCHED_TOKEND)
|
||||
model_config.hf_config = PretrainedConfig()
|
||||
model_config.hf_text_config = PretrainedConfig()
|
||||
vllm_config.model_config = model_config
|
||||
with set_ascend_forward_context(None, vllm_config):
|
||||
self.layer.forward(self.positions, self.query, self.key,
|
||||
@@ -184,7 +184,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
|
||||
model_config = ModelConfig(MODEL,
|
||||
tokenizer=MODEL,
|
||||
max_model_len=MAX_NUM_BATCHED_TOKEND)
|
||||
model_config.hf_config = PretrainedConfig()
|
||||
model_config.hf_text_config = PretrainedConfig()
|
||||
vllm_config.model_config = model_config
|
||||
with set_ascend_forward_context(None, vllm_config):
|
||||
result_q, result_k = self.layer.forward(
|
||||
@@ -213,7 +213,7 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
|
||||
model_config = ModelConfig(MODEL,
|
||||
tokenizer=MODEL,
|
||||
max_model_len=MAX_NUM_BATCHED_TOKEND)
|
||||
model_config.hf_config = PretrainedConfig()
|
||||
model_config.hf_text_config = PretrainedConfig()
|
||||
vllm_config.model_config = model_config
|
||||
with set_ascend_forward_context(None, vllm_config):
|
||||
result_q, result_k = self.layer.forward(self.positions, self.query,
|
||||
@@ -404,7 +404,7 @@ class TestAscendMRotaryEmbedding(unittest.TestCase):
|
||||
model_config = ModelConfig(MODEL_VL,
|
||||
tokenizer=MODEL_VL,
|
||||
max_model_len=MAX_NUM_BATCHED_TOKEND)
|
||||
model_config.hf_config = PretrainedConfig()
|
||||
model_config.hf_text_config = PretrainedConfig()
|
||||
vllm_config.model_config = model_config
|
||||
return vllm_config
|
||||
|
||||
|
||||
@@ -79,7 +79,7 @@ class TestAscendQuantConfig(TestBase):
|
||||
|
||||
def test_get_quant_method_for_linear(self):
|
||||
mock_config = MagicMock()
|
||||
mock_config.model_config.hf_config.model_type = None
|
||||
mock_config.model_config.hf_text_config.model_type = None
|
||||
linear_layer = MagicMock(spec=LinearBase)
|
||||
# Test skipped layer
|
||||
with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
|
||||
@@ -103,7 +103,7 @@ class TestAscendQuantConfig(TestBase):
|
||||
def test_get_quant_method_for_attention(self):
|
||||
attention_layer = MagicMock(spec=Attention)
|
||||
mock_config = MagicMock()
|
||||
mock_config.model_config.hf_config.model_type = None
|
||||
mock_config.model_config.hf_text_config.model_type = None
|
||||
with patch("vllm_ascend.quantization.quant_config.get_current_vllm_config", return_value=mock_config), \
|
||||
patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', \
|
||||
return_value=MagicMock()) as mock_ascend_kvcache:
|
||||
@@ -117,7 +117,7 @@ class TestAscendQuantConfig(TestBase):
|
||||
fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig)
|
||||
fused_moe_layer.moe_config = MagicMock(spec=FusedMoEConfig)
|
||||
mock_config = MagicMock()
|
||||
mock_config.model_config.hf_config.model_type = None
|
||||
mock_config.model_config.hf_text_config.model_type = None
|
||||
|
||||
# Test skipped layer
|
||||
with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=True), \
|
||||
|
||||
@@ -41,7 +41,7 @@ class TestMtpProposer:
|
||||
config.model_config.dtype = torch.float16
|
||||
config.model_config.max_model_len = 2048
|
||||
config.model_config.uses_mrope = False
|
||||
config.model_config.hf_config = None
|
||||
config.model_config.hf_text_config = None
|
||||
|
||||
config.load_config = None
|
||||
|
||||
|
||||
@@ -96,7 +96,7 @@ class AscendConfig:
|
||||
try:
|
||||
# only support Qwen model now
|
||||
# TODO: use a more robust method to get kv_head_num
|
||||
num_kv_head = vllm_config.model_config.hf_config.num_key_value_heads
|
||||
num_kv_head = vllm_config.model_config.hf_text_config.num_key_value_heads
|
||||
self.num_head_replica = prefill_tp_size // num_kv_head if prefill_tp_size >= num_kv_head else 1
|
||||
prefill_tp_size = min(prefill_tp_size, num_kv_head)
|
||||
decode_tp_size = min(decode_tp_size, num_kv_head)
|
||||
@@ -126,7 +126,7 @@ class AscendConfig:
|
||||
|
||||
self.enable_kv_nz = additional_config.get("enable_kv_nz", False)
|
||||
if self.enable_kv_nz:
|
||||
use_sparse = hasattr(vllm_config.model_config.hf_config,
|
||||
use_sparse = hasattr(vllm_config.model_config.hf_text_config,
|
||||
"index_topk")
|
||||
if not vllm_config.model_config.is_deepseek_mla or use_sparse:
|
||||
raise RuntimeError(
|
||||
|
||||
@@ -224,8 +224,8 @@ def select_moe_comm_method(num_tokens: int,
|
||||
mc2_tokens_capacity = get_mc2_tokens_capacity()
|
||||
soc_version = get_ascend_device_type()
|
||||
quant_type = getattr(
|
||||
vllm_config.model_config.hf_config, 'moe_quantize',
|
||||
getattr(vllm_config.model_config.hf_config, 'quantize', None))
|
||||
vllm_config.model_config.hf_text_config, 'moe_quantize',
|
||||
getattr(vllm_config.model_config.hf_text_config, 'quantize', None))
|
||||
|
||||
if not vllm_config.parallel_config.enable_expert_parallel or get_ep_group(
|
||||
).world_size == 1:
|
||||
|
||||
@@ -149,7 +149,7 @@ class AscendSFAMetadataBuilder(MLACommonMetadataBuilder[AscendSFAMetadata]):
|
||||
|
||||
self.rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
|
||||
self.enable_sfa_cp = enable_sp() and \
|
||||
hasattr(self.model_config.hf_config, "index_topk")
|
||||
hasattr(self.model_config.hf_text_config, "index_topk")
|
||||
|
||||
assert not (
|
||||
self.enable_sfa_cp
|
||||
@@ -963,7 +963,7 @@ class AscendSFAImpl(MLAAttentionImpl):
|
||||
# Dispose tensor from the original o_proj
|
||||
dispose_layer(self.o_proj)
|
||||
# Construct the new o_proj using ReplicatedLinear
|
||||
config = vllm_config.model_config.hf_config
|
||||
config = vllm_config.model_config.hf_text_config
|
||||
new_o_proj = ReplicatedLinear(config.num_attention_heads *
|
||||
config.v_head_dim,
|
||||
config.hidden_size,
|
||||
|
||||
@@ -96,7 +96,7 @@ class KVPoolWorker:
|
||||
|
||||
partitions = None
|
||||
if self.kv_role == "kv_consumer" and self.consumer_is_to_put:
|
||||
num_hidden_layers = model_config.hf_config.num_hidden_layers
|
||||
num_hidden_layers = model_config.hf_text_config.num_hidden_layers
|
||||
partition_list_str = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
|
||||
"prefill_pp_layer_partition", None)
|
||||
prefill_pp_size = int(
|
||||
|
||||
@@ -345,7 +345,7 @@ class KVCacheRecvingThread(threading.Thread):
|
||||
self.vllm_config = vllm_config
|
||||
self.model_config = self.vllm_config.model_config
|
||||
self.block_size = self.vllm_config.cache_config.block_size
|
||||
self.num_layers = self.model_config.hf_config.num_hidden_layers
|
||||
self.num_layers = self.model_config.hf_text_config.num_hidden_layers
|
||||
self.pp_layer_indices = {
|
||||
rank:
|
||||
get_prefill_pp_indices(self.num_layers, rank,
|
||||
|
||||
@@ -167,7 +167,7 @@ def init_ascend_model_parallel(parallel_config: ParallelConfig, ):
|
||||
|
||||
global _SHARED_WEIGHT
|
||||
# TODO: Check if the model is Deepseek V3.2 with enabled SFA CP and activated shared weights. It will then be normalized within the PCP parameters. -- clrs97
|
||||
is_ds_v32 = hasattr(vllm_config.model_config.hf_config, "index_topk")
|
||||
is_ds_v32 = hasattr(vllm_config.model_config.hf_text_config, "index_topk")
|
||||
if enable_sp() and is_ds_v32 and _SHARED_WEIGHT is None:
|
||||
_SHARED_WEIGHT = _create_shared_weight_group("CP_shared_weight")
|
||||
# TODO: Extract and unify the logic across different communication group.
|
||||
|
||||
@@ -69,7 +69,7 @@ def model_register(model, model_config):
|
||||
model.get_all_moe_loads = types.MethodType(get_all_moe_loads, model)
|
||||
model.clear_all_moe_loads = types.MethodType(clear_all_moe_loads, model)
|
||||
|
||||
config = model_config.hf_config
|
||||
config = model_config.hf_text_config
|
||||
|
||||
if config.model_type == "qwen3_moe":
|
||||
model.num_moe_layers = config.num_hidden_layers
|
||||
|
||||
@@ -697,7 +697,7 @@ def is_moe_layer(prefix: str) -> bool:
|
||||
def get_moe_params():
|
||||
from vllm.config import get_current_vllm_config
|
||||
vllm_config = get_current_vllm_config()
|
||||
config = vllm_config.model_config.hf_config
|
||||
config = vllm_config.model_config.hf_text_config
|
||||
n_routed_experts = getattr(config, 'n_routed_experts', 0)
|
||||
first_k_dense_replace = getattr(config, 'first_k_dense_replace',
|
||||
float('inf'))
|
||||
|
||||
@@ -91,7 +91,7 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
|
||||
self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
|
||||
self.v_head_dim = v_head_dim
|
||||
self.prefix = prefix
|
||||
hf_config = get_current_vllm_config().model_config.hf_config
|
||||
hf_config = get_current_vllm_config().model_config.hf_text_config
|
||||
self.enable_shared_expert_dp = get_ascend_config(
|
||||
).enable_shared_expert_dp
|
||||
self.tp_size = get_tensor_model_parallel_world_size()
|
||||
|
||||
@@ -247,6 +247,6 @@ def reach_layer_for_shared_weight_series(layer: LinearBase):
|
||||
|
||||
|
||||
def is_hidden_layer(vllm_config, layer: LinearBase) -> bool:
|
||||
num_hidden_layers = vllm_config.model_config.hf_config.num_hidden_layers
|
||||
num_hidden_layers = vllm_config.model_config.hf_text_config.num_hidden_layers
|
||||
layer_idx = extract_layer_index(layer.prefix)
|
||||
return layer_idx < num_hidden_layers
|
||||
|
||||
@@ -174,7 +174,8 @@ class NPUPlatform(Platform):
|
||||
) if not isinstance(ascend_compilation_config, dict)
|
||||
else ascend_compilation_config)
|
||||
|
||||
elif model_config and hasattr(model_config.hf_config, "index_topk"):
|
||||
elif model_config and hasattr(model_config.hf_text_config,
|
||||
"index_topk"):
|
||||
vllm_config.cache_config.cache_dtype = str(
|
||||
model_config.dtype).replace("torch.", "")
|
||||
if model_config is None:
|
||||
|
||||
@@ -116,7 +116,7 @@ class AscendQuantConfig(QuantizationConfig):
|
||||
def get_quant_method(self, layer: torch.nn.Module,
|
||||
prefix: str) -> Optional["QuantizeMethodBase"]:
|
||||
vllm_config = get_current_vllm_config()
|
||||
model_type = vllm_config.model_config.hf_config.model_type
|
||||
model_type = vllm_config.model_config.hf_text_config.model_type
|
||||
if model_type in packed_modules_model_mapping:
|
||||
self.packed_modules_mapping = packed_modules_model_mapping[
|
||||
model_type]
|
||||
|
||||
@@ -90,7 +90,7 @@ class EagleProposer(VllmEagleProposer):
|
||||
self.runner.max_num_tokens * self.pcp_size * self.dcp_size +
|
||||
self.pcp_size * self.dcp_size * self.runner.max_num_reqs)
|
||||
|
||||
self.use_sparse = hasattr(vllm_config.model_config.hf_config,
|
||||
self.use_sparse = hasattr(vllm_config.model_config.hf_text_config,
|
||||
"index_topk")
|
||||
|
||||
def _get_eagle3_use_aux_hidden_state_from_config(self) -> bool:
|
||||
|
||||
@@ -468,7 +468,7 @@ def update_default_aclgraph_sizes(vllm_config: VllmConfig) -> None:
|
||||
# on special shapes.
|
||||
# TODO(Angazenn): we will remove this once _npu_paged_attention is fully
|
||||
# replaced by npu_fused_infer_attention_score which does not contain such bugs.
|
||||
if vllm_config.model_config and vllm_config.model_config.hf_config.model_type == "qwen3_moe" \
|
||||
if vllm_config.model_config and vllm_config.model_config.hf_text_config.model_type == "qwen3_moe" \
|
||||
and vllm_config.parallel_config.tensor_parallel_size == 1 \
|
||||
and vllm_config.parallel_config.data_parallel_size > 1 :
|
||||
|
||||
@@ -503,7 +503,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
|
||||
)
|
||||
|
||||
return
|
||||
hf_config = vllm_config.model_config.hf_config
|
||||
hf_config = vllm_config.model_config.hf_text_config
|
||||
if hasattr(hf_config, 'num_hidden_layers'):
|
||||
num_hidden_layers = hf_config.num_hidden_layers
|
||||
else:
|
||||
@@ -826,7 +826,7 @@ def is_moe_model(vllm_config: VllmConfig):
|
||||
"""Checks if the model is a MoE model by config"""
|
||||
global _IS_MOE_MODEL
|
||||
if _IS_MOE_MODEL is None:
|
||||
model_configs = vllm_config.model_config.hf_config.to_dict()
|
||||
model_configs = vllm_config.model_config.hf_text_config.to_dict()
|
||||
_IS_MOE_MODEL = _is_contain_expert(model_configs)
|
||||
return _IS_MOE_MODEL
|
||||
|
||||
@@ -842,7 +842,7 @@ def speculative_enable_dispatch_gmm_combine_decode(
|
||||
if speculative_method in ["eagle", "eagle3"]:
|
||||
return False
|
||||
if speculative_method == "mtp":
|
||||
mtp_quant_type = getattr(vllm_config.model_config.hf_config,
|
||||
mtp_quant_type = getattr(vllm_config.model_config.hf_text_config,
|
||||
"mtp_quantize", None)
|
||||
return mtp_quant_type == "w8a8_dynamic"
|
||||
return False
|
||||
@@ -875,7 +875,7 @@ def has_rope(vllm_config: VllmConfig):
|
||||
"""Checks if the model uses rope."""
|
||||
global _HAS_ROPE
|
||||
if _HAS_ROPE is None and vllm_config and vllm_config.model_config:
|
||||
hf_config = vllm_config.model_config.hf_config.to_dict()
|
||||
hf_config = vllm_config.model_config.hf_text_config.to_dict()
|
||||
_HAS_ROPE = "rope_parameters" in hf_config
|
||||
return _HAS_ROPE
|
||||
|
||||
@@ -1091,7 +1091,7 @@ def refresh_block_size(vllm_config):
|
||||
return
|
||||
|
||||
# TODO(MengqingCao): Remove the model_type check, after resolving the hidden error in get_kv_cache_groups.
|
||||
if not model_config.hf_config.model_type == "qwen3_next" and cache_config.block_size != 128:
|
||||
if not model_config.hf_text_config.model_type == "qwen3_next" and cache_config.block_size != 128:
|
||||
if cache_config.enable_prefix_caching or scheduler_config.enable_chunked_prefill:
|
||||
logger.info(
|
||||
"Block size is set to 128 if prefix cache or chunked prefill is enabled."
|
||||
|
||||
@@ -253,7 +253,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
self.is_multimodal_model = self.model_config.is_multimodal_model
|
||||
self.block_size = vllm_config.cache_config.block_size
|
||||
# Set up Attention
|
||||
self.use_sparse = hasattr(self.vllm_config.model_config.hf_config,
|
||||
self.use_sparse = hasattr(self.vllm_config.model_config.hf_text_config,
|
||||
"index_topk")
|
||||
self.attn_backend = get_attn_backend(
|
||||
0,
|
||||
@@ -2398,7 +2398,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
kv_caches[layer_name] = kv_caches[target_layer_name]
|
||||
|
||||
from vllm.v1.worker.utils import bind_kv_cache
|
||||
num_attn_module = 2 if self.model_config.hf_config.model_type == "longcat_flash" else 1
|
||||
num_attn_module = 2 if self.model_config.hf_text_config.model_type == "longcat_flash" else 1
|
||||
bind_kv_cache(kv_caches,
|
||||
self.compilation_config.static_forward_context,
|
||||
self.kv_caches, num_attn_module)
|
||||
@@ -2932,7 +2932,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
mamba_layers = get_layers_from_vllm_config(self.vllm_config, MambaBase)
|
||||
if len(mamba_layers) > 0:
|
||||
if (self.vllm_config.speculative_config is not None
|
||||
and self.vllm_config.model_config.hf_config.model_type
|
||||
and self.vllm_config.model_config.hf_text_config.model_type
|
||||
not in ["qwen3_next"]):
|
||||
raise NotImplementedError(
|
||||
"Mamba with speculative decoding is not supported yet.")
|
||||
|
||||
@@ -173,7 +173,7 @@ class NPUWorker(WorkerBase):
|
||||
allocator = CaMemAllocator.get_instance()
|
||||
allocator.wake_up(tags=tags)
|
||||
|
||||
hidden_size = self.vllm_config.model_config.hf_config.hidden_size
|
||||
hidden_size = self.vllm_config.model_config.hf_text_config.hidden_size
|
||||
model = self.model_runner.model
|
||||
if tags is None or "weights" in tags:
|
||||
for name, param in model.named_parameters():
|
||||
|
||||
@@ -61,7 +61,7 @@ class LlamaXliteModel(XliteModel):
|
||||
xlite_model.embed = params_dict.get(model_prefix +
|
||||
"model.embed_tokens.weight")
|
||||
xlite_model.norm = params_dict.get(model_prefix + "model.norm.weight")
|
||||
if vllm_config.model_config.hf_config.tie_word_embeddings:
|
||||
if vllm_config.model_config.hf_text_config.tie_word_embeddings:
|
||||
xlite_model.head = xlite_model.embed
|
||||
else:
|
||||
xlite_model.head = params_dict.get(model_prefix + "lm_head.weight")
|
||||
@@ -118,7 +118,7 @@ class LlamaXliteModel(XliteModel):
|
||||
return (xlite_model, freq_cis, config.hidden_size, dtype)
|
||||
|
||||
def _build_model_config(self, vllm_config: VllmConfig) -> ModelConfig:
|
||||
hf_config = vllm_config.model_config.hf_config
|
||||
hf_config = vllm_config.model_config.hf_text_config
|
||||
if hasattr(hf_config, "text_config"):
|
||||
hf_config = hf_config.text_config
|
||||
config = ModelConfig()
|
||||
|
||||
Reference in New Issue
Block a user