From 0ad52517a18e65ef235c47ab1ee1c8e22611dc8e Mon Sep 17 00:00:00 2001
From: Hexiang Wang <56632993+whx-sjtu@users.noreply.github.com>
Date: Sat, 14 Mar 2026 00:05:54 +0800
Subject: [PATCH] Revert "Refactor quantization layer name mapping to leverage vLLM built-in mappers" (#7237)

Reverts vllm-project/vllm-ascend#7050, which breaks kimi-k2.5 and qwen-omni.

- vLLM version: v0.17.0
- vLLM main: https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d

---
 vllm_ascend/quantization/modelslim_config.py | 104 ++++++-------------
 1 file changed, 31 insertions(+), 73 deletions(-)

diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py
index eb9dddb1..c682856b 100644
--- a/vllm_ascend/quantization/modelslim_config.py
+++ b/vllm_ascend/quantization/modelslim_config.py
@@ -48,9 +48,22 @@ MODELSLIM_CONFIG_FILENAME = "quant_model_description.json"
 logger = init_logger(__name__)
 
 # key: model_type
-# value: vLLM prefix -> HF prefix mapping (used to convert vLLM layer names to HF format
-# for looking up keys in quant_model_description.json)
+# value: orig_to_new_prefix
 QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
+    "qwen3_vl_moe": {
+        "visual.": "model.visual.",
+        "language_model.lm_head.": "lm_head.",
+        "language_model.model.": "model.language_model.",
+    },
+    "qwen3_vl_text": {
+        "visual.": "model.visual.",
+        "language_model.lm_head.": "lm_head.",
+        "language_model.model.": "model.language_model.",
+    },
+    "kimi_k25": {
+        "mm_projector.linear_1": "mm_projector.proj.0",
+        "mm_projector.linear_2": "mm_projector.proj.2",
+    },
     "qwen3_omni_moe": {
         "language_model.lm_head.": "thinker.lm_head.",
         "language_model.model.": "thinker.model.",
@@ -63,8 +76,6 @@ QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
     },
     "qwen2_5_omni_text": {
         "language_model.": "thinker.",
-        "language_model.lm_head.": "thinker.lm_head.",
-        "language_model.model.": "thinker.model.",
     },
     "glm4v_moe": {
         "visual.": "model.visual.",
@@ -76,6 +87,16 @@ QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
         "language_model.lm_head.": "lm_head.",
         "language_model.model.": "model.language_model.",
     },
+    "qwen3_5": {
+        "visual.": "model.visual.",
+        "language_model.lm_head.": "lm_head.",
+        "language_model.model.": "model.language_model.",
+    },
+    "qwen3_5_moe": {
+        "visual.": "model.visual.",
+        "language_model.lm_head.": "lm_head.",
+        "language_model.model.": "model.language_model.",
+    },
 }
 
 # key: model_type
@@ -417,10 +438,6 @@ class AscendModelSlimConfig(QuantizationConfig):
                 new_k = k.replace("weight_packed", "weight")
                 extra_quant_dict[new_k] = self.quant_description[k]
             self.quant_description.update(extra_quant_dict)
-        # Initialize attributes for type checking
-        self.model_type: str | None = None
-        self.hf_to_vllm_mapper: WeightsMapper | None = None
-        self.vllm_to_hf_mapper: WeightsMapper | None = None
 
     def __repr__(self) -> str:
         return "AscendModelSlimConfig:\n" + super().__repr__()
@@ -458,74 +475,12 @@ class AscendModelSlimConfig(QuantizationConfig):
                 return ASCEND_QUANTIZATION_METHOD
         return None
 
-    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
-        """Apply the vLLM model-specific mapper to this quantization config.
-
-        This method is called by vLLM to apply the model-specific weight mapper
-        to the quantization configuration. It creates a reverse mapper to convert
-        vLLM prefixes back to HF format for looking up keys in quant_config.json.
-
-        Args:
-            hf_to_vllm_mapper: The WeightsMapper instance provided by vLLM
-                that contains model-specific prefix mappings (HF to vLLM).
-        """
-        # Check if we already have a valid vllm_to_hf_mapper for this hf_to_vllm_mapper
-        if hasattr(self, "hf_to_vllm_mapper") and self.hf_to_vllm_mapper is hf_to_vllm_mapper:
-            # Same mapper instance, no need to recreate
-            return
-
-        # Store the original mapper
-        self.hf_to_vllm_mapper = hf_to_vllm_mapper
-
-        # Check if manual mapping exists for this model type
-        # Manual mapping takes priority and is used exclusively to avoid conflicts
-        if hasattr(self, "model_type") and self.model_type in QUANT_MODEL_PREFIX_MAPPINGS:
-            manual_mapping = QUANT_MODEL_PREFIX_MAPPINGS[self.model_type]
-            # Manual mapping is already in vLLM -> HF direction, use directly
-            self.vllm_to_hf_mapper = WeightsMapper(orig_to_new_prefix=manual_mapping)
-            logger.debug(f"Using manual mapping for {self.model_type}: {manual_mapping}")
-            return
-
-        # No manual mapping, use hf_to_vllm_mapper and reverse it
-        # Try different ways to get the mapping based on WeightsMapper implementation
-        mapping_attrs = ["orig_to_new_prefix"]
-        orig_to_new_prefix = {}
-
-        for attr_name in mapping_attrs:
-            if hasattr(hf_to_vllm_mapper, attr_name):
-                orig_to_new_prefix = getattr(hf_to_vllm_mapper, attr_name)
-                break
-
-        # Create reverse mapping (vLLM -> HF), skipping empty values
-        vllm_to_hf_mapping = {}
-        for orig_prefix, new_prefix in orig_to_new_prefix.items():
-            # Skip empty values to avoid invalid keys in reverse mapping
-            if new_prefix:
-                vllm_to_hf_mapping[new_prefix] = orig_prefix
-
-        # Create and store the reverse WeightsMapper instance
-        if vllm_to_hf_mapping:
-            self.vllm_to_hf_mapper = WeightsMapper(orig_to_new_prefix=vllm_to_hf_mapping)
-            logger.debug(f"Created reverse mapping from hf_to_vllm_mapper: {vllm_to_hf_mapping}")
-        else:
-            logger.info("No valid reverse mapping found for WeightsMapper.")
-
     def quant_prefix_mapper(self, model_type: str, prefix: str) -> str:
-        # Store model_type for backward compatibility mappings
-        self.model_type = model_type
-
-        # Use the reverse mapper (vLLM to HF) if available
-        if hasattr(self, "vllm_to_hf_mapper") and self.vllm_to_hf_mapper:
-            return self.vllm_to_hf_mapper._map_name(prefix)
-
-        # Fall back to manual mapping for backward compatibility (simplified)
-        # This is only used if apply_vllm_mapper wasn't called or failed
+        # TODO (Levi-JQ): will be removed when QuantizationConfig.apply_vllm_mapper is implemented
         prefix_mapping = QUANT_MODEL_PREFIX_MAPPINGS.get(model_type)
         if prefix_mapping:
-            # Manual mapping is already in vLLM -> HF direction, use directly
-            mapper = WeightsMapper(orig_to_new_prefix=prefix_mapping)
-            return mapper._map_name(prefix)
-
+            hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix=prefix_mapping)
+            return hf_to_vllm_mapper._map_name(prefix)
         return prefix
 
     def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]:
@@ -556,6 +511,9 @@ class AscendModelSlimConfig(QuantizationConfig):
 
         from vllm.model_executor.layers.attention import Attention
 
+        if model_type != "kimi_k2":
+            if prefix.startswith("language_model"):
+                prefix = prefix.split(".", 1)[-1]
         if isinstance(layer, LinearBase):
             if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                 # Delayed import to avoid circular import
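
Reviewer note: below is a minimal, self-contained sketch of what the restored quant_prefix_mapper does with the manual QUANT_MODEL_PREFIX_MAPPINGS table above. It is not part of the patch; the simple startswith-based replacement stands in for vLLM's WeightsMapper._map_name, and the model_type and layer prefix used in the example are illustrative values, not taken from this change.

# Standalone illustration of the manual prefix-mapping path (assumption:
# plain longest-prefix string replacement approximates WeightsMapper._map_name).
QUANT_MODEL_PREFIX_MAPPINGS = {
    "qwen3_vl_moe": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
}


def quant_prefix_mapper(model_type: str, prefix: str) -> str:
    # Look up the per-model table; prefixes without an entry pass through unchanged.
    prefix_mapping = QUANT_MODEL_PREFIX_MAPPINGS.get(model_type)
    if prefix_mapping:
        for old, new in prefix_mapping.items():
            if prefix.startswith(old):
                # Rewrite the matching prefix, keep the rest of the layer name.
                return new + prefix[len(old):]
    return prefix


# Hypothetical layer name: the mapped result is the key expected in
# quant_model_description.json for this checkpoint layout.
print(quant_prefix_mapper("qwen3_vl_moe",
                          "language_model.model.layers.0.mlp.gate_proj"))
# -> model.language_model.layers.0.mlp.gate_proj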