xc-llm-ascend/vllm_ascend/quantization/utils.py
Cao Yi 5ec610e832 [Feature][Quant] Reapply auto-detect quantization format and support remote model ID (#7111)
### What this PR does / why we need it?
Reapply the auto-detect quantization format feature (originally in
#6645, reverted in #6873) and extend it to support remote model
identifiers (e.g., `org/model-name`).

Changes:
- Reapply auto-detection of quantization method from model files
(`quant_model_description.json` for ModelSlim, `config.json` for
compressed-tensors)
- Add `get_model_file()` utility to handle file retrieval from both
local paths and remote repos (HuggingFace Hub / ModelScope)
- Update `detect_quantization_method()` to accept remote repo IDs with
optional `revision` parameter
- Update `maybe_update_config()` to work with remote model identifiers
- Add platform-level `auto_detect_quantization` support
- Add unit tests and e2e tests for both local and remote model ID
scenarios

Closes #6836

### Does this PR introduce _any_ user-facing change?

Yes. When `--quantization` is not explicitly specified, vllm-ascend will
now automatically detect the quantization format from the model files
for both local directories and remote model IDs.
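For illustration, a minimal sketch of what the detection returns (the paths and the `org/model-name` repo id below are placeholders, not real models):

```python
from vllm_ascend.quantization.utils import detect_quantization_method

# Local ModelSlim checkpoint (contains quant_model_description.json) -> "ascend"
detect_quantization_method("/path/to/modelslim-quantized-model")

# Remote repo id; config files are fetched via HuggingFace Hub / ModelScope
detect_quantization_method("org/model-name", revision="main")

# No quantization signature in the model files -> None (float fallback)
detect_quantization_method("/path/to/float-model")
```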

- vLLM version: v0.16.0
- vLLM main: 4034c3d32e
---------
Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2026-03-13 22:53:25 +08:00



#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import json
from pathlib import Path

from vllm import envs
from vllm.logger import init_logger

from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD

logger = init_logger(__name__)


def get_model_file(
    model: str | Path,
    filename: str,
    revision: str | None = None,
) -> Path | None:
    """Get a file from a local model directory or download it from a remote repo.

    This function handles both local paths and remote repository IDs,
    automatically downloading files from HuggingFace Hub or ModelScope
    if they are not already cached.

    Args:
        model: Local directory path or HuggingFace/ModelScope repo id.
        filename: Name of the file to retrieve (e.g., "config.json").
        revision: Optional revision (branch, tag, or commit hash) for remote repos.

    Returns:
        Path to the file if found, None otherwise.
    """
    # Check if it's a local path.
    model_path = Path(model) if isinstance(model, str) else model
    if model_path.exists():
        file_path = model_path / filename
        return file_path if file_path.exists() else None

    # Remote repo: try to download from HF Hub or ModelScope.
    try:
        if envs.VLLM_USE_MODELSCOPE:
            from modelscope.hub.file_download import model_file_download  # type: ignore[import-untyped]
            downloaded_path = model_file_download(
                model_id=str(model),
                file_path=filename,
                revision=revision,
            )
            return Path(downloaded_path)
        else:
            from huggingface_hub import hf_hub_download
            downloaded_path = hf_hub_download(
                repo_id=str(model),
                filename=filename,
                revision=revision,
            )
            return Path(downloaded_path)
    except Exception as e:
        logger.debug(f"Could not download {filename} from {model}: {e}")
        return None
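
# Illustrative usage sketch (not part of the original module; the repo id
# "org/model-name" is a placeholder):
#
#   local_cfg = get_model_file("/path/to/model", "config.json")
#   remote_cfg = get_model_file("org/model-name", "config.json", revision="main")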


def detect_quantization_method(model: str, revision: str | None = None) -> str | None:
    """Auto-detect the quantization method from model files.

    This function performs a lightweight check (JSON files only — no
    .safetensors or .bin inspection) to determine which quantization
    method was used to produce the weights in *model*.

    Works with both local directories (``/path/to/model``) and remote
    repository identifiers (``org/model-name``). For remote repos the
    lookup goes through the HuggingFace / ModelScope cache, downloading
    config files if they are not already cached.

    Detection priority:

    1. **ModelSlim (Ascend)**: ``quant_model_description.json`` exists.
    2. **LLM-Compressor (compressed-tensors)**: ``config.json`` contains
       a ``quantization_config`` section with
       ``"quant_method": "compressed-tensors"``.
    3. **None**: neither condition is met; the caller should fall back to
       the default (float) behaviour.

    Args:
        model: Local directory path **or** HuggingFace / ModelScope repo id.
        revision: Optional model revision (branch, tag, or commit id).

    Returns:
        ``"ascend"`` for ModelSlim models,
        ``"compressed-tensors"`` for LLM-Compressor models,
        or ``None`` if no quantization signature is found.
    """
    from vllm_ascend.quantization.modelslim_config import MODELSLIM_CONFIG_FILENAME

    # Case 1: ModelSlim — look for quant_model_description.json.
    modelslim_path = get_model_file(model, MODELSLIM_CONFIG_FILENAME, revision=revision)
    if modelslim_path is not None:
        return ASCEND_QUANTIZATION_METHOD

    # Case 2: LLM-Compressor — look for compressed-tensors in config.json.
    config_path = get_model_file(model, "config.json", revision=revision)
    if config_path is not None:
        try:
            with open(config_path) as f:
                config = json.load(f)
            quant_cfg = config.get("quantization_config")
            if isinstance(quant_cfg, dict):
                quant_method = quant_cfg.get("quant_method", "")
                if quant_method == COMPRESSED_TENSORS_METHOD:
                    return COMPRESSED_TENSORS_METHOD
        except (json.JSONDecodeError, OSError):
            pass

    # Case 3: no quantization signature found.
    return None
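
# For reference (illustrative only; field values vary per model), the
# config.json fragment that triggers compressed-tensors detection looks like:
#
#   "quantization_config": {
#       "quant_method": "compressed-tensors",
#       ...
#   }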


def maybe_auto_detect_quantization(vllm_config) -> None:
    """Auto-detect and apply the quantization method on *vllm_config*.

    This should be called during engine initialisation (from
    ``NPUPlatform.check_and_update_config``) **after** ``VllmConfig`` has been
    created but **before** heavy weights are loaded.

    Because ``check_and_update_config`` runs *after*
    ``VllmConfig.__post_init__`` has already evaluated
    ``_get_quantization_config`` (which returned ``None`` when
    ``model_config.quantization`` was not set), we must:

    1. Set ``model_config.quantization`` to the detected value.
    2. Recreate ``vllm_config.quant_config`` so that the quantization
       pipeline (``get_quant_config`` → ``QuantizationConfig`` →
       ``get_quant_method`` for every layer) is properly initialised.

    Rules:

    * If the user explicitly set ``--quantization``, that value is
      respected. A warning is emitted when the detected method differs.
    * If no ``--quantization`` was given, the detected method (if any) is
      applied automatically.

    Args:
        vllm_config: A ``vllm.config.VllmConfig`` instance (mutable).
    """
    model_config = vllm_config.model_config
    model = model_config.model
    revision = model_config.revision
    user_quant = model_config.quantization

    detected = detect_quantization_method(model, revision=revision)
    if detected is None:
        # No quantization signature found — nothing to do.
        return

    if user_quant is not None:
        # User explicitly specified a quantization method.
        if user_quant != detected:
            logger.warning(
                "Auto-detected quantization method '%s' from model "
                "files for '%s', but user explicitly specified "
                "'--quantization %s'. Respecting the user-specified "
                "value. If you encounter errors during model loading, "
                "consider using '--quantization %s' instead.",
                detected,
                model,
                user_quant,
                detected,
            )
        return

    # No user-specified quantization — apply the auto-detected value.
    model_config.quantization = detected
    logger.info(
        "Auto-detected quantization method '%s' from model files "
        "for '%s'. To override, pass '--quantization <method>' explicitly.",
        detected,
        model,
    )

    # Recreate quant_config on VllmConfig. The original __post_init__
    # already ran _get_quantization_config(), but at that point
    # model_config.quantization was None, so it returned None. Now that
    # we've set it, we need to build the actual QuantizationConfig so the
    # downstream model-loading code can use it.
    from vllm.config import VllmConfig as _VllmConfig
    vllm_config.quant_config = _VllmConfig._get_quantization_config(
        model_config, vllm_config.load_config)
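
# Typical call site, per the docstring of maybe_auto_detect_quantization
# (sketch only; the surrounding platform code is elided):
#
#   class NPUPlatform(...):
#       @classmethod
#       def check_and_update_config(cls, vllm_config) -> None:
#           ...
#           maybe_auto_detect_quantization(vllm_config)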