fix: pass lm_head to LogitsProcessor instead of calling forward()

In vLLM v0.6.2, ParallelLMHead.forward() raises RuntimeError since its weights should be used through LogitsProcessor.linear_method.apply(). Pass lm_head as first arg to LogitsProcessor which handles the hidden_states -> logits projection internally.
testing dynamic register
2026-02-06 14:21:14 +08:00 · 2026-02-06 14:17:06 +08:00 · 2026-02-06 14:04:04 +08:00 · 2026-02-06 13:51:02 +08:00 · 2026-02-06 13:39:13 +08:00 · 2026-02-05 18:57:04 +08:00
15 changed files with 2079 additions and 29 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,240 @@
 # version file generated by setuptools-scm
 /vllm/_version.py
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
 # OpenAI triton kernels copied from source
 vllm/third_party/triton_kernels/*
 # FlashMLA interface copied from source
 vllm/third_party/flashmla/flash_mla_interface.py
 # triton jit
 .triton
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 cmake-build-*/
 CMakeUserPresets.json
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 /.deps/
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # PyBuilder
 .pybuilder/
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # generated files
 **/generated/**
 # uv
 uv.lock
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 #poetry.lock
 # pdm
 #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
 #pdm.lock
 #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
 #   in version control.
 #   https://pdm.fming.dev/#use-with-ide
 .pdm.toml
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 docs/argparse
 docs/examples/*
 !docs/examples/README.md
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # pytype static type analyzer
 .pytype/
 # Cython debug symbols
 cython_debug/
 # PyCharm
 #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
 # VSCode
 .vscode/
 # Claude
 CLAUDE.md
 .claude/
 # Codex
 AGENTS.md
 .codex/
 # Cursor
 .cursor/
 # DS Store
 .DS_Store
 # Results
 *.csv
 # Python pickle files
 *.pkl
 # Sphinx documentation
 _build/
 # vim swap files
 *.swo
 *.swp
 # hip files generated by PyTorch
 *.hip
 *_hip*
 hip_compat.h
 # Benchmark dataset
 benchmarks/**/*.json
 # Linting
 actionlint
 shellcheck*/
 # Ignore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*
 # Ignore ep_kernels_workspace folder
 ep_kernels_workspace/
 # Allow tracked library source folders under submodules (e.g., benchmarks/lib)
 !vllm/benchmarks/lib/
 # Generated gRPC protobuf files (compiled at build time from vllm_engine.proto)
 vllm/grpc/vllm_engine_pb2.py
 vllm/grpc/vllm_engine_pb2_grpc.py
 vllm/grpc/vllm_engine_pb2.pyi
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@
 # 寒武纪 mlu370 文本生成
 该模型测试框架在寒武纪 mlu370（X8/X4）加速卡上，基于 vLLM 推理引擎，适配了 Qwen1.5-1.8B-Chat 模型。
 * Qwen1.5-1.8B-Chat 是通义千问系列中一款约18亿参数、轻量级的中英文对话大模型，专为高效推理和多场景聊天交互设计。
 * Llama-2-7b-chat-hf：Meta 发布的 LLaMA 2 系列中 70 亿参数的对话优化版开源大模型，适合多轮聊天与通用任务。
 * ChatGLM3-6B：智谱 AI 推出的第 3 代 ChatGLM 系列中 60 亿参数的中英双语对话大模型，支持推理、代码和多任务能力。
--- a/vllm-v0.6.2/vllm/config.py
+++ b/vllm-v0.6.2/vllm/config.py
@@ -274,7 +274,13 @@ class ModelConfig:
        self, limit_mm_per_prompt: Optional[Mapping[str, int]]
    ) -> Optional["MultiModalConfig"]:
        architectures = getattr(self.hf_config, "architectures", [])
-        if ModelRegistry.is_multimodal_model(architectures):
+        if ModelRegistry.is_multimodal_model(
            architectures,
            model_path=self.model,
            revision=self.revision,
            trust_remote_code=self.trust_remote_code,
            hf_config=self.hf_config,
        ):
            return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})
        if limit_mm_per_prompt:
@@ -308,11 +314,23 @@ class ModelConfig:
    def _init_attention_free(self) -> bool:
        architectures = getattr(self.hf_config, "architectures", [])
-        return ModelRegistry.is_attention_free_model(architectures)
+        return ModelRegistry.is_attention_free_model(
            architectures,
            model_path=self.model,
            revision=self.revision,
            trust_remote_code=self.trust_remote_code,
            hf_config=self.hf_config,
        )
    def _init_has_inner_state(self) -> bool:
        architectures = getattr(self.hf_config, "architectures", [])
-        return ModelRegistry.model_has_inner_state(architectures)
+        return ModelRegistry.model_has_inner_state(
            architectures,
            model_path=self.model,
            revision=self.revision,
            trust_remote_code=self.trust_remote_code,
            hf_config=self.hf_config,
        )
    def _verify_tokenizer_mode(self) -> None:
        tokenizer_mode = self.tokenizer_mode.lower()
@@ -335,8 +353,20 @@ class ModelConfig:
        task_support: Dict[_Task, bool] = {
            # NOTE: Listed from highest to lowest priority,
            # in case the model supports multiple of them
-            "generate": ModelRegistry.is_text_generation_model(architectures),
+            "generate": ModelRegistry.is_text_generation_model(
-            "embedding": ModelRegistry.is_embedding_model(architectures),
+                architectures,
                model_path=self.model,
                revision=self.revision,
                trust_remote_code=self.trust_remote_code,
                hf_config=hf_config,
            ),
            "embedding": ModelRegistry.is_embedding_model(
                architectures,
                model_path=self.model,
                revision=self.revision,
                trust_remote_code=self.trust_remote_code,
                hf_config=hf_config,
            ),
        }
        supported_tasks_lst: List[_Task] = [
            task for task, is_supported in task_support.items() if is_supported
--- a/vllm-v0.6.2/vllm/model_executor/layers/linear.py
+++ b/vllm-v0.6.2/vllm/model_executor/layers/linear.py
@@ -146,6 +146,7 @@ class LinearBase(torch.nn.Module):
        skip_bias_add: If true, skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
        return_bias: If False, return only output tensor instead of (output, bias) tuple.
    """
    def __init__(
@@ -156,6 +157,7 @@ class LinearBase(torch.nn.Module):
        params_dtype: Optional[torch.dtype] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
        return_bias: bool = True,
    ):
        super().__init__()
@@ -163,6 +165,7 @@ class LinearBase(torch.nn.Module):
        self.input_size = input_size
        self.output_size = output_size
        self.skip_bias_add = skip_bias_add
        self.return_bias = return_bias
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.params_dtype = params_dtype
@@ -198,13 +201,15 @@ class ReplicatedLinear(LinearBase):
                 skip_bias_add: bool = False,
                 params_dtype: Optional[torch.dtype] = None,
                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+                 prefix: str = "",
                 return_bias: bool = True):
        super().__init__(input_size,
                         output_size,
                         skip_bias_add,
                         params_dtype,
                         quant_config,
-                         prefix=prefix)
+                         prefix=prefix,
                         return_bias=return_bias)
        # All the linear layer supports quant method.
        assert self.quant_method is not None
@@ -238,6 +243,9 @@ class ReplicatedLinear(LinearBase):
        bias = self.bias if not self.skip_bias_add else None
        assert self.quant_method is not None
        output = self.quant_method.apply(self, x, bias)
        if not self.return_bias:
            return output
        output_bias = self.bias if self.skip_bias_add else None
        return output, output_bias
@@ -281,9 +289,10 @@ class ColumnParallelLinear(LinearBase):
                 params_dtype: Optional[torch.dtype] = None,
                 quant_config: Optional[QuantizationConfig] = None,
                 output_sizes: Optional[List[int]] = None,
-                 prefix: str = ""):
+                 prefix: str = "",
                 return_bias: bool = True):
        super().__init__(input_size, output_size, skip_bias_add, params_dtype,
-                         quant_config, prefix)
+                         quant_config, prefix, return_bias=return_bias)
        self.gather_output = gather_output
@@ -375,6 +384,9 @@ class ColumnParallelLinear(LinearBase):
            output = tensor_model_parallel_all_gather(output_parallel)
        else:
            output = output_parallel
        if not self.return_bias:
            return output
        output_bias = self.bias if self.skip_bias_add else None
        return output, output_bias
@@ -418,7 +430,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                 skip_bias_add: bool = False,
                 params_dtype: Optional[torch.dtype] = None,
                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+                 prefix: str = "",
                 return_bias: bool = True):
        self.output_sizes = output_sizes
        tp_size = get_tensor_model_parallel_world_size()
        assert all(output_size % tp_size == 0 for output_size in output_sizes)
@@ -429,7 +442,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                         skip_bias_add=skip_bias_add,
                         params_dtype=params_dtype,
                         quant_config=quant_config,
-                         prefix=prefix)
+                         prefix=prefix,
                         return_bias=return_bias)
    def weight_loader(self,
                      param: Parameter,
@@ -653,7 +667,8 @@ class QKVParallelLinear(ColumnParallelLinear):
                 skip_bias_add: bool = False,
                 params_dtype: Optional[torch.dtype] = None,
                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+                 prefix: str = "",
                 return_bias: bool = True):
        self.hidden_size = hidden_size
        self.head_size = head_size
        self.total_num_heads = total_num_heads
@@ -686,7 +701,8 @@ class QKVParallelLinear(ColumnParallelLinear):
                         skip_bias_add=skip_bias_add,
                         params_dtype=params_dtype,
                         quant_config=quant_config,
-                         prefix=prefix)
+                         prefix=prefix,
                         return_bias=return_bias)
    def _get_shard_offset_mapping(self, loaded_shard_id: str):
        shard_offset_mapping = {
@@ -980,9 +996,10 @@ class RowParallelLinear(LinearBase):
                 params_dtype: Optional[torch.dtype] = None,
                 reduce_results: bool = True,
                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+                 prefix: str = "",
                 return_bias: bool = True):
        super().__init__(input_size, output_size, skip_bias_add, params_dtype,
-                         quant_config, prefix)
+                         quant_config, prefix, return_bias=return_bias)
        self.input_is_parallel = input_is_parallel
        self.reduce_results = reduce_results
@@ -1086,8 +1103,9 @@ class RowParallelLinear(LinearBase):
        else:
            output = output_parallel
        if not self.return_bias:
            return output
        output_bias = self.bias if self.skip_bias_add else None
        return output, output_bias
    def extra_repr(self) -> str:
--- a/vllm-v0.6.2/vllm/model_executor/model_loader/utils.py
+++ b/vllm-v0.6.2/vllm/model_executor/model_loader/utils.py
@@ -32,7 +32,13 @@ def get_model_architecture(
            and "MixtralForCausalLM" in architectures):
        architectures = ["QuantMixtralForCausalLM"]
-    return ModelRegistry.resolve_model_cls(architectures)
+    return ModelRegistry.resolve_model_cls(
        architectures,
        model_path=model_config.model,
        revision=model_config.revision,
        trust_remote_code=model_config.trust_remote_code,
        hf_config=model_config.hf_config,
    )
 def get_architecture_class_name(model_config: ModelConfig) -> str:
--- a/vllm-v0.6.2/vllm/model_executor/models/registry.py
+++ b/vllm-v0.6.2/vllm/model_executor/models/registry.py
@@ -16,9 +16,11 @@ from typing import (AbstractSet, Callable, Dict, List, Optional, Tuple, Type,
 import cloudpickle
 import torch.nn as nn
 import transformers
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module
 from .interfaces import (has_inner_state, is_attention_free,
                         supports_multimodal, supports_pp)
@@ -157,6 +159,13 @@ _SPECULATIVE_DECODING_MODELS = {
    "MedusaModel": ("medusa", "Medusa"),
    "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
 }
 # Transformers backend models - wrapper classes for custom HuggingFace models
 # These provide the vLLM interface for models loaded via auto_map
 _TRANSFORMERS_BACKEND_MODELS = {
    # Text generation models
    "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
 }
 # yapf: enable
 _VLLM_MODELS = {
@@ -164,6 +173,7 @@ _VLLM_MODELS = {
    **_EMBEDDING_MODELS,
    **_MULTIMODAL_MODELS,
    **_SPECULATIVE_DECODING_MODELS,
    **_TRANSFORMERS_BACKEND_MODELS,
 }
 # Models not supported by ROCm.
@@ -369,6 +379,91 @@ class _ModelRegistry:
        return _try_inspect_model_cls(model_arch, self.models[model_arch])
    def _try_resolve_transformers(
        self,
        architecture: str,
        model_path: str,
        revision: Optional[str],
        trust_remote_code: bool,
        hf_config: Optional[object] = None,
    ) -> Optional[str]:
        """
        Map an architecture name onto a Transformers-backend wrapper name.

        Resolution order:
          1. A name already present in ``_TRANSFORMERS_BACKEND_MODELS`` is
             returned unchanged.
          2. A name that exists as an attribute of the ``transformers``
             package resolves to ``"TransformersForCausalLM"``.
          3. Otherwise ``hf_config.auto_map`` is consulted: every
             ``AutoConfig*`` entry is loaded first, then the ``AutoModel*``
             entries are tried in priority order (``ForCausalLM`` first,
             ``ForSeq2Seq`` second, anything else last). The first entry
             whose class loads resolves to ``"TransformersForCausalLM"``.

        Args:
            architecture: Architecture name to resolve.
            model_path: Model location forwarded to the dynamic-module loader.
            revision: Optional revision forwarded to the dynamic-module loader.
            trust_remote_code: Forwarded to the dynamic-module loader.
            hf_config: Config object whose ``auto_map`` attribute (if any)
                lists the custom classes.

        Returns:
            The wrapper architecture name, or ``None`` when nothing matched.
        """
        # Already one of the wrapper architectures: nothing to translate.
        if architecture in _TRANSFORMERS_BACKEND_MODELS:
            return architecture
        # Architectures exposed by the transformers package use the wrapper.
        if getattr(transformers, architecture, None) is not None:
            logger.info(
                "Architecture %s found in transformers library, "
                "using TransformersForCausalLM wrapper",
                architecture
            )
            return "TransformersForCausalLM"
        # Without a (non-empty) auto_map there is nothing left to try.
        auto_map = None
        if hf_config is not None:
            auto_map = getattr(hf_config, "auto_map", None)
        if not auto_map:
            return None
        # Load the config class(es) before touching any model class.
        for entry_name, class_ref in auto_map.items():
            if not entry_name.startswith("AutoConfig"):
                continue
            try_get_class_from_dynamic_module(
                class_ref,
                model_path,
                trust_remote_code=trust_remote_code,
                revision=revision,
                warn_on_fail=False,
            )

        def _priority(entry_name: str) -> int:
            # Lower value is tried earlier.
            if "ForCausalLM" in entry_name:
                return 0
            if "ForSeq2Seq" in entry_name:
                return 1
            return 2

        candidates = [
            entry_name for entry_name in auto_map
            if entry_name.startswith("AutoModel")
        ]
        # list.sort is stable, so equal-priority entries keep auto_map order.
        candidates.sort(key=_priority)
        for entry_name in candidates:
            resolved_cls = try_get_class_from_dynamic_module(
                auto_map[entry_name],
                model_path,
                trust_remote_code=trust_remote_code,
                revision=revision,
                warn_on_fail=True,
            )
            if resolved_cls is None:
                continue
            # Report each (class, auto_map key) pair only once per registry.
            seen_key = f"{resolved_cls.__name__}_{entry_name}"
            if not hasattr(self, '_logged_custom_models'):
                self._logged_custom_models = set()
            already_logged = self._logged_custom_models
            if seen_key not in already_logged:
                logger.info(
                    "Found custom model class %s from auto_map[%s], "
                    "using TransformersForCausalLM wrapper",
                    resolved_cls.__name__,
                    entry_name
                )
                already_logged.add(seen_key)
            # The caller gets the wrapper architecture, not the loaded class.
            return "TransformersForCausalLM"
        return None
    def _normalize_archs(
        self,
        architectures: Union[str, List[str]],
@@ -383,6 +478,10 @@ class _ModelRegistry:
    def inspect_model_cls(
        self,
        architectures: Union[str, List[str]],
        model_path: Optional[str] = None,
        revision: Optional[str] = None,
        trust_remote_code: bool = False,
        hf_config: Optional[object] = None,
    ) -> _ModelInfo:
        architectures = self._normalize_archs(architectures)
@@ -391,11 +490,27 @@ class _ModelRegistry:
            if model_info is not None:
                return model_info
        # Fallback: try to resolve using transformers backend (auto_map)
        if model_path and trust_remote_code and hf_config:
            for arch in architectures:
                wrapper_arch = self._try_resolve_transformers(
                    arch, model_path, revision, trust_remote_code, hf_config
                )
                if wrapper_arch is not None:
                    # Use the wrapper architecture's ModelInfo
                    model_info = self._try_inspect_model_cls(wrapper_arch)
                    if model_info is not None:
                        return model_info
        return self._raise_for_unsupported(architectures)
    def resolve_model_cls(
        self,
        architectures: Union[str, List[str]],
        model_path: Optional[str] = None,
        revision: Optional[str] = None,
        trust_remote_code: bool = False,
        hf_config: Optional[object] = None,
    ) -> Tuple[Type[nn.Module], str]:
        architectures = self._normalize_archs(architectures)
@@ -404,39 +519,91 @@ class _ModelRegistry:
            if model_cls is not None:
                return (model_cls, arch)
        # Fallback: try to resolve using transformers backend (auto_map)
        if model_path and trust_remote_code and hf_config:
            for arch in architectures:
                wrapper_arch = self._try_resolve_transformers(
                    arch, model_path, revision, trust_remote_code, hf_config
                )
                if wrapper_arch is not None:
                    model_cls = self._try_load_model_cls(wrapper_arch)
                    if model_cls is not None:
                        # Return wrapper class but keep original architecture name
                        return (model_cls, arch)
        return self._raise_for_unsupported(architectures)
    def is_text_generation_model(
        self,
        architectures: Union[str, List[str]],
        model_path: Optional[str] = None,
        revision: Optional[str] = None,
        trust_remote_code: bool = False,
        hf_config: Optional[object] = None,
    ) -> bool:
-        return self.inspect_model_cls(architectures).is_text_generation_model
+        return self.inspect_model_cls(
            architectures, model_path, revision, trust_remote_code, hf_config
        ).is_text_generation_model
    def is_embedding_model(
        self,
        architectures: Union[str, List[str]],
        model_path: Optional[str] = None,
        revision: Optional[str] = None,
        trust_remote_code: bool = False,
        hf_config: Optional[object] = None,
    ) -> bool:
-        return self.inspect_model_cls(architectures).is_embedding_model
+        return self.inspect_model_cls(
            architectures, model_path, revision, trust_remote_code, hf_config
        ).is_embedding_model
    def is_multimodal_model(
        self,
        architectures: Union[str, List[str]],
        model_path: Optional[str] = None,
        revision: Optional[str] = None,
        trust_remote_code: bool = False,
        hf_config: Optional[object] = None,
    ) -> bool:
-        return self.inspect_model_cls(architectures).supports_multimodal
+        return self.inspect_model_cls(
            architectures, model_path, revision, trust_remote_code, hf_config
        ).supports_multimodal
    def is_pp_supported_model(
        self,
        architectures: Union[str, List[str]],
        model_path: Optional[str] = None,
        revision: Optional[str] = None,
        trust_remote_code: bool = False,
        hf_config: Optional[object] = None,
    ) -> bool:
-        return self.inspect_model_cls(architectures).supports_pp
+        return self.inspect_model_cls(
            architectures, model_path, revision, trust_remote_code, hf_config
        ).supports_pp
-    def model_has_inner_state(self, architectures: Union[str,
+    def model_has_inner_state(
-                                                         List[str]]) -> bool:
+        self,
-        return self.inspect_model_cls(architectures).has_inner_state
+        architectures: Union[str, List[str]],
        model_path: Optional[str] = None,
        revision: Optional[str] = None,
        trust_remote_code: bool = False,
        hf_config: Optional[object] = None,
    ) -> bool:
        return self.inspect_model_cls(
            architectures, model_path, revision, trust_remote_code, hf_config
        ).has_inner_state
-    def is_attention_free_model(self, architectures: Union[str,
+    def is_attention_free_model(
-                                                           List[str]]) -> bool:
+        self,
-        return self.inspect_model_cls(architectures).is_attention_free
+        architectures: Union[str, List[str]],
        model_path: Optional[str] = None,
        revision: Optional[str] = None,
        trust_remote_code: bool = False,
        hf_config: Optional[object] = None,
    ) -> bool:
        return self.inspect_model_cls(
            architectures, model_path, revision, trust_remote_code, hf_config
        ).is_attention_free
 ModelRegistry = _ModelRegistry({
--- a/vllm-v0.6.2/vllm/model_executor/models/transformers/__init__.py
+++ b/vllm-v0.6.2/vllm/model_executor/models/transformers/__init__.py
@@ -0,0 +1,127 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2024 The vLLM team.
 """Wrapper around `transformers` models for vLLM v0.6.2.
 This module provides the Transformers modeling backend that wraps
 any HuggingFace model with the vLLM interface, enabling support for custom
 models that define their implementation via `auto_map` in config.json.
 Architecture (following latest vLLM patterns):
 - Base: Core functionality (meta init, PP/TP support, module replacement, attention, weight loading)
 - CausalMixin: Causal LM specific (lm_head, compute_logits, sample)
 - EmbeddingMixin: Embedding/pooling specific (pooler, pooling)
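 - SequenceClassificationMixin: Classification specific (classifier, pooling)
 - LegacyMixin: Legacy/encoder specific (BERT/RoBERTa weight mapping, position handling)
 Composed model classes:
 - TransformersForCausalLM = CausalMixin + Base
 - TransformersForEmbedding = EmbeddingMixin + Base
 - TransformersForSequenceClassification = SequenceClassificationMixin + Base
 - TransformersForLegacy = LegacyMixin + EmbeddingMixin + Base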
 Key optimizations:
 - Meta device initialization for memory efficiency
 - Pipeline Parallel support (PPMissingLayer)
 - Tensor Parallel support (tp_plan based module replacement)
 - Module replacement (Linear, RMSNorm, Embedding) with vLLM optimized versions
 - vLLM Attention instances for proper KV cache allocation
 - AutoWeightsLoader for efficient weight loading with name mapping
 """
 from vllm.model_executor.models.transformers.base import (
    Base,
    set_attention_context,
    clear_attention_context,
    get_attention_context,
    vllm_flash_attention_forward,
 )
 from vllm.model_executor.models.transformers.causal import CausalMixin
 from vllm.model_executor.models.transformers.pooling import (
    EmbeddingMixin,
    SequenceClassificationMixin,
 )
 from vllm.model_executor.models.transformers.legacy import LegacyMixin
 from vllm.model_executor.models.transformers.utils import (
    init_on_device_without_buffers,
    replace_linear_class,
    replace_rms_norm_class,
    log_replacement,
    maybe_prefix,
 )
 # ============================================================================
 # Composed Model Classes (Mixin + Base pattern)
 # ============================================================================
 class TransformersForCausalLM(CausalMixin, Base):
    """Causal language model wrapper for the Transformers backend.

    The class adds no members of its own; it only composes:
    - CausalMixin: lm_head, compute_logits, sample
    - Base: meta init, PP/TP support, module replacement, attention,
      weight loading

    Intended for HuggingFace models that declare auto_map in config.json.
    """
 class TransformersForEmbedding(EmbeddingMixin, Base):
    """Embedding / sentence-similarity wrapper for the Transformers backend.

    The class adds no members of its own; it only composes:
    - EmbeddingMixin: pooler, pooling
    - Base: meta init, PP/TP support, module replacement, attention,
      weight loading

    Intended for embedding models such as BERT and sentence-transformers.
    """
 class TransformersForSequenceClassification(SequenceClassificationMixin, Base):
    """Sequence-classification wrapper for the Transformers backend.

    The class adds no members of its own; it only composes:
    - SequenceClassificationMixin: classifier, pooling
    - Base: meta init, PP/TP support, module replacement, attention,
      weight loading

    Intended for cross-encoders and classification models.
    """
 class TransformersForLegacy(LegacyMixin, EmbeddingMixin, Base):
    """Legacy / encoder model wrapper for the Transformers backend.

    The class adds no members of its own; it only composes:
    - LegacyMixin: BERT/RoBERTa weight mapping, position handling
    - EmbeddingMixin: pooler
    - Base: core functionality

    Intended for BERT, RoBERTa, and similar encoder models.
    """
 __all__ = [
    # Main wrapper classes
    "TransformersForCausalLM",
    "TransformersForEmbedding",
    "TransformersForSequenceClassification",
    "TransformersForLegacy",
    # Base class for extension
    "Base",
    # Mixin classes for custom combinations
    "CausalMixin",
    "EmbeddingMixin",
    "SequenceClassificationMixin",
    "LegacyMixin",
    # Attention context management
    "set_attention_context",
    "clear_attention_context",
    "get_attention_context",
    "vllm_flash_attention_forward",
    # Utility functions
    "init_on_device_without_buffers",
    "replace_linear_class",
    "replace_rms_norm_class",
    "log_replacement",
    "maybe_prefix",
 ]
--- a/vllm-v0.6.2/vllm/model_executor/models/transformers/base.py
+++ b/vllm-v0.6.2/vllm/model_executor/models/transformers/base.py
@@ -0,0 +1,704 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2024 The vLLM team.
 """Transformers modeling backend base class for v0.6.2.
 This module provides the Base class following latest vLLM architecture:
 - Meta device initialization for memory efficiency
 - Pipeline parallel support (PPMissingLayer)
 - Tensor parallel support (tp_plan based module replacement)
 - Module replacement (Linear, RMSNorm) with vLLM optimized versions
 - VocabParallelEmbedding for input embeddings
 - Attention instances for KV cache allocation
 - Weight loading with AutoWeightsLoader and WeightsMapper
 """
 import re
 from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Tuple
 import torch
 import torch.nn as nn
 from vllm.config import VllmConfig
 from vllm.distributed import get_pp_group, get_tp_group
 from vllm.distributed.utils import get_pp_indices
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.models.utils import (
    AutoWeightsLoader,
    PPMissingLayer,
    WeightsMapper,
    make_empty_intermediate_tensors_factory,
 )
 from vllm.attention.layer import Attention
 from vllm.sequence import IntermediateTensors
 from .utils import (
    init_on_device_without_buffers,
    replace_linear_class,
    replace_rms_norm_class,
    log_replacement,
    maybe_prefix,
 )
 if TYPE_CHECKING:
    from transformers import PreTrainedModel
    from vllm.attention import AttentionMetadata
 logger = init_logger(__name__)
 # ============================================================================
 # Attention Context Management (for vLLM attention integration)
 # ============================================================================
 # Module-level slots shared by every caller in this process. They are filled
 # by set_attention_context() and reset to None by clear_attention_context().
 # Attention metadata of the forward pass currently in flight (None when idle).
 _current_attn_metadata = None
 # KV caches of the forward pass currently in flight (None when idle).
 _current_kv_caches = None
 def set_attention_context(attn_metadata, kv_caches):
    """Publish the attention metadata and KV caches for the forward pass in progress.
    Both values are stored in module-level globals so that the attention
    function invoked from inside the wrapped model can retrieve them.
    """
    global _current_attn_metadata, _current_kv_caches
    _current_attn_metadata, _current_kv_caches = attn_metadata, kv_caches
 def clear_attention_context():
    """Reset both module-level attention context slots to ``None``.
    Meant to be called once a forward pass has finished so stale metadata or
    caches are never observed by a later call.
    """
    global _current_attn_metadata, _current_kv_caches
    _current_attn_metadata = _current_kv_caches = None
 def get_attention_context():
    """Return the published context as an ``(attn_metadata, kv_caches)`` tuple.
    Either element is ``None`` when no context is currently set.
    """
    context = (_current_attn_metadata, _current_kv_caches)
    return context
 # ============================================================================
 # vLLM Attention Function for Transformers Integration
 # ============================================================================
 def vllm_flash_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor,
    scaling: Optional[float] = None,
    attention_instances: Optional[Dict[int, Attention]] = None,
    **kwargs,
 ):
    """
    vLLM's optimized attention function for transformers integration.
    In v0.6.2, Attention.forward signature is:
        (query, key, value, kv_cache, attn_metadata)
    Args:
        module: The calling attention module; its ``layer_idx`` attribute
            (default 0) selects the vLLM attention instance.
        query, key, value: Tensors laid out as [batch, heads, seq, head_dim].
        attention_mask: Only used by the eager fallback.
        scaling: Softmax scale; when given it overrides the instance's scale.
        attention_instances: Mapping from global layer index to the vLLM
            ``Attention`` layer for that index.
        **kwargs: Accepted for interface compatibility and ignored.
    Returns:
        ``(output, None)``. On the vLLM path ``output`` is whatever
        ``Attention.forward`` returns for [seq, heads * head_dim] inputs.
    Falls back to ``_standard_attention`` when no attention instance exists
    for the layer or when no attention context has been published.
    """
    layer_idx = getattr(module, 'layer_idx', 0)
    if attention_instances is None or layer_idx not in attention_instances:
        return _standard_attention(query, key, value, attention_mask, scaling)
    self_attn = attention_instances[layer_idx]
    attn_metadata, kv_caches = get_attention_context()
    if attn_metadata is None or kv_caches is None:
        return _standard_attention(query, key, value, attention_mask, scaling)
    if scaling is not None:
        self_attn.impl.scale = float(scaling)
    # Reshape: [batch, heads, seq, head_dim] -> [seq, heads * head_dim]
    num_tokens = query.shape[-2]
    query, key, value = (
        x.transpose(1, 2).reshape(num_tokens, -1) for x in (query, key, value)
    )
    # attention_instances is keyed by *global* layer index, while kv_caches is
    # expected to hold one entry per layer hosted on this pipeline rank. Index
    # the cache relative to the first local layer so ranks after the first one
    # do not read past (or into the wrong slot of) their cache list.
    # With a single rank the first key is 0 and this is a no-op.
    local_idx = layer_idx - min(attention_instances)
    kv_cache = kv_caches[local_idx] if 0 <= local_idx < len(kv_caches) else None
    output = self_attn.forward(query, key, value, kv_cache, attn_metadata)
    return output, None
 def _standard_attention(query, key, value, attention_mask, scaling):
    """Standard scaled dot-product attention fallback."""
    attn_weights = torch.matmul(query, key.transpose(-2, -1))
    if scaling is not None:
        attn_weights = attn_weights * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask
    attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1)
    attn_output = torch.matmul(attn_weights, value)
    return attn_output, None
 # Register vLLM attention to transformers
 # The registry key "vllm" matches the `_attn_implementation` value that
 # Base._init_model_on_meta writes onto the model config.
 # `_vllm_attention_registered` records whether the registration succeeded.
 _vllm_attention_registered = False
 try:
    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
    ALL_ATTENTION_FUNCTIONS["vllm"] = vllm_flash_attention_forward
    _vllm_attention_registered = True
    logger.info("Registered vLLM attention function to transformers")
 except (ImportError, AttributeError) as e:
    # Best effort: a transformers build without this registry only triggers a
    # warning here; the flag above stays False.
    logger.warning("Could not register vLLM attention: %s", e)
 # ============================================================================
 # Base Class with Pipeline Parallel and Tensor Parallel Support
 # ============================================================================
 class Base(nn.Module):
    """
    Base class for Transformers backend models with full parallel support.
    Features:
    - Pipeline Parallel: PPMissingLayer for distributed layers
    - Tensor Parallel: tp_plan based module replacement
    - Meta device initialization
    - Module replacement (Linear → vLLM Linear, RMSNorm → vLLM RMSNorm)
    - VocabParallelEmbedding for input embeddings
    - Attention instances for KV cache allocation
    """
    # For vLLM's weight loader
    embedding_modules = ["embed_tokens"]
    # Weight name mapping following latest vLLM pattern
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            # Add `model.` prefix for base model checkpoints,
            # handling the case where it is already present
            "": "model.",
            "model.model.": "model.",
            # Heads will be adjacent to `model` (pooling included because of adapters)
            "model.lm_head.": "lm_head.",
            "model.score.": "classifier.",
            "model.classifier.": "classifier.",
        }
    )
    # Note: __init_subclass__ with WeightsMapper merging is not supported in v0.6.2
    # because WeightsMapper doesn't implement __or__/__ior__ operators.
    # Each Mixin should define its own hf_to_vllm_mapper if needed.
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
        """Build the wrapped Transformers model and adapt it for vLLM.
        Args:
            vllm_config: Source of the model, cache, device, parallel and
                quantization configs stored on the instance.
            prefix: Name prefix stored as ``self.prefix``.
        The construction steps below run in a fixed order; each later step
        operates on the model produced by the earlier ones.
        """
        super().__init__()
        logger.info("Using Transformers modeling backend.")
        # Store configuration
        self.config = vllm_config.model_config.hf_config
        # Use the nested text config when present, otherwise the top-level config.
        self.text_config = getattr(self.config, "text_config", self.config)
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config
        self.device_config = vllm_config.device_config
        self.parallel_config = vllm_config.parallel_config
        self.quant_config = vllm_config.quant_config
        self.prefix = prefix
        # Parallel groups
        self.pp_group = get_pp_group()
        self.tp_group = get_tp_group()
        # Model dimensions (fallback values apply only when the config lacks the field)
        self.hidden_size = getattr(self.text_config, "hidden_size", 4096)
        self.vocab_size = getattr(self.text_config, "vocab_size", 32000)
        # Weight loading configuration (passed to AutoWeightsLoader in load_weights)
        self.skip_prefixes: List[str] = []
        self.ignore_unexpected_prefixes: List[str] = []
        # Configure attention backend
        self._configure_attention_backend()
        # Create model on meta device
        self._init_model_on_meta()
        # Apply pipeline parallel
        self._apply_pipeline_parallel()
        # Replace modules (with tensor parallel support)
        self._replace_modules()
        # Fix attention head_dim in case config was incorrect
        self._fix_attention_head_dim()
        # Currently a no-op placeholder (the debug hooks were removed)
        self._add_attention_debug_hook()
        # Replace input embeddings
        self._replace_input_embeddings()
        # Create attention instances
        self.attention_instances = self._create_attention_instances()
        # Initialize parameters on target device
        self._init_parameters()
        # Pipeline parallel intermediate tensors
        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
            ["hidden_states"], self.hidden_size
        )
    def _configure_attention_backend(self) -> None:
        """Hook for platform-specific attention configuration; currently a no-op.
        The attention implementation itself is selected in
        ``_init_model_on_meta``; this method only exists as an extension point.
        """
        return None
    def _init_model_on_meta(self) -> None:
        """Create the Transformers model structure on the meta device.
        Steps:
        1. Select the "vllm" attention implementation on both ``text_config``
           and ``config`` (``config`` is what ``AutoModel.from_config`` sees).
        2. When the text config exposes ``hidden_size`` and
           ``num_attention_heads``, force ``head_dim`` on both configs to
           ``hidden_size // num_attention_heads``, warning if a different
           value was present.
        3. Instantiate the model under ``init_on_device_without_buffers("meta")``,
           switch it to eval mode and disable gradients on every parameter.
        """
        from transformers import AutoModel
        logger.info("Creating model structure on meta device...")
        # Set attention implementation to vLLM's on both configs. This must not
        # depend on the head_dim fix-up below: a config without the head-count
        # fields still has to select the "vllm" attention function.
        self.text_config._attn_implementation = "vllm"
        self.config._attn_implementation = "vllm"
        # Ensure head_dim is correctly set in BOTH config and text_config
        # Transformers models use config.head_dim to compute attention dimensions
        # NOTE(review): this overrides any explicit head_dim that differs from
        # hidden_size // num_attention_heads - confirm that no supported model
        # relies on such a value.
        if hasattr(self.text_config, "num_attention_heads") and hasattr(self.text_config, "hidden_size"):
            correct_head_dim = self.text_config.hidden_size // self.text_config.num_attention_heads
            for label, cfg in (("text_config", self.text_config), ("config", self.config)):
                current_head_dim = getattr(cfg, "head_dim", correct_head_dim)
                if current_head_dim != correct_head_dim:
                    # %s for the old value: it may be None rather than an int.
                    logger.warning(
                        "Correcting head_dim in %s: %s -> %d",
                        label, current_head_dim, correct_head_dim
                    )
                cfg.head_dim = correct_head_dim
        with init_on_device_without_buffers("meta"):
            self.model: "PreTrainedModel" = AutoModel.from_config(
                self.config,
                torch_dtype=self.model_config.dtype,
                trust_remote_code=self.model_config.trust_remote_code,
            )
        self.model.eval()
        for param in self.model.parameters():
            param.requires_grad = False
    def _apply_pipeline_parallel(self) -> None:
        """
        Apply pipeline parallelization plan.
        For models that don't explicitly support pp_plan, we do a best-effort
        approach by splitting layers based on num_hidden_layers.
        Effects:
        - ``self._has_embeddings`` is always set and is True only on the first
          pipeline rank.
        - With more than one pipeline rank, every transformer layer outside
          this rank's ``[start_layer, end_layer)`` range is replaced by a
          ``PPMissingLayer``; on ranks other than the last, an ``lm_head``
          attribute on the wrapped model is replaced as well.
        """
        # Define the flag on every code path, including the single-rank early
        # return, so later readers never hit an AttributeError.
        self._has_embeddings = self.pp_group.is_first_rank
        if self.pp_group.world_size <= 1:
            return
        logger.info("Applying pipeline parallel (world_size=%d, rank=%d)",
                   self.pp_group.world_size, self.pp_group.rank_in_group)
        num_layers = getattr(self.text_config, "num_hidden_layers",
                            getattr(self.text_config, "num_layers", 32))
        start_layer, end_layer = get_pp_indices(
            num_layers,
            self.pp_group.rank_in_group,
            self.pp_group.world_size,
        )
        # Find and process layer modules
        layers_module = self._find_layers_module()
        if layers_module is None:
            # Best effort: keep going, but make the missing split visible.
            logger.warning(
                "Could not locate the transformer layer list; "
                "no layers were replaced for pipeline parallel"
            )
        else:
            for idx in range(len(layers_module)):
                if not (start_layer <= idx < end_layer):
                    # Replace layers not on this rank with PPMissingLayer
                    layers_module[idx] = PPMissingLayer()
        # Handle final norm and lm_head (only on last rank)
        if not self.pp_group.is_last_rank:
            # Mark lm_head as missing
            if hasattr(self.model, 'lm_head'):
                self.model.lm_head = PPMissingLayer()
        logger.info("Pipeline parallel applied: layers %d-%d on this rank",
                   start_layer, end_layer)
    def _find_layers_module(self) -> Optional[nn.Module]:
        """Find the ModuleList containing transformer layers.
        Performs a depth-first search from ``self.model``. A child is returned
        when it is an ``nn.ModuleList`` named ``layers``, ``h``, ``blocks`` or
        ``layer``. The search only descends into children named ``model``,
        ``transformer``, ``encoder`` or ``decoder``.
        Returns:
            The first matching ``nn.ModuleList``, or ``None`` if none is found.
        """
        container_names = ("layers", "h", "blocks", "layer")
        backbone_names = ("model", "transformer", "encoder", "decoder")
        def _search_layers(module: nn.Module) -> Optional[nn.Module]:
            for name, child in module.named_children():
                if name in container_names and isinstance(child, nn.ModuleList):
                    return child
                # Recursively search in model backbone
                if name in backbone_names:
                    found = _search_layers(child)
                    if found is not None:
                        return found
            return None
        return _search_layers(self.model)
    def _get_tp_plan(self) -> Dict[str, str]:
        """
        Build the pattern -> parallel-style table used for module replacement.
        Styles:
        - "colwise": Column parallel (split output dim)
        - "rowwise": Row parallel (split input dim)
        - "replicate": Replicated (no split)
        If the wrapped model exposes a non-empty ``tp_plan``, its keys are
        prefixed with "model" and returned. Otherwise a default table of
        regex patterns keyed on common projection names is returned.
        """
        model_plan = getattr(self.model, "tp_plan", None)
        if model_plan:
            return {
                maybe_prefix("model", pattern): style
                for pattern, style in model_plan.items()
            }
        # Default plan for common transformer layouts, as (leaf name, style).
        default_styles = (
            ("q_proj", "colwise"),
            ("k_proj", "colwise"),
            ("v_proj", "colwise"),
            ("o_proj", "rowwise"),
            ("gate_proj", "colwise"),
            ("up_proj", "colwise"),
            ("down_proj", "rowwise"),
            ("query", "colwise"),
            ("key", "colwise"),
            ("value", "colwise"),
            ("dense", "rowwise"),
            ("fc1", "colwise"),
            ("fc2", "rowwise"),
        )
        return {rf".*\.{leaf}$": style for leaf, style in default_styles}
    def _replace_modules(self) -> None:
        """
        Swap eligible submodules of ``self.model`` for vLLM implementations.
        ``nn.Linear`` children are replaced via ``replace_linear_class`` using
        the first matching style in the tensor-parallel plan ("replicate" when
        nothing matches or TP is disabled). Children whose class name ends in
        "RMSNorm" and that are not already vLLM ``RMSNorm`` are replaced via
        ``replace_rms_norm_class``. Anything else is searched recursively.
        ``PPMissingLayer`` children and the heads named ``lm_head``, ``score``
        and ``classifier`` are left alone (heads live at wrapper level).
        """
        logger.info("Replacing modules with vLLM optimized versions...")
        # The plan is only consulted when tensor parallelism is active.
        tp_plan = self._get_tp_plan() if self.tp_group.world_size > 1 else {}
        wrapper_level_heads = {"lm_head", "score", "classifier"}
        replaced_count = 0
        def _style_for(qual_name: str) -> str:
            # First pattern that matches wins; default to replication.
            matching = (
                plan_style
                for pattern, plan_style in tp_plan.items()
                if re.match(pattern, qual_name)
            )
            return next(matching, "replicate")
        def _recursive_replace(module: nn.Module, prefix: str = ""):
            nonlocal replaced_count
            # Snapshot the children: the loop reassigns attributes on `module`.
            for name, child in list(module.named_children()):
                if isinstance(child, PPMissingLayer):
                    continue
                if name in wrapper_level_heads:
                    logger.debug("Skipping %s (handled at wrapper level)", name)
                    continue
                qual_name = maybe_prefix(prefix, name)
                if isinstance(child, nn.Linear):
                    new_module = replace_linear_class(
                        child,
                        style=_style_for(qual_name),
                        quant_config=self.quant_config,
                        prefix=qual_name,
                    )
                    replaced_count += 1
                elif child.__class__.__name__.endswith("RMSNorm") and \
                        not isinstance(child, RMSNorm):
                    new_module = replace_rms_norm_class(child, self.hidden_size)
                    replaced_count += 1
                else:
                    new_module = None
                if new_module is None:
                    _recursive_replace(child, qual_name)
                    continue
                setattr(module, name, new_module)
                log_replacement(qual_name, child, new_module)
        _recursive_replace(self.model, "model")
        logger.info("Replaced %d modules", replaced_count)
    def _add_attention_debug_hook(self) -> None:
        """Do nothing; kept as a placeholder after the debug hooks were removed."""
        return None
    def _fix_attention_head_dim(self) -> None:
        """
        Fix head_dim in attention modules and rotary embeddings after model creation.
        The expected value is ``hidden_size // num_attention_heads`` (32 heads
        assumed when the config lacks the field). For every submodule of
        ``self.model``:
        - class name contains "Attention" and it has a differing ``head_dim``:
          the attribute is overwritten;
        - class name contains "RotaryEmbedding" and ``inv_freq`` implies a
          different dimension (``len(inv_freq) * 2``): ``inv_freq`` (and
          ``original_inv_freq`` if present) is re-registered as a
          non-persistent buffer computed for the expected dimension.
        The rotary base is read from the module's ``config`` when available
        (``rope_theta``, then ``rope_parameters["rope_theta"]``) and defaults
        to 10000.0 otherwise.
        """
        correct_head_dim = self.hidden_size // getattr(
            self.text_config, "num_attention_heads", 32
        )
        fixed_count = 0
        for name, module in self.model.named_modules():
            module_name = module.__class__.__name__
            # Fix head_dim in Attention modules
            if (
                "Attention" in module_name
                and hasattr(module, "head_dim")
                and module.head_dim != correct_head_dim
            ):
                # %s for the old value: it is not guaranteed to be an int.
                logger.warning(
                    "Fixing head_dim in %s: %s -> %d",
                    name, module.head_dim, correct_head_dim
                )
                module.head_dim = correct_head_dim
                fixed_count += 1
            # Fix rotary embeddings - recreate inv_freq buffer if needed
            if "RotaryEmbedding" not in module_name:
                continue
            old_inv_freq = getattr(module, "inv_freq", None)
            if old_inv_freq is None:
                continue
            current_dim = old_inv_freq.shape[0] * 2
            if current_dim == correct_head_dim:
                continue
            logger.warning(
                "Recreating rotary embedding %s: dim %d -> %d",
                name, current_dim, correct_head_dim
            )
            # Custom rotary modules may not carry a `config`; read defensively
            # instead of failing model construction with AttributeError.
            rope_config = getattr(module, "config", None)
            base = getattr(rope_config, "rope_theta", 10000.0)
            rope_parameters = getattr(rope_config, "rope_parameters", None)
            if rope_parameters is not None:
                base = rope_parameters.get("rope_theta", base)
            device = old_inv_freq.device
            inv_freq = 1.0 / (
                base ** (
                    torch.arange(0, correct_head_dim, 2, dtype=torch.int64)
                    .to(device=device, dtype=torch.float) / correct_head_dim
                )
            )
            module.register_buffer("inv_freq", inv_freq, persistent=False)
            if hasattr(module, "original_inv_freq"):
                module.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
        if fixed_count > 0:
            logger.info("Fixed head_dim in %d attention modules", fixed_count)
    def _replace_input_embeddings(self) -> None:
        """Replace input embeddings with VocabParallelEmbedding.
        Also records ``self.embed_scale``: the ``embed_scale`` attribute of
        the original embedding module, or ``None`` when it has none or when
        there is no embedding module to replace.
        The embedding width is taken from ``embedding_dim`` if present, else
        from the second dimension of ``weight``, else from ``self.hidden_size``.
        """
        # Define before any early return: forward() and embed_input_ids() read
        # self.embed_scale unconditionally and would otherwise raise
        # AttributeError when no replacement takes place.
        self.embed_scale = None
        input_embeddings = self.model.get_input_embeddings()
        if input_embeddings is None or isinstance(input_embeddings, PPMissingLayer):
            return
        if hasattr(input_embeddings, "embedding_dim"):
            embedding_dim = input_embeddings.embedding_dim
        elif hasattr(input_embeddings, "weight"):
            embedding_dim = input_embeddings.weight.shape[1]
        else:
            embedding_dim = self.hidden_size
        # Preserve the scale of the original module; the replacement layer is
        # built without it and the scale is re-applied in embed_input_ids().
        self.embed_scale = getattr(input_embeddings, "embed_scale", None)
        logger.info("Replacing input embeddings (vocab=%d, dim=%d)",
                   self.vocab_size, embedding_dim)
        new_embeddings = VocabParallelEmbedding(
            self.vocab_size,
            embedding_dim,
            org_num_embeddings=self.vocab_size,
            quant_config=self.quant_config,
        )
        self.model.set_input_embeddings(new_embeddings)
    def _create_attention_instances(self) -> Dict[int, Attention]:
        """Create Attention instances for KV cache allocation.
        One ``Attention`` layer is created per transformer layer hosted on this
        pipeline rank, keyed by the *global* layer index.
        Head counts are per tensor-parallel rank: the configured totals are
        divided by the TP world size (at least one KV head is kept), while
        ``head_size`` stays ``hidden_size // total_heads``.
        Returns:
            Mapping from global layer index to its ``Attention`` layer.
        """
        num_layers = getattr(self.text_config, "num_hidden_layers",
                            getattr(self.text_config, "num_layers", 32))
        total_heads = getattr(self.text_config, "num_attention_heads", 32)
        head_size = self.hidden_size // total_heads
        total_kv_heads = getattr(self.text_config, "num_key_value_heads", None)
        if total_kv_heads is None:
            # No grouped-query setting: as many KV heads as query heads.
            total_kv_heads = total_heads
        # Column-parallel q/k/v projections leave each rank with only its
        # share of the heads, so the attention layer must be sized per rank.
        # With a TP world size of 1 the totals are used unchanged.
        # NOTE(review): assumes "colwise" projections do not gather their
        # output - confirm against replace_linear_class.
        tp_size = max(1, self.tp_group.world_size)
        num_heads = total_heads // tp_size
        num_kv_heads = max(1, total_kv_heads // tp_size)
        # Get PP layer range
        pp_rank = self.pp_group.rank_in_group
        pp_size = self.pp_group.world_size
        start_layer, end_layer = get_pp_indices(num_layers, pp_rank, pp_size)
        logger.info("Creating attention instances for layers %d-%d "
                   "(heads=%d, head_size=%d, kv_heads=%d)",
                   start_layer, end_layer, num_heads, head_size, num_kv_heads)
        # NOTE(review): a per-layer sliding window used to be derived here from
        # config.layer_types / config.sliding_window but was never passed to
        # Attention; the dead computation was dropped. Wire it through if this
        # version's Attention gains a matching parameter.
        attention_instances: Dict[int, Attention] = {}
        for layer_idx in range(start_layer, end_layer):
            attention_instances[layer_idx] = Attention(
                num_heads=num_heads,
                head_size=head_size,
                scale=1.0 / (head_size ** 0.5),
                num_kv_heads=num_kv_heads,
                cache_config=self.cache_config,
                quant_config=self.quant_config,
                prefix=f"model.layers.{layer_idx}.self_attn",
            )
        return attention_instances
    def _init_parameters(self) -> None:
        """Materialize every meta-device parameter of ``self.model``.
        Each parameter still on the meta device is replaced by an
        uninitialized, gradient-free parameter of the same shape, using the
        model dtype and the configured device (CPU when none is configured).
        ``PPMissingLayer`` modules are skipped. Values are expected to be
        filled in later by weight loading.
        """
        target_device = self.device_config.device
        if target_device is None:
            target_device = torch.device("cpu")
        target_dtype = self.model_config.dtype
        meta_device = torch.device("meta")
        # Explicit stack; children are pushed reversed so modules are visited
        # in the same parent-first, left-to-right order as a recursive walk.
        pending = [self.model]
        while pending:
            module = pending.pop()
            if isinstance(module, PPMissingLayer):
                continue
            for param_name, param in list(module.named_parameters(recurse=False)):
                if param.device != meta_device:
                    continue
                materialized = nn.Parameter(
                    torch.empty_like(param.data, dtype=target_dtype, device=target_device),
                    requires_grad=False,
                )
                setattr(module, param_name, materialized)
            pending.extend(reversed(list(module.children())))
        logger.info("Parameters initialized on %s", target_device)
    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Embed token IDs with the model's input embedding layer.
        The result is multiplied by ``self.embed_scale`` when that is not None.
        """
        embedding_layer = self.model.get_input_embeddings()
        embeddings = embedding_layer(input_ids)
        if self.embed_scale is None:
            return embeddings
        return embeddings * self.embed_scale
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: "AttentionMetadata",
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.Tensor:
        """Forward pass with pipeline parallel support.
        Args:
            input_ids: Token IDs; ignored on pipeline ranks after the first.
            positions: Position IDs forwarded to the model as ``position_ids``.
            kv_caches: KV caches published to the attention function for the
                duration of this call.
            attn_metadata: Attention metadata published alongside ``kv_caches``.
            intermediate_tensors: Required on ranks after the first; its
                "hidden_states" entry is used as the input embeddings.
            inputs_embeds: Optional precomputed embeddings used instead of
                ``input_ids``.
            **kwargs: Accepted and ignored.
        Returns:
            The final hidden states (leading batch dimension of size 1
            removed) on the last pipeline rank, otherwise an
            ``IntermediateTensors`` holding them under "hidden_states".
        """
        # Handle intermediate tensors for PP
        if not self.pp_group.is_first_rank:
            assert intermediate_tensors is not None
            input_ids = None
            inputs_embeds = intermediate_tensors["hidden_states"]
        # Publish the context through module-level globals; the matching
        # clear happens in the `finally` below, even if the model raises.
        set_attention_context(attn_metadata, kv_caches)
        try:
            # Prepare inputs
            # The wrapped model is fed a leading batch dimension of size 1,
            # so flat inputs are unsqueezed here.
            if inputs_embeds is not None:
                if inputs_embeds.dim() == 2:
                    inputs_embeds = inputs_embeds.unsqueeze(0)
                model_inputs = {"inputs_embeds": inputs_embeds}
            else:
                if input_ids is not None and input_ids.dim() == 1:
                    input_ids = input_ids.unsqueeze(0)
                model_inputs = {"input_ids": input_ids}
            if positions is not None:
                if positions.dim() == 1:
                    positions = positions.unsqueeze(0)
                model_inputs["position_ids"] = positions
            # Apply embed_scale if needed
            # With a scale set, embed here (embed_input_ids applies the scale)
            # and hand the model embeddings instead of token IDs.
            if (
                self.embed_scale is not None
                and input_ids is not None
                and inputs_embeds is None
            ):
                inputs_embeds = self.embed_input_ids(model_inputs["input_ids"])
                model_inputs = {"inputs_embeds": inputs_embeds}
                if positions is not None:
                    model_inputs["position_ids"] = positions
            # Forward through model
            # Note: return_dict=False returns tuple, first element is last hidden state
            with torch.no_grad():
                outputs = self.model(
                    **model_inputs,
                    use_cache=False,
                    return_dict=False,
                    attention_instances=self.attention_instances,
                )
            # Get hidden states from model output
            # For models using return_dict=False, outputs is a tuple
            # outputs[0] is usually the last hidden state
            if isinstance(outputs, tuple):
                hidden_states = outputs[0]
            else:
                hidden_states = outputs
            # Remove batch dimension
            if hidden_states.dim() == 3 and hidden_states.size(0) == 1:
                hidden_states = hidden_states.squeeze(0)
            # Return intermediate tensors for PP
            if not self.pp_group.is_last_rank:
                return IntermediateTensors({"hidden_states": hidden_states})
            return hidden_states
        finally:
            # Always reset the global context so it cannot leak into a later call.
            clear_attention_context()
    def load_weights(
        self,
        weights: Iterable[Tuple[str, torch.Tensor]],
    ) -> Set[str]:
        """Load checkpoint weights into this module via ``AutoWeightsLoader``.
        Names are translated with ``self.hf_to_vllm_mapper``; the loader is
        configured with ``self.skip_prefixes`` and
        ``self.ignore_unexpected_prefixes``.
        Returns:
            The set of names reported as loaded by the loader.
        """
        weight_loader = AutoWeightsLoader(
            self,
            skip_prefixes=self.skip_prefixes,
            ignore_unexpected_prefixes=self.ignore_unexpected_prefixes,
        )
        loaded_names = weight_loader.load_weights(
            weights, mapper=self.hf_to_vllm_mapper
        )
        logger.info("Loaded %d weight tensors", len(loaded_names))
        return set(loaded_names)
--- a/vllm-v0.6.2/vllm/model_executor/models/transformers/causal.py
+++ b/vllm-v0.6.2/vllm/model_executor/models/transformers/causal.py
@@ -0,0 +1,142 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2024 The vLLM team.
 """Transformers modeling backend mixin for causal language models.
 This module provides CausalMixin that adds causal language model specific
 functionality (lm_head, compute_logits, sample) to the Base class.
 Following latest vLLM architecture:
 - TransformersForCausalLM = CausalMixin + Base
 - lm_head is created at the wrapper level (not inside self.model)
 """
 from typing import TYPE_CHECKING, Optional
 import torch
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.models.utils import PPMissingLayer, maybe_prefix
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 if TYPE_CHECKING:
    from vllm.config import VllmConfig
 logger = init_logger(__name__)
 class CausalMixin:
    """
    Mixin class that adds causal language model functionality.
    This mixin provides:
    - ParallelLMHead for language model head (created at wrapper level)
    - LogitsProcessor for logits computation
    - Sampler for token sampling
    - compute_logits method for VllmModelForTextGeneration protocol
    - sample method for VllmModelForTextGeneration protocol
    Following latest vLLM architecture:
    - lm_head is a direct attribute of TransformersForCausalLM (not inside self.model)
    - hf_to_vllm_mapper maps "model.lm_head." -> "lm_head." to handle this
    - For tied embeddings, lm_head weight loading is skipped and weights are tied
    Should be used with Base class:
        class TransformersForCausalLM(CausalMixin, Base): ...
    """
    def __init__(self, *, vllm_config: "VllmConfig", prefix: str = "") -> None:
        """Initialize the base model, then attach the causal-LM head pieces.
        On the last pipeline rank this creates a ``ParallelLMHead`` (tied to
        the input embeddings when ``tie_word_embeddings`` is set) and a
        ``LogitsProcessor``. On other ranks ``lm_head`` is a ``PPMissingLayer``
        and ``logits_processor`` is ``None``. A ``Sampler`` is created on every
        rank.
        """
        # Delegate to the next class in the MRO (expected to be Base).
        super().__init__(vllm_config=vllm_config, prefix=prefix)
        # With tied embeddings the checkpoint's lm_head weights are not loaded.
        tie_word_embeddings = getattr(self.text_config, "tie_word_embeddings", False)
        if tie_word_embeddings:
            self.skip_prefixes.append("lm_head.")
            logger.info("Model has tied word embeddings, will tie lm_head weights")
        if not self.pp_group.is_last_rank:
            # Ranks that do not produce logits only carry placeholders.
            self.lm_head = PPMissingLayer()
            self.logits_processor = None
            logger.info("CausalMixin initialized (PP non-last rank, using PPMissingLayer)")
        else:
            # lm_head lives on the wrapper, outside self.model, which is why
            # checkpoint names map "model.lm_head." -> "lm_head.".
            lm_head = ParallelLMHead(
                self.vocab_size,
                self.hidden_size,
                quant_config=self.quant_config,
                prefix=maybe_prefix(prefix, "lm_head"),
            )
            if tie_word_embeddings:
                input_embeddings = self.model.get_input_embeddings()
                if input_embeddings is not None:
                    lm_head = lm_head.tie_weights(input_embeddings)
                    logger.info("Tied lm_head weights with input embeddings")
            self.lm_head = lm_head
            logit_scale = getattr(self.text_config, "logit_scale", 1.0)
            self.logits_processor = LogitsProcessor(
                self.vocab_size,
                logits_as_input=False,
                scale=logit_scale,
            )
            logger.info("CausalMixin initialized (vocab_size=%d, hidden_size=%d, logit_scale=%s)",
                       self.vocab_size, self.hidden_size, logit_scale)
        self.sampler = Sampler()
    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        """
        Project hidden states to vocabulary logits.
        This method conforms to the VllmModelForTextGeneration protocol.
        Args:
            hidden_states: Hidden states from the model [seq_len, hidden_size]
            sampling_metadata: Sampling metadata
        Returns:
            Logits tensor, or None when this rank has no logits processor
            (non-last pipeline rank)
        """
        processor = self.logits_processor
        if processor is None:
            return None
        # lm_head is passed as the first argument rather than being called:
        # the logits processor performs the hidden_states -> logits projection.
        return processor(self.lm_head, hidden_states, sampling_metadata)
    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        """
        Draw next tokens from logits using ``self.sampler``.
        This method conforms to the VllmModelForTextGeneration protocol.
        Args:
            logits: Logits tensor
            sampling_metadata: Sampling metadata
        Returns:
            The sampler's output for these logits
        """
        return self.sampler(logits, sampling_metadata)
--- a/vllm-v0.6.2/vllm/model_executor/models/transformers/legacy.py
+++ b/vllm-v0.6.2/vllm/model_executor/models/transformers/legacy.py
@@ -0,0 +1,118 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2024 The vLLM team.
 """Transformers modeling backend mixin for legacy models.
 This module provides LegacyMixin for BERT-like encoder models that have
 different weight naming conventions and special position handling.
 Following latest vLLM architecture patterns adapted for v0.6.2.
 """
 from typing import TYPE_CHECKING, List, Optional
 import torch
 from vllm.logger import init_logger
 from vllm.model_executor.models.utils import WeightsMapper
 from vllm.sequence import IntermediateTensors
 if TYPE_CHECKING:
    from vllm.config import VllmConfig
 logger = init_logger(__name__)
 class LegacyMixin:
    """
    Mixin class for legacy/encoder models like BERT, RoBERTa.

    This mixin provides:
    - Weight name mapping for legacy suffix conventions (.gamma/.beta)
    - Prefix mapping for BERT-like model structures
    - RoBERTa-specific position handling
    - Skip prefixes for unsupported output layers

    Should be used with Base class:
        class TransformersForLegacy(LegacyMixin, Base): ...

    NOTE(review): this mixin reads ``self.skip_prefixes`` and
    ``self.text_config``; both are presumably set up by the next class in the
    MRO (Base) - confirm against that class.
    """

    # Weight name mapping for legacy models
    hf_to_vllm_mapper = WeightsMapper(
        # These are applied in order, so the order matters!
        orig_to_new_prefix={
            # Handle BERT-like models
            "roberta": "model",
            "bert": "model",
        },
        orig_to_new_suffix={
            # Replace legacy suffixes used for norms
            ".gamma": ".weight",
            ".beta": ".bias",
        },
    )

    def __init__(self, *, vllm_config: "VllmConfig", prefix: str = "") -> None:
        """
        Initialize the next class in the MRO, then register legacy skip rules
        and detect RoBERTa-style position handling.

        Args:
            vllm_config: vLLM configuration, forwarded unchanged.
            prefix: Module name prefix, forwarded unchanged.
        """
        # Call next class in MRO (should be Base)
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        # Skip unsupported/unwanted output embeddings layers
        self.skip_prefixes.extend([
            "model.lm_head.",
            "model.predictions.",
            "model.qa_outputs.",
            "model.embeddings_project.",
            "model.discriminator_predictions.",
        ])

        # v0.6.2 doesn't have skip_substrs, so we handle it differently
        # Store patterns to skip during weight loading
        self._legacy_skip_patterns: List[str] = [
            "position_ids",  # Some encoder models have position_ids buffer
            "score.bias",    # Final classifier bias not used by vLLM
        ]

        # RoBERTa-like models have extra padding in positions.
        # Guard against a config whose model_type is present but None.
        model_type = (getattr(self.text_config, "model_type", None) or "").lower()
        self.is_roberta = "roberta" in model_type

        # getattr's default only applies when the attribute is missing; a
        # config that defines pad_token_id as None would otherwise leave
        # padding_idx as None and break the position offset in forward().
        pad_token_id = getattr(self.text_config, "pad_token_id", None)
        self.padding_idx = 1 if pad_token_id is None else pad_token_id

        if self.is_roberta:
            logger.info("LegacyMixin detected RoBERTa model, enabling position padding")
        logger.info("LegacyMixin initialized for legacy/encoder model")

    def _should_skip_weight(self, name: str) -> bool:
        """Return True if `name` contains any of the legacy skip patterns."""
        return any(pattern in name for pattern in self._legacy_skip_patterns)

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.Tensor:
        """
        Forward pass with RoBERTa position handling.

        RoBERTa models require positions to be offset by padding_idx + 1.
        All arguments are otherwise passed through to the next forward() in
        the MRO, whose result is returned.
        """
        if self.is_roberta and positions is not None:
            # RoBERTa-specific positions padding
            positions = positions + self.padding_idx + 1
        return super().forward(
            input_ids=input_ids,
            positions=positions,
            kv_caches=kv_caches,
            attn_metadata=attn_metadata,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )
--- a/vllm-v0.6.2/vllm/model_executor/models/transformers/pooling.py
+++ b/vllm-v0.6.2/vllm/model_executor/models/transformers/pooling.py
@@ -0,0 +1,170 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2024 The vLLM team.
 """Transformers modeling backend mixins for pooling/embedding models.
 This module provides mixins for embedding and sequence classification models:
 - EmbeddingMixin: For embedding/sentence similarity models
 - SequenceClassificationMixin: For sequence classification/cross-encoding
 Following latest vLLM architecture patterns adapted for v0.6.2.
 """
 from typing import TYPE_CHECKING, List, Optional
 import torch
 import torch.nn as nn
 from vllm.logger import init_logger
 from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.sequence import PoolerOutput
 if TYPE_CHECKING:
    from vllm.config import VllmConfig
 logger = init_logger(__name__)
 class EmbeddingMixin:
    """
    Mixin that equips a model with a pooler for producing embeddings.

    It contributes:
    - a ``pooler`` attribute built from the model's pooler config
    - a ``pooling`` method for the VllmModelForPooling protocol

    Should be used with Base class:
        class TransformersForEmbedding(EmbeddingMixin, Base): ...
    """

    # Defaults handed to the pooler; subclasses may override them.
    default_pooling_type: PoolingType = PoolingType.CLS
    default_normalize: bool = True
    default_softmax: bool = False

    def __init__(self, *, vllm_config: "VllmConfig", prefix: str = "") -> None:
        """
        Initialize the next class in the MRO, then build the pooler.

        Args:
            vllm_config: vLLM configuration; its model_config.pooler_config
                is handed to the pooler factory.
            prefix: Module name prefix, forwarded unchanged.
        """
        # Next class in the MRO (should be Base) goes first.
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        class_defaults = dict(
            pooling_type=self.default_pooling_type,
            normalize=self.default_normalize,
            softmax=self.default_softmax,
        )
        pooler = Pooler.from_config_with_defaults(
            pooler_config=vllm_config.model_config.pooler_config,
            **class_defaults,
        )
        if pooler is None:
            # The factory gave nothing back: build one from the class defaults.
            pooler = Pooler(**class_defaults)
        self.pooler = pooler

        logger.info("EmbeddingMixin initialized (pooling_type=%s, normalize=%s)",
                   self.pooler.pooling_type.name, self.pooler.normalize)

    def pooling(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> Optional[PoolerOutput]:
        """
        Run the pooler over the hidden states.

        Args:
            hidden_states: Hidden states from the model [seq_len, hidden_size]
            pooling_metadata: Pooling metadata

        Returns:
            The pooler's output for the given inputs.
        """
        pooled = self.pooler(hidden_states, pooling_metadata)
        return pooled
 class SequenceClassificationMixin(EmbeddingMixin):
    """
    Mixin class that adds sequence classification functionality.

    This mixin provides:
    - Classifier layer for sequence classification
    - pooling method with classification logits

    Should be used with Base class:
        class TransformersForSequenceClassification(SequenceClassificationMixin, Base): ...

    NOTE(review): ``self.model``, ``self.device_config`` and
    ``self.model_config`` are read here but not set here; presumably Base
    provides them - confirm.
    NOTE(review): with default_softmax=True the pooler may apply softmax
    before the classifier runs in pooling(); verify that ordering is intended.
    """

    default_pooling_type: PoolingType = PoolingType.CLS
    default_normalize: bool = False
    default_softmax: bool = True

    def __init__(self, *, vllm_config: "VllmConfig", prefix: str = "") -> None:
        """
        Initialize the parent mixin chain, then locate and prepare the
        classifier layer.

        Args:
            vllm_config: vLLM configuration, forwarded unchanged.
            prefix: Module name prefix, forwarded unchanged.
        """
        # Call EmbeddingMixin.__init__ -> Base.__init__
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        # Find and setup classifier layer
        self.classifier = self._find_classifier()
        if self.classifier is not None:
            # Initialize classifier parameters on device
            self._init_classifier_params()
            logger.info("SequenceClassificationMixin initialized with classifier")
        else:
            logger.warning("Could not find classifier layer")

    def _find_classifier(self) -> Optional[nn.Module]:
        """Return the first attribute of self.model matching a common
        classifier name, or None if none of the names exist."""
        # Common classifier layer names, checked in this order
        classifier_names = ['classifier', 'score', 'fc', 'head']
        for name in classifier_names:
            if hasattr(self.model, name):
                return getattr(self.model, name)
        return None

    def _init_classifier_params(self) -> None:
        """
        Replace every meta-device parameter of the classifier with an
        uninitialized parameter on the target device and dtype.

        Parameters are replaced on the submodule that owns them, so nested
        classifier heads (e.g. a parameter named ``dense.weight``) are handled
        correctly instead of being attached to the classifier root.
        """
        device = self.device_config.device
        if device is None:
            device = torch.device("cpu")
        dtype = self.model_config.dtype
        meta_device = torch.device("meta")

        # Snapshot the list: parameters are re-registered while looping.
        for name, param in list(self.classifier.named_parameters()):
            if param.device != meta_device:
                continue
            new_param = nn.Parameter(
                torch.empty_like(param.data, dtype=dtype, device=device),
                requires_grad=False,
            )
            # Split "a.b.weight" into owner path "a.b" and leaf "weight";
            # get_submodule("") returns the classifier itself.
            owner_path, _, leaf_name = name.rpartition(".")
            owner = self.classifier.get_submodule(owner_path)
            setattr(owner, leaf_name, new_param)

    def pooling(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> Optional[PoolerOutput]:
        """
        Apply pooling and classification to hidden states.

        Args:
            hidden_states: Hidden states from the model [seq_len, hidden_size]
            pooling_metadata: Pooling metadata

        Returns:
            The pooler output; when a classifier is available, each output
            that has a ``data`` attribute has it replaced by the classifier's
            result.
        """
        # First apply base pooling
        pooled = self.pooler(hidden_states, pooling_metadata)
        if self.classifier is None or pooled is None:
            return pooled

        # Apply classifier to each pooled output, in place
        for output in pooled.outputs:
            if hasattr(output, 'data'):
                output.data = self.classifier(output.data)
        return pooled
--- a/vllm-v0.6.2/vllm/model_executor/models/transformers/utils.py
+++ b/vllm-v0.6.2/vllm/model_executor/models/transformers/utils.py
@@ -0,0 +1,247 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2024 The vLLM team.
 """Transformers modeling backend utilities for v0.6.2.
 This module provides utility functions for the Transformers backend,
 including context managers for meta device initialization and
 module replacement functions.
 """
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, Literal, Optional, Tuple, Union
 import torch
 import torch.nn as nn
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
    ReplicatedLinear,
    RowParallelLinear,
 )
 if TYPE_CHECKING:
    from vllm.model_executor.layers.quantization.base_config import (
        QuantizationConfig,
    )
 logger = init_logger(__name__)
@contextmanager
 def init_on_device_without_buffers(device: Union[str, torch.device]):
    """
    Context manager body: while active, every parameter registered on an
    nn.Module is re-created on `device`. Only parameter registration is
    intercepted, so buffers are left wherever they were created.

    The original nn.Module.register_parameter is always restored on exit.

    Args:
        device: Device to place all parameters on (e.g., "meta"); a string
            is converted to a torch.device.

    Example:
        with init_on_device_without_buffers("meta"):
            model = AutoModel.from_config(config)
        # Parameters of `model` now live on the meta device
    """
    target = torch.device(device) if isinstance(device, str) else device
    original_register = nn.Module.register_parameter

    def register_empty_parameter(module, name, param):
        # Register normally first, then swap in a copy on the target device.
        original_register(module, name, param)
        if param is None:
            return
        registered = module._parameters[name]
        # The parameter's own __dict__ is reused (and updated) as the keyword
        # arguments for rebuilding it with the same class.
        extra = registered.__dict__
        extra["requires_grad"] = param.requires_grad
        module._parameters[name] = type(registered)(
            registered.to(target), **extra
        )

    try:
        nn.Module.register_parameter = register_empty_parameter
        yield
    finally:
        nn.Module.register_parameter = original_register
 # Linear replacement styles: the tensor-parallel style names accepted by
 # replace_linear_class's `style` argument.
 Style = Literal["colwise", "colwise_rep", "rowwise", "rowwise_rep", "replicate"]
 def replace_linear_class(
    linear: nn.Linear,
    style: Style = "replicate",
    quant_config: Optional["QuantizationConfig"] = None,
    prefix: str = "",
 ) -> Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]:
    """
    Build the vLLM linear layer that stands in for an nn.Linear.

    The new layer takes its input/output sizes and bias presence from
    `linear`; no weights are copied here.

    Args:
        linear: `nn.Linear` to be replaced.
        style: Tensor parallel style of the new linear:
            - "colwise": ColumnParallelLinear
            - "colwise_rep": ColumnParallelLinear with gather_output=True
            - "rowwise": RowParallelLinear
            - "rowwise_rep": RowParallelLinear with input_is_parallel=False
            - "replicate": ReplicatedLinear
            Any other string also falls back to ReplicatedLinear.
        quant_config: Quantization config for the new linear.
        prefix: The name of the layer for weight loading.

    Returns:
        The new vLLM linear layer.

    Raises:
        ValueError: If `style` is not a string.
    """
    if not isinstance(style, str):
        raise ValueError(f"Unsupported parallel style type {type(style)}, expected str")

    column_styles = {"colwise": {}, "colwise_rep": {"gather_output": True}}
    row_styles = {"rowwise": {}, "rowwise_rep": {"input_is_parallel": False}}
    if style in column_styles:
        layer_cls, extra_kwargs = ColumnParallelLinear, column_styles[style]
    elif style in row_styles:
        layer_cls, extra_kwargs = RowParallelLinear, row_styles[style]
    else:
        # "replicate" and every unrecognized style name
        layer_cls, extra_kwargs = ReplicatedLinear, {}

    # NOTE(review): assumes the linear classes in this tree accept
    # `return_bias` - confirm for the v0.6.2 constructors.
    return layer_cls(
        input_size=linear.in_features,
        output_size=linear.out_features,
        bias=linear.bias is not None,
        quant_config=quant_config,
        prefix=prefix,
        return_bias=False,  # Return tensor only, not (tensor, bias) tuple
        **extra_kwargs,
    )
 class TransformersRMSNorm(RMSNorm):
    """
    RMSNorm subclass whose forward variants hand back tensors in the shape
    of their input.

    vLLM's RMSNorm (especially the MLU backend) flattens input to 2D
    (e.g., [batch, seq, hidden] -> [batch*seq, hidden]), but transformers
    expects the batch dimension to be preserved. Each forward_* override
    records the input shape, delegates to the parent implementation, and
    views the result(s) back to that shape.

    Since this inherits from RMSNorm directly, weight loading via
    named_parameters() works correctly (weight path stays the same).
    """

    def _run_preserving_shape(self, parent_forward, x, residual):
        """Call `parent_forward(x, residual)` and restore x's original shape."""
        shape_before = x.shape
        return self._restore_shape(parent_forward(x, residual), shape_before)

    def forward_native(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ):
        return self._run_preserving_shape(super().forward_native, x, residual)

    def forward_cuda(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ):
        return self._run_preserving_shape(super().forward_cuda, x, residual)

    def forward_mlu(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ):
        return self._run_preserving_shape(super().forward_mlu, x, residual)

    def forward_xpu(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ):
        return self._run_preserving_shape(super().forward_xpu, x, residual)

    def forward_hpu(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ):
        return self._run_preserving_shape(super().forward_hpu, x, residual)

    @staticmethod
    def _restore_shape(result, orig_shape: Tuple):
        """
        View `result` back to `orig_shape` where its shape differs.

        A tuple result is processed element by element (None entries are
        passed through) and returned as a new tuple; any other result is
        treated as a single tensor.
        """
        def _as_original(tensor):
            if tensor.shape != orig_shape:
                return tensor.view(orig_shape)
            return tensor

        if not isinstance(result, tuple):
            return _as_original(result)
        return tuple(
            item if item is None else _as_original(item) for item in result
        )
 def replace_rms_norm_class(
    rms_norm: nn.Module,
    hidden_size: int,
 ) -> nn.Module:
    """
    Create a TransformersRMSNorm configured from an existing RMSNorm module.

    The returned layer is the shape-preserving vLLM RMSNorm wrapper; no
    weights are copied here.

    Args:
        rms_norm: The RMSNorm module to replace. Its epsilon is read from
            `eps`, then `variance_epsilon`, falling back to 1e-6.
        hidden_size: The hidden size of the model; used only when
            `rms_norm` has no `weight`, otherwise the first dimension of
            the weight wins.

    Returns:
        The new TransformersRMSNorm layer.
    """
    # Epsilon is stored under different attribute names across models.
    for attr_name in ("eps", "variance_epsilon"):
        eps = getattr(rms_norm, attr_name, None)
        if eps is not None:
            break
    else:
        eps = 1e-6

    # Prefer the size of the actual weight over the caller-provided size.
    weight = getattr(rms_norm, "weight", None)
    norm_size = hidden_size if weight is None else weight.size(0)
    return TransformersRMSNorm(hidden_size=norm_size, eps=eps)
 def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module):
    """Write a debug log entry naming the module and the old/new class names."""
    old_type = type(old_module).__name__
    new_type = type(new_module).__name__
    logger.debug("Replaced %s: %s -> %s", name, old_type, new_type)
 def maybe_prefix(prefix: str, name: str) -> str:
    """Return "prefix.name", or just `name` when `prefix` is empty."""
    return f"{prefix}.{name}" if prefix else name
--- a/vllm-v0.6.2/vllm/transformers_utils/dynamic_module.py
+++ b/vllm-v0.6.2/vllm/transformers_utils/dynamic_module.py
@@ -0,0 +1,76 @@
 """
 Dynamic module loading utilities for custom HuggingFace models.
 Ported from latest vLLM to support auto_map in model config.
 """
 import os
 from typing import Dict, Optional, Type, Union
 from transformers.dynamic_module_utils import (
    get_class_from_dynamic_module,
    resolve_trust_remote_code,
 )
 import vllm.envs as envs
 from vllm.logger import init_logger
 logger = init_logger(__name__)
 def try_get_class_from_dynamic_module(
    class_reference: str,
    pretrained_model_name_or_path: str,
    trust_remote_code: bool,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: Optional[bool] = None,
    proxies: Optional[Dict[str, str]] = None,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    repo_type: Optional[str] = None,
    code_revision: Optional[str] = None,
    warn_on_fail: bool = True,
    **kwargs,
 ) -> Optional[Type]:
    """
    Best-effort variant of
    `transformers.dynamic_module_utils.get_class_from_dynamic_module`.

    Any exception raised while resolving trust or loading the class is
    swallowed and None is returned instead; when `warn_on_fail` is true a
    warning with the traceback is logged first.

    This allows vLLM to load custom models that define their own
    model classes via the `auto_map` field in config.json.

    Returns:
        The loaded class, or None if loading failed.
    """
    # Download/caching options passed through verbatim.
    hub_options = dict(
        cache_dir=cache_dir,
        force_download=force_download,
        resume_download=resume_download,
        proxies=proxies,
        token=token,
        revision=revision,
        local_files_only=local_files_only,
        repo_type=repo_type,
        code_revision=code_revision,
    )
    try:
        resolve_trust_remote_code(
            trust_remote_code,
            pretrained_model_name_or_path,
            has_local_code=False,
            has_remote_code=True,
        )
        return get_class_from_dynamic_module(
            class_reference,
            pretrained_model_name_or_path,
            **hub_options,
            **kwargs,
        )
    except Exception:
        # Deliberately broad: this helper must never raise to its caller.
        hub_name = "ModelScope" if envs.VLLM_USE_MODELSCOPE else "HF Hub"
        if warn_on_fail:
            logger.warning(
                "Unable to load %s from %s on %s.",
                class_reference,
                pretrained_model_name_or_path,
                hub_name,
                exc_info=True,
            )
        return None
--- a/vllm-v0.6.2/vllm_mlu/vllm_mlu/model_executor/models/llama.py
+++ b/vllm-v0.6.2/vllm_mlu/vllm_mlu/model_executor/models/llama.py
@@ -74,18 +74,22 @@ def vllm__module_executor__models__llama__LlamaAttention__forward(
    smooth_quant_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
    qkv, _ = self.qkv_proj(hidden_states, smooth_quant_scale)
    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
    '''
    =============================
    Modify by vllm_mlu
    =============================
    @brief: pack q & k to fit tmo.apply_rotary
    @optimization: avoid redundant split operation
    '''
    if self.rope_scaling is not None and self.rope_scaling["rope_type"] == "longrope":
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(positions, q, k)
    else:
-        qk, _ = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
+        # Optimized: split qkv into [qk, v] directly, avoiding redundant split
        qk, v = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
        self.rotary_emb(positions, qk.view(-1, self.num_heads + self.num_kv_heads, self.head_dim))
        # Split qk into q and k after rotary embedding
        q, k = qk.split([self.q_size, self.kv_size], dim=-1)
    '''
    ==================
    End of MLU Hijack
Author	SHA1	Message	Date
Chranos	ebdc6fed03	fix: pass lm_head to LogitsProcessor instead of calling forward() In vLLM v0.6.2, ParallelLMHead.forward() raises RuntimeError since its weights should be used through LogitsProcessor.linear_method.apply(). Pass lm_head as first arg to LogitsProcessor which handles the hidden_states -> logits projection internally.	2026-02-06 14:21:14 +08:00
Chranos	b702adf015	testing dynamic register	2026-02-06 14:17:06 +08:00
Chranos	fba02652c8	testing dynamic register	2026-02-06 14:04:04 +08:00
Chranos	5d2f4000cc	testing dynamic register	2026-02-06 13:51:02 +08:00
Chranos	f088a6b45d	testing dynamic register	2026-02-06 13:39:13 +08:00
Chranos	d31ace279b	testing dynamic register	2026-02-05 18:57:04 +08:00
Chranos	ac2082ff36	testing dynamic register	2026-02-05 18:48:11 +08:00
Chranos	2068984bde	testing dynamic register	2026-02-05 18:36:03 +08:00
Chranos	df848b4284	testing dynamic register	2026-02-05 18:24:33 +08:00
Chranos	4d0da98b9e	testing dynamic register	2026-02-05 18:21:31 +08:00
Chranos	05605419e3	testing dynamic register	2026-02-05 18:08:05 +08:00
Chranos	332e5f71a6	testing dynamic register	2026-02-05 18:02:59 +08:00
Chranos	6e38461af6	testing dynamic register	2026-02-05 17:11:09 +08:00
Chranos	b399840b8d	testing dynamic register	2026-02-05 16:30:44 +08:00
Chranos	808b9b7c97	删除 .DS_Store	2026-02-05 16:20:54 +08:00
Chranos	6b650ae280	add gitignore	2026-02-05 16:19:33 +08:00
Chranos	92f0016e6f	add dynamic register	2026-02-05 15:53:43 +08:00
Chranos	9563c9af0d	opt llama3	2026-02-05 11:53:52 +08:00
Chranos	3b3e614cb6	opt llama3	2026-02-05 11:42:01 +08:00
Chranos	3cf13dd8c5	add ops	2026-02-04 17:51:35 +08:00