4 Commits

Author   SHA1        Message               Date
Chranos  6b650ae280  add gitignore         2026-02-05 16:19:33 +08:00
Chranos  92f0016e6f  add dynamic register  2026-02-05 15:53:43 +08:00
Chranos  9563c9af0d  opt llama3            2026-02-05 11:53:52 +08:00
Chranos  3b3e614cb6  opt llama3            2026-02-05 11:42:01 +08:00

7 changed files with 490 additions and 16 deletions

.DS_Store vendored (BIN)

Binary file not shown.

.gitignore vendored Normal file (240 additions)
View File

@@ -0,0 +1,240 @@
# version file generated by setuptools-scm
/vllm/_version.py
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
# OpenAI triton kernels copied from source
vllm/third_party/triton_kernels/*
# FlashMLA interface copied from source
vllm/third_party/flashmla/flash_mla_interface.py
# triton jit
.triton
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
/.deps/
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# generated files
**/generated/**
# uv
uv.lock
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
docs/argparse
docs/examples/*
!docs/examples/README.md
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# VSCode
.vscode/
# Claude
CLAUDE.md
.claude/
# Codex
AGENTS.md
.codex/
# Cursor
.cursor/
# DS Store
.DS_Store
# Results
*.csv
# Python pickle files
*.pkl
# Sphinx documentation
_build/
# vim swap files
*.swo
*.swp
# hip files generated by PyTorch
*.hip
*_hip*
hip_compat.h
# Benchmark dataset
benchmarks/**/*.json
# Linting
actionlint
shellcheck*/
# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*
# Ignore ep_kernels_workspace folder
ep_kernels_workspace/
# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
!vllm/benchmarks/lib/
# Generated gRPC protobuf files (compiled at build time from vllm_engine.proto)
vllm/grpc/vllm_engine_pb2.py
vllm/grpc/vllm_engine_pb2_grpc.py
vllm/grpc/vllm_engine_pb2.pyi

View File

@@ -274,7 +274,13 @@ class ModelConfig:
self, limit_mm_per_prompt: Optional[Mapping[str, int]]
) -> Optional["MultiModalConfig"]:
architectures = getattr(self.hf_config, "architectures", [])
if ModelRegistry.is_multimodal_model(architectures):
if ModelRegistry.is_multimodal_model(
architectures,
model_path=self.model,
revision=self.revision,
trust_remote_code=self.trust_remote_code,
hf_config=self.hf_config,
):
return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})
if limit_mm_per_prompt:
@@ -308,11 +314,23 @@ class ModelConfig:
def _init_attention_free(self) -> bool:
architectures = getattr(self.hf_config, "architectures", [])
return ModelRegistry.is_attention_free_model(architectures)
return ModelRegistry.is_attention_free_model(
architectures,
model_path=self.model,
revision=self.revision,
trust_remote_code=self.trust_remote_code,
hf_config=self.hf_config,
)
def _init_has_inner_state(self) -> bool:
architectures = getattr(self.hf_config, "architectures", [])
return ModelRegistry.model_has_inner_state(architectures)
return ModelRegistry.model_has_inner_state(
architectures,
model_path=self.model,
revision=self.revision,
trust_remote_code=self.trust_remote_code,
hf_config=self.hf_config,
)
def _verify_tokenizer_mode(self) -> None:
tokenizer_mode = self.tokenizer_mode.lower()
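
Taken together, these ModelConfig changes thread the model's identity (path, revision, trust_remote_code, and the loaded HF config) into every registry query, so architectures that are not built into vLLM can still be resolved through the auto_map fallback added in the registry. A minimal, hedged sketch of the new call shape; the architecture name, repository path, and auto_map entry are hypothetical, and only the keyword arguments mirror the updated registry signature:

from types import SimpleNamespace

from vllm.model_executor.models import ModelRegistry

# Stand-in for the HF config of a custom model repo (values hypothetical).
hf_config = SimpleNamespace(
    architectures=["MyCustomForCausalLM"],
    auto_map={"AutoModelForCausalLM": "modeling_custom.MyCustomForCausalLM"},
)

is_mm = ModelRegistry.is_multimodal_model(
    hf_config.architectures,
    model_path="/path/to/custom-model",   # local dir or HF repo id
    revision=None,
    trust_remote_code=True,               # the fallback only runs when this is True
    hf_config=hf_config,                  # must expose `auto_map` for the fallback
)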

View File

@@ -32,7 +32,13 @@ def get_model_architecture(
and "MixtralForCausalLM" in architectures):
architectures = ["QuantMixtralForCausalLM"]
return ModelRegistry.resolve_model_cls(architectures)
return ModelRegistry.resolve_model_cls(
architectures,
model_path=model_config.model,
revision=model_config.revision,
trust_remote_code=model_config.trust_remote_code,
hf_config=model_config.hf_config,
)
def get_architecture_class_name(model_config: ModelConfig) -> str:
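
The loader-side change follows the same pattern: get_model_architecture now forwards the ModelConfig fields so resolve_model_cls can reach the auto_map fallback. A hedged sketch of the resulting resolution order (built-in registry first, dynamic fallback second):

from vllm.model_executor.models import ModelRegistry

# 1. Built-in architectures resolve from the static registry; the new keyword
#    arguments may be omitted and default to the previous behaviour.
model_cls, arch = ModelRegistry.resolve_model_cls(["LlamaForCausalLM"])

# 2. Unknown architectures are retried through hf_config.auto_map, but only
#    when model_path, trust_remote_code=True, and hf_config are all supplied
#    (see the registry diff below); otherwise the usual unsupported-architecture
#    error is raised.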

View File

@@ -16,9 +16,11 @@ from typing import (AbstractSet, Callable, Dict, List, Optional, Tuple, Type,
import cloudpickle
import torch.nn as nn
import transformers
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module
from .interfaces import (has_inner_state, is_attention_free,
supports_multimodal, supports_pp)
@@ -157,6 +159,11 @@ _SPECULATIVE_DECODING_MODELS = {
"MedusaModel": ("medusa", "Medusa"),
"MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
}
# Transformers backend models - for custom models with auto_map
_TRANSFORMERS_BACKEND_MODELS = {
"TransformersForCausalLM": ("transformers_backend", "TransformersForCausalLM"),
}
# yapf: enable
_VLLM_MODELS = {
@@ -369,6 +376,62 @@ class _ModelRegistry:
return _try_inspect_model_cls(model_arch, self.models[model_arch])
def _try_resolve_transformers(
self,
architecture: str,
model_path: str,
revision: Optional[str],
trust_remote_code: bool,
hf_config: Optional[object] = None,
) -> Optional[Type[nn.Module]]:
"""
Try to resolve a model architecture using the Transformers backend.
This allows loading custom models that define their own implementation
via the `auto_map` field in config.json.
Returns the loaded model class if successful, None otherwise.
"""
# Check if architecture is in transformers
model_module = getattr(transformers, architecture, None)
# Get auto_map from hf_config
auto_map: Dict[str, str] = {}
if hf_config is not None:
auto_map = getattr(hf_config, "auto_map", None) or {}
if model_module is None and auto_map:
# Try to load from auto_map
# First, ensure config class is loaded
for prefix in ("AutoConfig", "AutoModel"):
for name, module in auto_map.items():
if name.startswith(prefix):
try_get_class_from_dynamic_module(
module,
model_path,
trust_remote_code=trust_remote_code,
revision=revision,
warn_on_fail=False,
)
# Now try to load the model class
for name, module in auto_map.items():
if name.startswith("AutoModel"):
model_module = try_get_class_from_dynamic_module(
module,
model_path,
trust_remote_code=trust_remote_code,
revision=revision,
warn_on_fail=True,
)
if model_module is not None:
logger.info(
"Loaded custom model class %s from auto_map",
model_module.__name__
)
return model_module
return model_module
def _normalize_archs(
self,
architectures: Union[str, List[str]],
@@ -383,6 +446,10 @@ class _ModelRegistry:
def inspect_model_cls(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> _ModelInfo:
architectures = self._normalize_archs(architectures)
@@ -391,11 +458,25 @@ class _ModelRegistry:
if model_info is not None:
return model_info
# Fallback: try to resolve using transformers backend (auto_map)
if model_path and trust_remote_code and hf_config:
for arch in architectures:
model_cls = self._try_resolve_transformers(
arch, model_path, revision, trust_remote_code, hf_config
)
if model_cls is not None:
# Create ModelInfo from the dynamically loaded class
return _ModelInfo.from_model_cls(model_cls)
return self._raise_for_unsupported(architectures)
def resolve_model_cls(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> Tuple[Type[nn.Module], str]:
architectures = self._normalize_archs(architectures)
@@ -404,39 +485,88 @@ class _ModelRegistry:
if model_cls is not None:
return (model_cls, arch)
# Fallback: try to resolve using transformers backend (auto_map)
if model_path and trust_remote_code and hf_config:
for arch in architectures:
model_cls = self._try_resolve_transformers(
arch, model_path, revision, trust_remote_code, hf_config
)
if model_cls is not None:
return (model_cls, arch)
return self._raise_for_unsupported(architectures)
def is_text_generation_model(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> bool:
return self.inspect_model_cls(architectures).is_text_generation_model
return self.inspect_model_cls(
architectures, model_path, revision, trust_remote_code, hf_config
).is_text_generation_model
def is_embedding_model(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> bool:
return self.inspect_model_cls(architectures).is_embedding_model
return self.inspect_model_cls(
architectures, model_path, revision, trust_remote_code, hf_config
).is_embedding_model
def is_multimodal_model(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> bool:
return self.inspect_model_cls(architectures).supports_multimodal
return self.inspect_model_cls(
architectures, model_path, revision, trust_remote_code, hf_config
).supports_multimodal
def is_pp_supported_model(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> bool:
return self.inspect_model_cls(architectures).supports_pp
return self.inspect_model_cls(
architectures, model_path, revision, trust_remote_code, hf_config
).supports_pp
def model_has_inner_state(self, architectures: Union[str,
List[str]]) -> bool:
return self.inspect_model_cls(architectures).has_inner_state
def model_has_inner_state(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> bool:
return self.inspect_model_cls(
architectures, model_path, revision, trust_remote_code, hf_config
).has_inner_state
def is_attention_free_model(self, architectures: Union[str,
List[str]]) -> bool:
return self.inspect_model_cls(architectures).is_attention_free
def is_attention_free_model(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> bool:
return self.inspect_model_cls(
architectures, model_path, revision, trust_remote_code, hf_config
).is_attention_free
ModelRegistry = _ModelRegistry({
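
For reference, a hedged sketch of the kind of config.json auto_map block this fallback is designed to consume; the module and class names below are hypothetical:

# Shown as a Python dict for readability; in a real repo this lives in
# config.json next to configuration_custom.py and modeling_custom.py.
custom_config_json = {
    "architectures": ["MyCustomForCausalLM"],
    "auto_map": {
        "AutoConfig": "configuration_custom.MyCustomConfig",
        "AutoModelForCausalLM": "modeling_custom.MyCustomForCausalLM",
    },
}

# _try_resolve_transformers first loads the AutoConfig/AutoModel entries quietly
# (so the custom config class gets registered), then returns the first class it
# can import from an "AutoModel*" entry; if nothing loads, it returns None and
# the registry falls back to its usual "unsupported architecture" error.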

View File

@@ -0,0 +1,76 @@
"""
Dynamic module loading utilities for custom HuggingFace models.
Ported from latest vLLM to support auto_map in model config.
"""
import os
from typing import Dict, Optional, Type, Union
from transformers.dynamic_module_utils import (
get_class_from_dynamic_module,
resolve_trust_remote_code,
)
import vllm.envs as envs
from vllm.logger import init_logger
logger = init_logger(__name__)
def try_get_class_from_dynamic_module(
class_reference: str,
pretrained_model_name_or_path: str,
trust_remote_code: bool,
cache_dir: Optional[Union[str, os.PathLike]] = None,
force_download: bool = False,
resume_download: Optional[bool] = None,
proxies: Optional[Dict[str, str]] = None,
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
local_files_only: bool = False,
repo_type: Optional[str] = None,
code_revision: Optional[str] = None,
warn_on_fail: bool = True,
**kwargs,
) -> Optional[Type]:
"""
As `transformers.dynamic_module_utils.get_class_from_dynamic_module`,
but ignoring any errors.
This allows vLLM to load custom models that define their own
model classes via the `auto_map` field in config.json.
"""
try:
resolve_trust_remote_code(
trust_remote_code,
pretrained_model_name_or_path,
has_local_code=False,
has_remote_code=True,
)
return get_class_from_dynamic_module(
class_reference,
pretrained_model_name_or_path,
cache_dir=cache_dir,
force_download=force_download,
resume_download=resume_download,
proxies=proxies,
token=token,
revision=revision,
local_files_only=local_files_only,
repo_type=repo_type,
code_revision=code_revision,
**kwargs,
)
except Exception:
location = "ModelScope" if envs.VLLM_USE_MODELSCOPE else "HF Hub"
if warn_on_fail:
logger.warning(
"Unable to load %s from %s on %s.",
class_reference,
pretrained_model_name_or_path,
location,
exc_info=True,
)
return None
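
A hedged usage sketch of the new helper; the class reference and repository path below are hypothetical:

from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module

# Class references follow the transformers auto_map convention:
# "<module file without .py>.<class name>", resolved inside the model repo.
model_cls = try_get_class_from_dynamic_module(
    "modeling_custom.MyCustomForCausalLM",   # hypothetical auto_map entry
    "/path/to/custom-model",                 # local dir or HF repo id
    trust_remote_code=True,
    revision=None,
    warn_on_fail=True,   # log the failure (with traceback) instead of raising
)
if model_cls is None:
    # Unlike the transformers helper, errors are swallowed and surfaced as None,
    # letting the registry fall through to its normal unsupported-arch handling.
    pass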

View File

@@ -74,18 +74,22 @@ def vllm__module_executor__models__llama__LlamaAttention__forward(
smooth_quant_scale: Optional[torch.Tensor] = None,
) -> torch.Tensor:
qkv, _ = self.qkv_proj(hidden_states, smooth_quant_scale)
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
'''
=============================
Modify by vllm_mlu
=============================
@brief: pack q & k to fit tmo.apply_rotary
@optimization: avoid redundant split operation
'''
if self.rope_scaling is not None and self.rope_scaling["rope_type"] == "longrope":
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
q, k = self.rotary_emb(positions, q, k)
else:
qk, _ = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
# Optimized: split qkv into [qk, v] directly, avoiding redundant split
qk, v = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
self.rotary_emb(positions, qk.view(-1, self.num_heads + self.num_kv_heads, self.head_dim))
# Split qk into q and k after rotary embedding
q, k = qk.split([self.q_size, self.kv_size], dim=-1)
'''
==================
End of MLU Hijack