Compare commits: v0.0.2...31e7cd3bf9 (6 commits)

| SHA1 |
|---|
| 31e7cd3bf9 |
| 6b650ae280 |
| 92f0016e6f |
| 9563c9af0d |
| 3b3e614cb6 |
| 3cf13dd8c5 |
.gitignore (vendored, new file, 240 lines)
@@ -0,0 +1,240 @@
# version file generated by setuptools-scm
/vllm/_version.py

# vllm-flash-attn built from source
vllm/vllm_flash_attn/*

# OpenAI triton kernels copied from source
vllm/third_party/triton_kernels/*

# FlashMLA interface copied from source
vllm/third_party/flashmla/flash_mla_interface.py

# triton jit
.triton

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
/.deps/

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# generated files
**/generated/**

# uv
uv.lock

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site
docs/argparse
docs/examples/*
!docs/examples/README.md

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# VSCode
.vscode/

# Claude
CLAUDE.md
.claude/

# Codex
AGENTS.md
.codex/

# Cursor
.cursor/

# DS Store
.DS_Store

# Results
*.csv

# Python pickle files
*.pkl

# Sphinx documentation
_build/

# vim swap files
*.swo
*.swp

# hip files generated by PyTorch
*.hip
*_hip*
hip_compat.h

# Benchmark dataset
benchmarks/**/*.json

# Linting
actionlint
shellcheck*/

# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*

# Ignore ep_kernels_workspace folder
ep_kernels_workspace/

# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
!vllm/benchmarks/lib/

# Generated gRPC protobuf files (compiled at build time from vllm_engine.proto)
vllm/grpc/vllm_engine_pb2.py
vllm/grpc/vllm_engine_pb2_grpc.py
vllm/grpc/vllm_engine_pb2.pyi
@@ -3,6 +3,7 @@
# Cambricon MLU370 text generation

This model test framework adapts the Qwen1.5-1.8B-Chat model on Cambricon MLU370 (X8/X4) accelerator cards, built on the vLLM inference engine.

* Qwen1.5-1.8B-Chat is a lightweight Chinese/English chat model of roughly 1.8 billion parameters from the Tongyi Qianwen (Qwen) series, designed for efficient inference and multi-scenario chat interaction.
* Llama-2-7b-chat-hf: the 7-billion-parameter, chat-optimized open-source model in Meta's LLaMA 2 series, suited to multi-turn chat and general tasks.
* ChatGLM3-6B: the 6-billion-parameter Chinese/English bilingual chat model in the third-generation ChatGLM series from Zhipu AI, supporting reasoning, code, and multi-task capabilities.
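For context, a minimal offline-inference sketch of how such a chat model is typically driven through vLLM's Python API; the model ID and sampling settings below are illustrative assumptions, not the exact configuration used by this test framework.

```python
# Minimal sketch using vLLM's generic offline-inference API.
# Model ID and sampling parameters are illustrative, not the framework's actual settings.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen1.5-1.8B-Chat", trust_remote_code=True)
sampling = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=128)

outputs = llm.generate(["Briefly introduce the Cambricon MLU370 accelerator."], sampling)
for output in outputs:
    print(output.outputs[0].text)
```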
@@ -274,7 +274,13 @@ class ModelConfig:
         self, limit_mm_per_prompt: Optional[Mapping[str, int]]
     ) -> Optional["MultiModalConfig"]:
         architectures = getattr(self.hf_config, "architectures", [])
-        if ModelRegistry.is_multimodal_model(architectures):
+        if ModelRegistry.is_multimodal_model(
+            architectures,
+            model_path=self.model,
+            revision=self.revision,
+            trust_remote_code=self.trust_remote_code,
+            hf_config=self.hf_config,
+        ):
             return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})

         if limit_mm_per_prompt:
@@ -308,11 +314,23 @@ class ModelConfig:

     def _init_attention_free(self) -> bool:
         architectures = getattr(self.hf_config, "architectures", [])
-        return ModelRegistry.is_attention_free_model(architectures)
+        return ModelRegistry.is_attention_free_model(
+            architectures,
+            model_path=self.model,
+            revision=self.revision,
+            trust_remote_code=self.trust_remote_code,
+            hf_config=self.hf_config,
+        )

     def _init_has_inner_state(self) -> bool:
         architectures = getattr(self.hf_config, "architectures", [])
-        return ModelRegistry.model_has_inner_state(architectures)
+        return ModelRegistry.model_has_inner_state(
+            architectures,
+            model_path=self.model,
+            revision=self.revision,
+            trust_remote_code=self.trust_remote_code,
+            hf_config=self.hf_config,
+        )

     def _verify_tokenizer_mode(self) -> None:
         tokenizer_mode = self.tokenizer_mode.lower()
@@ -32,7 +32,13 @@ def get_model_architecture(
             and "MixtralForCausalLM" in architectures):
         architectures = ["QuantMixtralForCausalLM"]

-    return ModelRegistry.resolve_model_cls(architectures)
+    return ModelRegistry.resolve_model_cls(
+        architectures,
+        model_path=model_config.model,
+        revision=model_config.revision,
+        trust_remote_code=model_config.trust_remote_code,
+        hf_config=model_config.hf_config,
+    )


 def get_architecture_class_name(model_config: ModelConfig) -> str:
@@ -16,9 +16,11 @@ from typing import (AbstractSet, Callable, Dict, List, Optional, Tuple, Type,

 import cloudpickle
 import torch.nn as nn
+import transformers

 from vllm.logger import init_logger
 from vllm.platforms import current_platform
+from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module

 from .interfaces import (has_inner_state, is_attention_free,
                          supports_multimodal, supports_pp)
@@ -157,6 +159,11 @@ _SPECULATIVE_DECODING_MODELS = {
     "MedusaModel": ("medusa", "Medusa"),
     "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
 }
+
+# Transformers backend models - for custom models with auto_map
+_TRANSFORMERS_BACKEND_MODELS = {
+    "TransformersForCausalLM": ("transformers_backend", "TransformersForCausalLM"),
+}
 # yapf: enable

 _VLLM_MODELS = {
@@ -369,6 +376,62 @@ class _ModelRegistry:

         return _try_inspect_model_cls(model_arch, self.models[model_arch])

+    def _try_resolve_transformers(
+        self,
+        architecture: str,
+        model_path: str,
+        revision: Optional[str],
+        trust_remote_code: bool,
+        hf_config: Optional[object] = None,
+    ) -> Optional[Type[nn.Module]]:
+        """
+        Try to resolve a model architecture using the Transformers backend.
+        This allows loading custom models that define their own implementation
+        via the `auto_map` field in config.json.
+
+        Returns the loaded model class if successful, None otherwise.
+        """
+        # Check if architecture is in transformers
+        model_module = getattr(transformers, architecture, None)
+
+        # Get auto_map from hf_config
+        auto_map: Dict[str, str] = {}
+        if hf_config is not None:
+            auto_map = getattr(hf_config, "auto_map", None) or {}
+
+        if model_module is None and auto_map:
+            # Try to load from auto_map
+            # First, ensure config class is loaded
+            for prefix in ("AutoConfig", "AutoModel"):
+                for name, module in auto_map.items():
+                    if name.startswith(prefix):
+                        try_get_class_from_dynamic_module(
+                            module,
+                            model_path,
+                            trust_remote_code=trust_remote_code,
+                            revision=revision,
+                            warn_on_fail=False,
+                        )
+
+            # Now try to load the model class
+            for name, module in auto_map.items():
+                if name.startswith("AutoModel"):
+                    model_module = try_get_class_from_dynamic_module(
+                        module,
+                        model_path,
+                        trust_remote_code=trust_remote_code,
+                        revision=revision,
+                        warn_on_fail=True,
+                    )
+                    if model_module is not None:
+                        logger.info(
+                            "Loaded custom model class %s from auto_map",
+                            model_module.__name__
+                        )
+                        return model_module
+
+        return model_module
+
     def _normalize_archs(
         self,
         architectures: Union[str, List[str]],
@@ -383,6 +446,10 @@ class _ModelRegistry:
     def inspect_model_cls(
         self,
         architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
     ) -> _ModelInfo:
         architectures = self._normalize_archs(architectures)

@@ -391,11 +458,25 @@ class _ModelRegistry:
             if model_info is not None:
                 return model_info

+        # Fallback: try to resolve using transformers backend (auto_map)
+        if model_path and trust_remote_code and hf_config:
+            for arch in architectures:
+                model_cls = self._try_resolve_transformers(
+                    arch, model_path, revision, trust_remote_code, hf_config
+                )
+                if model_cls is not None:
+                    # Create ModelInfo from the dynamically loaded class
+                    return _ModelInfo.from_model_cls(model_cls)
+
         return self._raise_for_unsupported(architectures)

     def resolve_model_cls(
         self,
         architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
     ) -> Tuple[Type[nn.Module], str]:
         architectures = self._normalize_archs(architectures)

@@ -404,39 +485,88 @@ class _ModelRegistry:
             if model_cls is not None:
                 return (model_cls, arch)

+        # Fallback: try to resolve using transformers backend (auto_map)
+        if model_path and trust_remote_code and hf_config:
+            for arch in architectures:
+                model_cls = self._try_resolve_transformers(
+                    arch, model_path, revision, trust_remote_code, hf_config
+                )
+                if model_cls is not None:
+                    return (model_cls, arch)
+
         return self._raise_for_unsupported(architectures)

     def is_text_generation_model(
         self,
         architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
     ) -> bool:
-        return self.inspect_model_cls(architectures).is_text_generation_model
+        return self.inspect_model_cls(
+            architectures, model_path, revision, trust_remote_code, hf_config
+        ).is_text_generation_model

     def is_embedding_model(
         self,
         architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
     ) -> bool:
-        return self.inspect_model_cls(architectures).is_embedding_model
+        return self.inspect_model_cls(
+            architectures, model_path, revision, trust_remote_code, hf_config
+        ).is_embedding_model

     def is_multimodal_model(
         self,
         architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
     ) -> bool:
-        return self.inspect_model_cls(architectures).supports_multimodal
+        return self.inspect_model_cls(
+            architectures, model_path, revision, trust_remote_code, hf_config
+        ).supports_multimodal

     def is_pp_supported_model(
         self,
         architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
     ) -> bool:
-        return self.inspect_model_cls(architectures).supports_pp
+        return self.inspect_model_cls(
+            architectures, model_path, revision, trust_remote_code, hf_config
+        ).supports_pp

-    def model_has_inner_state(self, architectures: Union[str,
-                                                          List[str]]) -> bool:
-        return self.inspect_model_cls(architectures).has_inner_state
+    def model_has_inner_state(
+        self,
+        architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
+    ) -> bool:
+        return self.inspect_model_cls(
+            architectures, model_path, revision, trust_remote_code, hf_config
+        ).has_inner_state

-    def is_attention_free_model(self, architectures: Union[str,
-                                                            List[str]]) -> bool:
-        return self.inspect_model_cls(architectures).is_attention_free
+    def is_attention_free_model(
+        self,
+        architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
+    ) -> bool:
+        return self.inspect_model_cls(
+            architectures, model_path, revision, trust_remote_code, hf_config
+        ).is_attention_free


 ModelRegistry = _ModelRegistry({
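Taken together, the registry changes add an auto_map fallback: when an architecture string is not found among the built-in model classes and the caller passes model_path, trust_remote_code, and hf_config, resolution falls through to _try_resolve_transformers. A hedged sketch of how a caller might exercise that path; the checkpoint path and architecture name are hypothetical placeholders.

```python
# Hedged sketch of the auto_map fallback path. "/path/to/custom-model" is a
# hypothetical local checkpoint whose config.json defines an auto_map and an
# architecture name that is absent from vLLM's built-in registry.
from transformers import AutoConfig

from vllm.model_executor.models import ModelRegistry

model_path = "/path/to/custom-model"
hf_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

model_cls, arch = ModelRegistry.resolve_model_cls(
    getattr(hf_config, "architectures", []),
    model_path=model_path,
    revision=None,
    trust_remote_code=True,
    hf_config=hf_config,
)
print(f"Resolved {arch} -> {model_cls.__name__}")
```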
vllm-v0.6.2/vllm/transformers_utils/dynamic_module.py (new file, 76 lines)
@@ -0,0 +1,76 @@
"""
Dynamic module loading utilities for custom HuggingFace models.
Ported from latest vLLM to support auto_map in model config.
"""
import os
from typing import Dict, Optional, Type, Union

from transformers.dynamic_module_utils import (
    get_class_from_dynamic_module,
    resolve_trust_remote_code,
)

import vllm.envs as envs
from vllm.logger import init_logger

logger = init_logger(__name__)


def try_get_class_from_dynamic_module(
    class_reference: str,
    pretrained_model_name_or_path: str,
    trust_remote_code: bool,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: Optional[bool] = None,
    proxies: Optional[Dict[str, str]] = None,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    repo_type: Optional[str] = None,
    code_revision: Optional[str] = None,
    warn_on_fail: bool = True,
    **kwargs,
) -> Optional[Type]:
    """
    As `transformers.dynamic_module_utils.get_class_from_dynamic_module`,
    but ignoring any errors.

    This allows vLLM to load custom models that define their own
    model classes via the `auto_map` field in config.json.
    """
    try:
        resolve_trust_remote_code(
            trust_remote_code,
            pretrained_model_name_or_path,
            has_local_code=False,
            has_remote_code=True,
        )

        return get_class_from_dynamic_module(
            class_reference,
            pretrained_model_name_or_path,
            cache_dir=cache_dir,
            force_download=force_download,
            resume_download=resume_download,
            proxies=proxies,
            token=token,
            revision=revision,
            local_files_only=local_files_only,
            repo_type=repo_type,
            code_revision=code_revision,
            **kwargs,
        )
    except Exception:
        location = "ModelScope" if envs.VLLM_USE_MODELSCOPE else "HF Hub"

        if warn_on_fail:
            logger.warning(
                "Unable to load %s from %s on %s.",
                class_reference,
                pretrained_model_name_or_path,
                location,
                exc_info=True,
            )

        return None
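A hedged usage sketch of the helper above; the class reference string and checkpoint path are placeholders for a repository whose config.json declares an auto_map entry, not real identifiers.

```python
# Placeholders only: the class reference and path do not refer to a real checkpoint.
from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module

model_cls = try_get_class_from_dynamic_module(
    "modeling_custom.MyCustomForCausalLM",
    "/path/to/custom-model",
    trust_remote_code=True,
    revision=None,
    warn_on_fail=True,
)
if model_cls is None:
    print("Custom class could not be loaded; the registry falls back to built-in lookup.")
```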
@@ -74,18 +74,22 @@ def vllm__module_executor__models__llama__LlamaAttention__forward(
     smooth_quant_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     qkv, _ = self.qkv_proj(hidden_states, smooth_quant_scale)
-    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
     '''
     =============================
     Modify by vllm_mlu
     =============================
     @brief: pack q & k to fit tmo.apply_rotary
+    @optimization: avoid redundant split operation
     '''
     if self.rope_scaling is not None and self.rope_scaling["rope_type"] == "longrope":
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
     else:
-        qk, _ = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
+        # Optimized: split qkv into [qk, v] directly, avoiding redundant split
+        qk, v = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
         self.rotary_emb(positions, qk.view(-1, self.num_heads + self.num_kv_heads, self.head_dim))
+        # Split qk into q and k after rotary embedding
+        q, k = qk.split([self.q_size, self.kv_size], dim=-1)
     '''
     ==================
     End of MLU Hijack
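The change above relies on the packed qk chunk being equivalent to splitting q and k separately: splitting qkv into [q+k, v] and splitting qk afterwards yields the same tensors as the original three-way split. A standalone check of that equivalence with made-up sizes (not the model's real head dimensions):

```python
# Equivalence check with arbitrary example sizes; the real q_size/kv_size come
# from the attention layer's head configuration.
import torch

q_size, kv_size = 8, 4
qkv = torch.randn(2, q_size + 2 * kv_size)

# Original path: one three-way split.
q0, k0, v0 = qkv.split([q_size, kv_size, kv_size], dim=-1)

# Optimized path: split off v, keep q and k packed, split them after rotary embedding.
qk, v1 = qkv.split([q_size + kv_size, kv_size], dim=-1)
q1, k1 = qk.split([q_size, kv_size], dim=-1)

assert torch.equal(q0, q1) and torch.equal(k0, k1) and torch.equal(v0, v1)
```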