4 Commits

Author   SHA1        Message               Date
Chranos  6b650ae280  add gitignore         2026-02-05 16:19:33 +08:00
Chranos  92f0016e6f  add dynamic register  2026-02-05 15:53:43 +08:00
Chranos  9563c9af0d  opt llama3            2026-02-05 11:53:52 +08:00
Chranos  3b3e614cb6  opt llama3            2026-02-05 11:42:01 +08:00

7 changed files with 490 additions and 16 deletions

.DS_Store vendored (BIN)

Binary file not shown.

.gitignore vendored Normal file (240 additions)
View File

@@ -0,0 +1,240 @@
# version file generated by setuptools-scm
/vllm/_version.py
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
# OpenAI triton kernels copied from source
vllm/third_party/triton_kernels/*
# FlashMLA interface copied from source
vllm/third_party/flashmla/flash_mla_interface.py
# triton jit
.triton
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
/.deps/
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# generated files
**/generated/**
# uv
uv.lock
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
docs/argparse
docs/examples/*
!docs/examples/README.md
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# VSCode
.vscode/
# Claude
CLAUDE.md
.claude/
# Codex
AGENTS.md
.codex/
# Cursor
.cursor/
# DS Store
.DS_Store
# Results
*.csv
# Python pickle files
*.pkl
# Sphinx documentation
_build/
# vim swap files
*.swo
*.swp
# hip files generated by PyTorch
*.hip
*_hip*
hip_compat.h
# Benchmark dataset
benchmarks/**/*.json
# Linting
actionlint
shellcheck*/
# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*
# Ignore ep_kernels_workspace folder
ep_kernels_workspace/
# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
!vllm/benchmarks/lib/
# Generated gRPC protobuf files (compiled at build time from vllm_engine.proto)
vllm/grpc/vllm_engine_pb2.py
vllm/grpc/vllm_engine_pb2_grpc.py
vllm/grpc/vllm_engine_pb2.pyi

View File

@@ -274,7 +274,13 @@ class ModelConfig:
self, limit_mm_per_prompt: Optional[Mapping[str, int]]
) -> Optional["MultiModalConfig"]:
architectures = getattr(self.hf_config, "architectures", [])
if ModelRegistry.is_multimodal_model(architectures):
if ModelRegistry.is_multimodal_model(
architectures,
model_path=self.model,
revision=self.revision,
trust_remote_code=self.trust_remote_code,
hf_config=self.hf_config,
):
return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})
if limit_mm_per_prompt:
@@ -308,11 +314,23 @@ class ModelConfig:
def _init_attention_free(self) -> bool:
architectures = getattr(self.hf_config, "architectures", [])
return ModelRegistry.is_attention_free_model(architectures)
return ModelRegistry.is_attention_free_model(
architectures,
model_path=self.model,
revision=self.revision,
trust_remote_code=self.trust_remote_code,
hf_config=self.hf_config,
)
def _init_has_inner_state(self) -> bool:
architectures = getattr(self.hf_config, "architectures", [])
return ModelRegistry.model_has_inner_state(architectures)
return ModelRegistry.model_has_inner_state(
architectures,
model_path=self.model,
revision=self.revision,
trust_remote_code=self.trust_remote_code,
hf_config=self.hf_config,
)
def _verify_tokenizer_mode(self) -> None:
tokenizer_mode = self.tokenizer_mode.lower()
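
Taken together, these ModelConfig changes thread the model's identity (path, revision, trust_remote_code, and the loaded HF config) into every registry query, so architectures that are not built into vLLM can still be resolved through the auto_map fallback added in the registry. A minimal, hedged sketch of the new call shape; the architecture name, repository path, and auto_map entry are hypothetical, and only the keyword arguments mirror the updated registry signature:

from types import SimpleNamespace

from vllm.model_executor.models import ModelRegistry

# Stand-in for the HF config of a custom model repo (values hypothetical).
hf_config = SimpleNamespace(
    architectures=["MyCustomForCausalLM"],
    auto_map={"AutoModelForCausalLM": "modeling_custom.MyCustomForCausalLM"},
)

is_mm = ModelRegistry.is_multimodal_model(
    hf_config.architectures,
    model_path="/path/to/custom-model",   # local dir or HF repo id
    revision=None,
    trust_remote_code=True,               # the fallback only runs when this is True
    hf_config=hf_config,                  # must expose `auto_map` for the fallback
)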

View File

@@ -32,7 +32,13 @@ def get_model_architecture(
and "MixtralForCausalLM" in architectures):
architectures = ["QuantMixtralForCausalLM"]
return ModelRegistry.resolve_model_cls(architectures)
return ModelRegistry.resolve_model_cls(
architectures,
model_path=model_config.model,
revision=model_config.revision,
trust_remote_code=model_config.trust_remote_code,
hf_config=model_config.hf_config,
)
def get_architecture_class_name(model_config: ModelConfig) -> str:
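
The loader-side change follows the same pattern: get_model_architecture now forwards the ModelConfig fields so resolve_model_cls can reach the auto_map fallback. A hedged sketch of the resulting resolution order (built-in registry first, dynamic fallback second):

from vllm.model_executor.models import ModelRegistry

# 1. Built-in architectures resolve from the static registry; the new keyword
#    arguments may be omitted and default to the previous behaviour.
model_cls, arch = ModelRegistry.resolve_model_cls(["LlamaForCausalLM"])

# 2. Unknown architectures are retried through hf_config.auto_map, but only
#    when model_path, trust_remote_code=True, and hf_config are all supplied
#    (see the registry diff below); otherwise the usual unsupported-architecture
#    error is raised.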

View File

@@ -16,9 +16,11 @@ from typing import (AbstractSet, Callable, Dict, List, Optional, Tuple, Type,
import cloudpickle
import torch.nn as nn
import transformers
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module
from .interfaces import (has_inner_state, is_attention_free,
supports_multimodal, supports_pp)
@@ -157,6 +159,11 @@ _SPECULATIVE_DECODING_MODELS = {
"MedusaModel": ("medusa", "Medusa"),
"MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
}
# Transformers backend models - for custom models with auto_map
_TRANSFORMERS_BACKEND_MODELS = {
"TransformersForCausalLM": ("transformers_backend", "TransformersForCausalLM"),
}
# yapf: enable
_VLLM_MODELS = {
@@ -369,6 +376,62 @@ class _ModelRegistry:
return _try_inspect_model_cls(model_arch, self.models[model_arch])
def _try_resolve_transformers(
self,
architecture: str,
model_path: str,
revision: Optional[str],
trust_remote_code: bool,
hf_config: Optional[object] = None,
) -> Optional[Type[nn.Module]]:
"""
Try to resolve a model architecture using the Transformers backend.
This allows loading custom models that define their own implementation
via the `auto_map` field in config.json.
Returns the loaded model class if successful, None otherwise.
"""
# Check if architecture is in transformers
model_module = getattr(transformers, architecture, None)
# Get auto_map from hf_config
auto_map: Dict[str, str] = {}
if hf_config is not None:
auto_map = getattr(hf_config, "auto_map", None) or {}
if model_module is None and auto_map:
# Try to load from auto_map
# First, ensure config class is loaded
for prefix in ("AutoConfig", "AutoModel"):
for name, module in auto_map.items():
if name.startswith(prefix):
try_get_class_from_dynamic_module(
module,
model_path,
trust_remote_code=trust_remote_code,
revision=revision,
warn_on_fail=False,
)
# Now try to load the model class
for name, module in auto_map.items():
if name.startswith("AutoModel"):
model_module = try_get_class_from_dynamic_module(
module,
model_path,
trust_remote_code=trust_remote_code,
revision=revision,
warn_on_fail=True,
)
if model_module is not None:
logger.info(
"Loaded custom model class %s from auto_map",
model_module.__name__
)
return model_module
return model_module
def _normalize_archs(
self,
architectures: Union[str, List[str]],
@@ -383,6 +446,10 @@ class _ModelRegistry:
def inspect_model_cls(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> _ModelInfo:
architectures = self._normalize_archs(architectures)
@@ -391,11 +458,25 @@ class _ModelRegistry:
if model_info is not None:
return model_info
# Fallback: try to resolve using transformers backend (auto_map)
if model_path and trust_remote_code and hf_config:
for arch in architectures:
model_cls = self._try_resolve_transformers(
arch, model_path, revision, trust_remote_code, hf_config
)
if model_cls is not None:
# Create ModelInfo from the dynamically loaded class
return _ModelInfo.from_model_cls(model_cls)
return self._raise_for_unsupported(architectures)
def resolve_model_cls(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> Tuple[Type[nn.Module], str]:
architectures = self._normalize_archs(architectures)
@@ -404,39 +485,88 @@ class _ModelRegistry:
if model_cls is not None:
return (model_cls, arch)
# Fallback: try to resolve using transformers backend (auto_map)
if model_path and trust_remote_code and hf_config:
for arch in architectures:
model_cls = self._try_resolve_transformers(
arch, model_path, revision, trust_remote_code, hf_config
)
if model_cls is not None:
return (model_cls, arch)
return self._raise_for_unsupported(architectures)
def is_text_generation_model(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> bool:
return self.inspect_model_cls(architectures).is_text_generation_model
return self.inspect_model_cls(
architectures, model_path, revision, trust_remote_code, hf_config
).is_text_generation_model
def is_embedding_model(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> bool:
return self.inspect_model_cls(architectures).is_embedding_model
return self.inspect_model_cls(
architectures, model_path, revision, trust_remote_code, hf_config
).is_embedding_model
def is_multimodal_model(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> bool:
return self.inspect_model_cls(architectures).supports_multimodal
return self.inspect_model_cls(
architectures, model_path, revision, trust_remote_code, hf_config
).supports_multimodal
def is_pp_supported_model(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> bool:
return self.inspect_model_cls(architectures).supports_pp
return self.inspect_model_cls(
architectures, model_path, revision, trust_remote_code, hf_config
).supports_pp
def model_has_inner_state(self, architectures: Union[str,
List[str]]) -> bool:
return self.inspect_model_cls(architectures).has_inner_state
def model_has_inner_state(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> bool:
return self.inspect_model_cls(
architectures, model_path, revision, trust_remote_code, hf_config
).has_inner_state
def is_attention_free_model(self, architectures: Union[str,
List[str]]) -> bool:
return self.inspect_model_cls(architectures).is_attention_free
def is_attention_free_model(
self,
architectures: Union[str, List[str]],
model_path: Optional[str] = None,
revision: Optional[str] = None,
trust_remote_code: bool = False,
hf_config: Optional[object] = None,
) -> bool:
return self.inspect_model_cls(
architectures, model_path, revision, trust_remote_code, hf_config
).is_attention_free
ModelRegistry = _ModelRegistry({
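
For reference, a hedged sketch of the kind of config.json auto_map block this fallback is designed to consume; the module and class names below are hypothetical:

# Shown as a Python dict for readability; in a real repo this lives in
# config.json next to configuration_custom.py and modeling_custom.py.
custom_config_json = {
    "architectures": ["MyCustomForCausalLM"],
    "auto_map": {
        "AutoConfig": "configuration_custom.MyCustomConfig",
        "AutoModelForCausalLM": "modeling_custom.MyCustomForCausalLM",
    },
}

# _try_resolve_transformers first loads the AutoConfig/AutoModel entries quietly
# (so the custom config class gets registered), then returns the first class it
# can import from an "AutoModel*" entry; if nothing loads, it returns None and
# the registry falls back to its usual "unsupported architecture" error.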

View File

@@ -0,0 +1,76 @@
"""
Dynamic module loading utilities for custom HuggingFace models.
Ported from latest vLLM to support auto_map in model config.
"""
import os
from typing import Dict, Optional, Type, Union
from transformers.dynamic_module_utils import (
get_class_from_dynamic_module,
resolve_trust_remote_code,
)
import vllm.envs as envs
from vllm.logger import init_logger
logger = init_logger(__name__)
def try_get_class_from_dynamic_module(
class_reference: str,
pretrained_model_name_or_path: str,
trust_remote_code: bool,
cache_dir: Optional[Union[str, os.PathLike]] = None,
force_download: bool = False,
resume_download: Optional[bool] = None,
proxies: Optional[Dict[str, str]] = None,
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
local_files_only: bool = False,
repo_type: Optional[str] = None,
code_revision: Optional[str] = None,
warn_on_fail: bool = True,
**kwargs,
) -> Optional[Type]:
"""
As `transformers.dynamic_module_utils.get_class_from_dynamic_module`,
but ignoring any errors.
This allows vLLM to load custom models that define their own
model classes via the `auto_map` field in config.json.
"""
try:
resolve_trust_remote_code(
trust_remote_code,
pretrained_model_name_or_path,
has_local_code=False,
has_remote_code=True,
)
return get_class_from_dynamic_module(
class_reference,
pretrained_model_name_or_path,
cache_dir=cache_dir,
force_download=force_download,
resume_download=resume_download,
proxies=proxies,
token=token,
revision=revision,
local_files_only=local_files_only,
repo_type=repo_type,
code_revision=code_revision,
**kwargs,
)
except Exception:
location = "ModelScope" if envs.VLLM_USE_MODELSCOPE else "HF Hub"
if warn_on_fail:
logger.warning(
"Unable to load %s from %s on %s.",
class_reference,
pretrained_model_name_or_path,
location,
exc_info=True,
)
return None
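
A hedged usage sketch of the new helper; the class reference and repository path below are hypothetical:

from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module

# Class references follow the transformers auto_map convention:
# "<module file without .py>.<class name>", resolved inside the model repo.
model_cls = try_get_class_from_dynamic_module(
    "modeling_custom.MyCustomForCausalLM",   # hypothetical auto_map entry
    "/path/to/custom-model",                 # local dir or HF repo id
    trust_remote_code=True,
    revision=None,
    warn_on_fail=True,   # log the failure (with traceback) instead of raising
)
if model_cls is None:
    # Unlike the transformers helper, errors are swallowed and surfaced as None,
    # letting the registry fall through to its normal unsupported-arch handling.
    pass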

View File

@@ -74,18 +74,22 @@ def vllm__module_executor__models__llama__LlamaAttention__forward(
smooth_quant_scale: Optional[torch.Tensor] = None,
) -> torch.Tensor:
qkv, _ = self.qkv_proj(hidden_states, smooth_quant_scale)
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
'''
=============================
Modify by vllm_mlu
=============================
@brief: pack q & k to fit tmo.apply_rotary
@optimization: avoid redundant split operation
'''
if self.rope_scaling is not None and self.rope_scaling["rope_type"] == "longrope":
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
q, k = self.rotary_emb(positions, q, k)
else:
qk, _ = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
# Optimized: split qkv into [qk, v] directly, avoiding redundant split
qk, v = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
self.rotary_emb(positions, qk.view(-1, self.num_heads + self.num_kv_heads, self.head_dim))
# Split qk into q and k after rotary embedding
q, k = qk.split([self.q_size, self.kv_size], dim=-1)
'''
==================
End of MLU Hijack