Compare commits: v0.0.2...31e7cd3bf9 (6 commits)

| SHA1 |
|---|
| 31e7cd3bf9 |
| 6b650ae280 |
| 92f0016e6f |
| 9563c9af0d |
| 3b3e614cb6 |
| 3cf13dd8c5 |
.gitignore (vendored, new file, 240 lines)
@@ -0,0 +1,240 @@
# version file generated by setuptools-scm
/vllm/_version.py

# vllm-flash-attn built from source
vllm/vllm_flash_attn/*

# OpenAI triton kernels copied from source
vllm/third_party/triton_kernels/*

# FlashMLA interface copied from source
vllm/third_party/flashmla/flash_mla_interface.py

# triton jit
.triton

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
/.deps/

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# generated files
**/generated/**

# uv
uv.lock

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site
docs/argparse
docs/examples/*
!docs/examples/README.md

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# VSCode
.vscode/

# Claude
CLAUDE.md
.claude/

# Codex
AGENTS.md
.codex/

# Cursor
.cursor/

# DS Store
.DS_Store

# Results
*.csv

# Python pickle files
*.pkl

# Sphinx documentation
_build/

# vim swap files
*.swo
*.swp

# hip files generated by PyTorch
*.hip
*_hip*
hip_compat.h

# Benchmark dataset
benchmarks/**/*.json

# Linting
actionlint
shellcheck*/

# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*

# Ignore ep_kernels_workspace folder
ep_kernels_workspace/

# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
!vllm/benchmarks/lib/

# Generated gRPC protobuf files (compiled at build time from vllm_engine.proto)
vllm/grpc/vllm_engine_pb2.py
vllm/grpc/vllm_engine_pb2_grpc.py
vllm/grpc/vllm_engine_pb2.pyi
@@ -3,6 +3,7 @@
# Cambricon MLU370 text generation

This model test framework adapts the Qwen1.5-1.8B-Chat model on Cambricon MLU370 (X8/X4) accelerator cards, built on the vLLM inference engine.

* Qwen1.5-1.8B-Chat is a lightweight Chinese/English chat model of roughly 1.8 billion parameters from the Tongyi Qianwen (Qwen) series, designed for efficient inference and multi-scenario chat interaction.
* Llama-2-7b-chat-hf: the 7-billion-parameter, chat-optimized open-source model in Meta's LLaMA 2 series, suited to multi-turn chat and general tasks.
* ChatGLM3-6B: the 6-billion-parameter Chinese/English bilingual chat model in the third-generation ChatGLM series from Zhipu AI, supporting reasoning, code, and multi-task capabilities.
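For context, a minimal offline-inference sketch of how such a chat model is typically driven through vLLM's Python API; the model ID and sampling settings below are illustrative assumptions, not the exact configuration used by this test framework.

```python
# Minimal sketch using vLLM's generic offline-inference API.
# Model ID and sampling parameters are illustrative, not the framework's actual settings.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen1.5-1.8B-Chat", trust_remote_code=True)
sampling = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=128)

outputs = llm.generate(["Briefly introduce the Cambricon MLU370 accelerator."], sampling)
for output in outputs:
    print(output.outputs[0].text)
```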
@@ -274,7 +274,13 @@ class ModelConfig:
         self, limit_mm_per_prompt: Optional[Mapping[str, int]]
     ) -> Optional["MultiModalConfig"]:
         architectures = getattr(self.hf_config, "architectures", [])
-        if ModelRegistry.is_multimodal_model(architectures):
+        if ModelRegistry.is_multimodal_model(
+            architectures,
+            model_path=self.model,
+            revision=self.revision,
+            trust_remote_code=self.trust_remote_code,
+            hf_config=self.hf_config,
+        ):
             return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})

         if limit_mm_per_prompt:
@@ -308,11 +314,23 @@ class ModelConfig:

     def _init_attention_free(self) -> bool:
         architectures = getattr(self.hf_config, "architectures", [])
-        return ModelRegistry.is_attention_free_model(architectures)
+        return ModelRegistry.is_attention_free_model(
+            architectures,
+            model_path=self.model,
+            revision=self.revision,
+            trust_remote_code=self.trust_remote_code,
+            hf_config=self.hf_config,
+        )

     def _init_has_inner_state(self) -> bool:
         architectures = getattr(self.hf_config, "architectures", [])
-        return ModelRegistry.model_has_inner_state(architectures)
+        return ModelRegistry.model_has_inner_state(
+            architectures,
+            model_path=self.model,
+            revision=self.revision,
+            trust_remote_code=self.trust_remote_code,
+            hf_config=self.hf_config,
+        )

     def _verify_tokenizer_mode(self) -> None:
         tokenizer_mode = self.tokenizer_mode.lower()
@@ -32,7 +32,13 @@ def get_model_architecture(
             and "MixtralForCausalLM" in architectures):
         architectures = ["QuantMixtralForCausalLM"]

-    return ModelRegistry.resolve_model_cls(architectures)
+    return ModelRegistry.resolve_model_cls(
+        architectures,
+        model_path=model_config.model,
+        revision=model_config.revision,
+        trust_remote_code=model_config.trust_remote_code,
+        hf_config=model_config.hf_config,
+    )


 def get_architecture_class_name(model_config: ModelConfig) -> str:
@@ -16,9 +16,11 @@ from typing import (AbstractSet, Callable, Dict, List, Optional, Tuple, Type,

 import cloudpickle
 import torch.nn as nn
+import transformers

 from vllm.logger import init_logger
 from vllm.platforms import current_platform
+from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module

 from .interfaces import (has_inner_state, is_attention_free,
                          supports_multimodal, supports_pp)
@@ -157,6 +159,11 @@ _SPECULATIVE_DECODING_MODELS = {
     "MedusaModel": ("medusa", "Medusa"),
     "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
 }
+
+# Transformers backend models - for custom models with auto_map
+_TRANSFORMERS_BACKEND_MODELS = {
+    "TransformersForCausalLM": ("transformers_backend", "TransformersForCausalLM"),
+}
 # yapf: enable

 _VLLM_MODELS = {
@@ -369,6 +376,62 @@ class _ModelRegistry:

         return _try_inspect_model_cls(model_arch, self.models[model_arch])

+    def _try_resolve_transformers(
+        self,
+        architecture: str,
+        model_path: str,
+        revision: Optional[str],
+        trust_remote_code: bool,
+        hf_config: Optional[object] = None,
+    ) -> Optional[Type[nn.Module]]:
+        """
+        Try to resolve a model architecture using the Transformers backend.
+        This allows loading custom models that define their own implementation
+        via the `auto_map` field in config.json.
+
+        Returns the loaded model class if successful, None otherwise.
+        """
+        # Check if architecture is in transformers
+        model_module = getattr(transformers, architecture, None)
+
+        # Get auto_map from hf_config
+        auto_map: Dict[str, str] = {}
+        if hf_config is not None:
+            auto_map = getattr(hf_config, "auto_map", None) or {}
+
+        if model_module is None and auto_map:
+            # Try to load from auto_map
+            # First, ensure config class is loaded
+            for prefix in ("AutoConfig", "AutoModel"):
+                for name, module in auto_map.items():
+                    if name.startswith(prefix):
+                        try_get_class_from_dynamic_module(
+                            module,
+                            model_path,
+                            trust_remote_code=trust_remote_code,
+                            revision=revision,
+                            warn_on_fail=False,
+                        )
+
+            # Now try to load the model class
+            for name, module in auto_map.items():
+                if name.startswith("AutoModel"):
+                    model_module = try_get_class_from_dynamic_module(
+                        module,
+                        model_path,
+                        trust_remote_code=trust_remote_code,
+                        revision=revision,
+                        warn_on_fail=True,
+                    )
+                    if model_module is not None:
+                        logger.info(
+                            "Loaded custom model class %s from auto_map",
+                            model_module.__name__
+                        )
+                        return model_module
+
+        return model_module
+
     def _normalize_archs(
         self,
         architectures: Union[str, List[str]],
@@ -383,6 +446,10 @@ class _ModelRegistry:
     def inspect_model_cls(
         self,
         architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
     ) -> _ModelInfo:
         architectures = self._normalize_archs(architectures)

@@ -391,11 +458,25 @@ class _ModelRegistry:
             if model_info is not None:
                 return model_info

+        # Fallback: try to resolve using transformers backend (auto_map)
+        if model_path and trust_remote_code and hf_config:
+            for arch in architectures:
+                model_cls = self._try_resolve_transformers(
+                    arch, model_path, revision, trust_remote_code, hf_config
+                )
+                if model_cls is not None:
+                    # Create ModelInfo from the dynamically loaded class
+                    return _ModelInfo.from_model_cls(model_cls)
+
         return self._raise_for_unsupported(architectures)

     def resolve_model_cls(
         self,
         architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
     ) -> Tuple[Type[nn.Module], str]:
         architectures = self._normalize_archs(architectures)

@@ -404,39 +485,88 @@ class _ModelRegistry:
             if model_cls is not None:
                 return (model_cls, arch)

+        # Fallback: try to resolve using transformers backend (auto_map)
+        if model_path and trust_remote_code and hf_config:
+            for arch in architectures:
+                model_cls = self._try_resolve_transformers(
+                    arch, model_path, revision, trust_remote_code, hf_config
+                )
+                if model_cls is not None:
+                    return (model_cls, arch)
+
         return self._raise_for_unsupported(architectures)

     def is_text_generation_model(
         self,
         architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
     ) -> bool:
-        return self.inspect_model_cls(architectures).is_text_generation_model
+        return self.inspect_model_cls(
+            architectures, model_path, revision, trust_remote_code, hf_config
+        ).is_text_generation_model

     def is_embedding_model(
         self,
         architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
     ) -> bool:
-        return self.inspect_model_cls(architectures).is_embedding_model
+        return self.inspect_model_cls(
+            architectures, model_path, revision, trust_remote_code, hf_config
+        ).is_embedding_model

     def is_multimodal_model(
         self,
         architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
     ) -> bool:
-        return self.inspect_model_cls(architectures).supports_multimodal
+        return self.inspect_model_cls(
+            architectures, model_path, revision, trust_remote_code, hf_config
+        ).supports_multimodal

     def is_pp_supported_model(
         self,
         architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
     ) -> bool:
-        return self.inspect_model_cls(architectures).supports_pp
+        return self.inspect_model_cls(
+            architectures, model_path, revision, trust_remote_code, hf_config
+        ).supports_pp

-    def model_has_inner_state(self, architectures: Union[str,
-                                                          List[str]]) -> bool:
-        return self.inspect_model_cls(architectures).has_inner_state
+    def model_has_inner_state(
+        self,
+        architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
+    ) -> bool:
+        return self.inspect_model_cls(
+            architectures, model_path, revision, trust_remote_code, hf_config
+        ).has_inner_state

-    def is_attention_free_model(self, architectures: Union[str,
-                                                            List[str]]) -> bool:
-        return self.inspect_model_cls(architectures).is_attention_free
+    def is_attention_free_model(
+        self,
+        architectures: Union[str, List[str]],
+        model_path: Optional[str] = None,
+        revision: Optional[str] = None,
+        trust_remote_code: bool = False,
+        hf_config: Optional[object] = None,
+    ) -> bool:
+        return self.inspect_model_cls(
+            architectures, model_path, revision, trust_remote_code, hf_config
+        ).is_attention_free


 ModelRegistry = _ModelRegistry({
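Taken together, the registry changes add an auto_map fallback: when an architecture string is not found among the built-in model classes and the caller passes model_path, trust_remote_code, and hf_config, resolution falls through to _try_resolve_transformers. A hedged sketch of how a caller might exercise that path; the checkpoint path and architecture name are hypothetical placeholders.

```python
# Hedged sketch of the auto_map fallback path. "/path/to/custom-model" is a
# hypothetical local checkpoint whose config.json defines an auto_map and an
# architecture name that is absent from vLLM's built-in registry.
from transformers import AutoConfig

from vllm.model_executor.models import ModelRegistry

model_path = "/path/to/custom-model"
hf_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

model_cls, arch = ModelRegistry.resolve_model_cls(
    getattr(hf_config, "architectures", []),
    model_path=model_path,
    revision=None,
    trust_remote_code=True,
    hf_config=hf_config,
)
print(f"Resolved {arch} -> {model_cls.__name__}")
```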
vllm-v0.6.2/vllm/transformers_utils/dynamic_module.py (new file, 76 lines)
@@ -0,0 +1,76 @@
"""
Dynamic module loading utilities for custom HuggingFace models.
Ported from latest vLLM to support auto_map in model config.
"""
import os
from typing import Dict, Optional, Type, Union

from transformers.dynamic_module_utils import (
    get_class_from_dynamic_module,
    resolve_trust_remote_code,
)

import vllm.envs as envs
from vllm.logger import init_logger

logger = init_logger(__name__)


def try_get_class_from_dynamic_module(
    class_reference: str,
    pretrained_model_name_or_path: str,
    trust_remote_code: bool,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: Optional[bool] = None,
    proxies: Optional[Dict[str, str]] = None,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    repo_type: Optional[str] = None,
    code_revision: Optional[str] = None,
    warn_on_fail: bool = True,
    **kwargs,
) -> Optional[Type]:
    """
    As `transformers.dynamic_module_utils.get_class_from_dynamic_module`,
    but ignoring any errors.

    This allows vLLM to load custom models that define their own
    model classes via the `auto_map` field in config.json.
    """
    try:
        resolve_trust_remote_code(
            trust_remote_code,
            pretrained_model_name_or_path,
            has_local_code=False,
            has_remote_code=True,
        )

        return get_class_from_dynamic_module(
            class_reference,
            pretrained_model_name_or_path,
            cache_dir=cache_dir,
            force_download=force_download,
            resume_download=resume_download,
            proxies=proxies,
            token=token,
            revision=revision,
            local_files_only=local_files_only,
            repo_type=repo_type,
            code_revision=code_revision,
            **kwargs,
        )
    except Exception:
        location = "ModelScope" if envs.VLLM_USE_MODELSCOPE else "HF Hub"

        if warn_on_fail:
            logger.warning(
                "Unable to load %s from %s on %s.",
                class_reference,
                pretrained_model_name_or_path,
                location,
                exc_info=True,
            )

        return None
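A hedged usage sketch of the helper above; the class reference string and checkpoint path are placeholders for a repository whose config.json declares an auto_map entry, not real identifiers.

```python
# Placeholders only: the class reference and path do not refer to a real checkpoint.
from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module

model_cls = try_get_class_from_dynamic_module(
    "modeling_custom.MyCustomForCausalLM",
    "/path/to/custom-model",
    trust_remote_code=True,
    revision=None,
    warn_on_fail=True,
)
if model_cls is None:
    print("Custom class could not be loaded; the registry falls back to built-in lookup.")
```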
@@ -74,18 +74,22 @@ def vllm__module_executor__models__llama__LlamaAttention__forward(
     smooth_quant_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     qkv, _ = self.qkv_proj(hidden_states, smooth_quant_scale)
-    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
     '''
     =============================
     Modify by vllm_mlu
     =============================
     @brief: pack q & k to fit tmo.apply_rotary
+    @optimization: avoid redundant split operation
     '''
     if self.rope_scaling is not None and self.rope_scaling["rope_type"] == "longrope":
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
     else:
-        qk, _ = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
+        # Optimized: split qkv into [qk, v] directly, avoiding redundant split
+        qk, v = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
         self.rotary_emb(positions, qk.view(-1, self.num_heads + self.num_kv_heads, self.head_dim))
+        # Split qk into q and k after rotary embedding
+        q, k = qk.split([self.q_size, self.kv_size], dim=-1)
     '''
     ==================
     End of MLU Hijack
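The change above relies on the packed qk chunk being equivalent to splitting q and k separately: splitting qkv into [q+k, v] and splitting qk afterwards yields the same tensors as the original three-way split. A standalone check of that equivalence with made-up sizes (not the model's real head dimensions):

```python
# Equivalence check with arbitrary example sizes; the real q_size/kv_size come
# from the attention layer's head configuration.
import torch

q_size, kv_size = 8, 4
qkv = torch.randn(2, q_size + 2 * kv_size)

# Original path: one three-way split.
q0, k0, v0 = qkv.split([q_size, kv_size, kv_size], dim=-1)

# Optimized path: split off v, keep q and k packed, split them after rotary embedding.
qk, v1 = qkv.split([q_size + kv_size, kv_size], dim=-1)
q1, k1 = qk.split([q_size, kv_size], dim=-1)

assert torch.equal(q0, q1) and torch.equal(k0, k1) and torch.equal(v0, v1)
```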