Sync from v0.13
@@ -1,72 +1,131 @@
from typing import List, Optional, Set, Tuple, Type
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
from typing import TYPE_CHECKING, Optional

import huggingface_hub
import regex as re
from huggingface_hub.utils import (
    EntryNotFoundError,
    HfHubHTTPError,
    HFValidationError,
    RepositoryNotFoundError,
)
from torch import nn
from transformers import PretrainedConfig

from vllm.config import LoRAConfig
from vllm.config.lora import LoRAConfig
from vllm.logger import init_logger
from vllm.lora.fully_sharded_layers import (
    ColumnParallelLinearWithShardedLoRA,
    MergedColumnParallelLinearWithShardedLoRA,
    MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA)

# being imported for _all_lora_classes below
# yapf conflicts with isort for this block
# yapf: disable
from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
                              LogitsProcessorWithLoRA,
                              MergedColumnParallelLinearWithLoRA,
                              MergedQKVParallelLinearWithLora,
                              QKVParallelLinearWithLora,
                              RowParallelLinearWithLoRA,
                              VocabParallelEmbeddingWithLoRA)
# yapf: enable
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.lora.layers import (
    BaseLayerWithLoRA,
    ColumnParallelLinearWithLoRA,
    ColumnParallelLinearWithShardedLoRA,
    FusedMoE3DWithLoRA,
    FusedMoEWithLoRA,
    LogitsProcessorWithLoRA,
    MergedColumnParallelLinearWithLoRA,
    MergedColumnParallelLinearWithShardedLoRA,
    MergedQKVParallelLinearWithLoRA,
    MergedQKVParallelLinearWithShardedLoRA,
    QKVParallelLinearWithLoRA,
    QKVParallelLinearWithShardedLoRA,
    ReplicatedLinearWithLoRA,
    RowParallelLinearWithLoRA,
    RowParallelLinearWithShardedLoRA,
    VocabParallelEmbeddingWithLoRA,
)
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.utils import get_moe_expert_mapping, get_packed_modules_mapping

if TYPE_CHECKING:
    from vllm.model_executor.layers.logits_processor import LogitsProcessor
    from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
    from vllm.model_executor.models.utils import WeightsMapper

logger = init_logger(__name__)

_all_lora_classes: Set[Type[BaseLayerWithLoRA]] = {
    VocabParallelEmbeddingWithLoRA, ColumnParallelLinearWithLoRA,
    MergedColumnParallelLinearWithLoRA, QKVParallelLinearWithLora,
    MergedQKVParallelLinearWithLora, RowParallelLinearWithLoRA,
    LogitsProcessorWithLoRA, ColumnParallelLinearWithShardedLoRA,
_GLOBAL_LORA_ID = 0


def get_lora_id():
    global _GLOBAL_LORA_ID
    _GLOBAL_LORA_ID += 1
    return _GLOBAL_LORA_ID


_all_lora_classes: set[type[BaseLayerWithLoRA]] = {
    VocabParallelEmbeddingWithLoRA,
    ColumnParallelLinearWithLoRA,
    MergedColumnParallelLinearWithLoRA,
    QKVParallelLinearWithLoRA,
    MergedQKVParallelLinearWithLoRA,
    RowParallelLinearWithLoRA,
    ReplicatedLinearWithLoRA,
    LogitsProcessorWithLoRA,
    ColumnParallelLinearWithShardedLoRA,
    QKVParallelLinearWithShardedLoRA,
    MergedColumnParallelLinearWithShardedLoRA,
    MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA
    MergedQKVParallelLinearWithShardedLoRA,
    RowParallelLinearWithShardedLoRA,
    FusedMoEWithLoRA,
    FusedMoE3DWithLoRA,
}


def from_layer(layer: nn.Module,
               max_loras: int,
               lora_config: LoRAConfig,
               packed_modules_list: List,
               model_config: Optional[PretrainedConfig] = None) -> nn.Module:
def is_moe_model(model: nn.Module) -> bool:
    """Checks if the model contains FusedMoE layers and warns the user."""
    if any(isinstance(module, FusedMoE) for module in model.modules()):
        logger.info_once("MoE model detected. Using fused MoE LoRA implementation.")
        return True
    return False


def from_layer(
    layer: nn.Module,
    max_loras: int,
    lora_config: LoRAConfig,
    packed_modules_list: list,
    model_config: PretrainedConfig | None = None,
) -> nn.Module:
    for lora_cls in _all_lora_classes:
        # specifying kwargs so they can be easily accessed in decorator
        if lora_cls.can_replace_layer(source_layer=layer,
                                      lora_config=lora_config,
                                      packed_modules_list=packed_modules_list,
                                      model_config=model_config):
            ret = lora_cls(layer)
            ret.create_lora_weights(max_loras, lora_config, model_config)
            return ret
        if lora_cls.can_replace_layer(
            source_layer=layer,
            lora_config=lora_config,
            packed_modules_list=packed_modules_list,
            model_config=model_config,
        ):
            instance_layer = lora_cls(layer)
            instance_layer.create_lora_weights(max_loras, lora_config, model_config)
            return instance_layer
    return layer


def from_layer_logits_processor(
    layer: LogitsProcessor,
    lm_head: ParallelLMHead,
    layer: "LogitsProcessor",
    lm_head: "ParallelLMHead",
    max_loras: int,
    lora_config: LoRAConfig,
    model_config: Optional[PretrainedConfig] = None,
    model_config: PretrainedConfig | None = None,
) -> LogitsProcessorWithLoRA:
    ret = LogitsProcessorWithLoRA(layer, lm_head.embedding_dim,
                                  lm_head.weight.dtype, lm_head.weight.device)
    ret = LogitsProcessorWithLoRA(
        layer,
        lm_head.embedding_dim,
        lm_head.weight.dtype,
        lm_head.weight.device,
        lm_head.get_sharded_to_full_mapping(),
    )
    ret.create_lora_weights(max_loras, lora_config, model_config)
    return ret


def replace_submodule(model: nn.Module, module_name: str,
                      new_module: nn.Module) -> nn.Module:
def replace_submodule(
    model: nn.Module, module_name: str, new_module: nn.Module
) -> nn.Module:
    """Replace a submodule in a model with a new module."""
    parent = model.get_submodule(".".join(module_name.split(".")[:-1]))
    target_name = module_name.split(".")[-1]
@@ -74,25 +133,183 @@ def replace_submodule(model: nn.Module, module_name: str,
    return new_module


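For illustration only (not part of this commit): a rough sketch of how from_layer and replace_submodule are typically combined, assuming a loaded model, a LoRAConfig, a model_config, and a hypothetical qkv_proj module path.

# Hypothetical wiring sketch: wrap one submodule with its LoRA variant and
# swap it back into the model in place. All names below are illustrative.
module_name = "model.layers.0.self_attn.qkv_proj"
lora_layer = from_layer(
    layer=model.get_submodule(module_name),
    max_loras=4,
    lora_config=lora_config,
    packed_modules_list=["q_proj", "k_proj", "v_proj"],
    model_config=model_config,
)
replace_submodule(model, module_name, lora_layer)
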
def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]:
def parse_fine_tuned_lora_name(
    name: str, weights_mapper: Optional["WeightsMapper"] = None
) -> tuple[str, bool]:
    """Parse the name of lora weights.

    args:
        name: the name of the fine-tuned LoRA, e.g.
            base_model.model.dense1.weight
        weights_mapper: maps the name of weight, e.g.
            `model.` -> `language_model.model.`,
    return:
        Tuple(module_name, is_lora_a):
        tuple(module_name, is_lora_a):
            module_name: the name of the module, e.g. model.dense1,
            is_lora_a whether the tensor is lora_a or lora_b.
    """

    # LoRA weight qualified name usually starts with `base_model.model.`,
    # so we remove the prefix `base_model.model.` to make the following
    # mapping correctly.
    if name.startswith("base_model.model."):
        name = name.replace("base_model.model.", "")
        name = weights_mapper._map_name(name) if weights_mapper else name
        # recover the prefix `base_model.model.`
        name = "base_model.model." + name
    else:
        name = weights_mapper._map_name(name) if weights_mapper else name

    # In some situations, we may not start with `base_model.model.`.
    # If we don't (e.g., ibm-granite/granite-speech-3.3-8b),
    # we should keep the prefix intact.
    start_index = 2 if name.startswith("base_model.model.") else 0

    parts = name.split(".")
    assert parts[0] == "base_model"
    assert parts[1] == "model"
    if parts[-1] == "weight":
        assert parts[-2] == "lora_A" or parts[-2] == "lora_B"
        return ".".join(parts[2:-2]), parts[-2] == "lora_A"
    if parts[-1] == "weight" and (parts[-2] == "lora_A" or parts[-2] == "lora_B"):
        new_name = ".".join(parts[start_index:-2])
        return new_name, parts[-2] == "lora_A"

    if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B":
        return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A"
        new_name = ".".join(parts[start_index:-1])
        return new_name, parts[-1] == "lora_embedding_A"

    raise ValueError(f"{name} is unsupported format")
    raise ValueError(f"{name} is unsupported LoRA weight")


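A minimal usage sketch (illustration only, not part of this diff), using the new signature above:

# A PEFT-style weight name is split into its module path plus a flag that
# tells LoRA-A apart from LoRA-B.
module_name, is_lora_a = parse_fine_tuned_lora_name(
    "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"
)
# module_name == "model.layers.0.self_attn.q_proj", is_lora_a is True
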
def is_base_embeddding_weights(name: str) -> bool:
    # hardcoded subfixes for input & output embedding weights
    embedding_suffixes = (
        ".embed_tokens.base_layer.weight",
        ".lm_head.base_layer.weight",
    )
    return name.endswith(embedding_suffixes)


def is_regex_target_modules(
    load_modules: str | list[str], expected_lora_modules: set[str]
) -> bool:
    """
    PEFT supports passing `target_modules` in the form of regular expressions,
    such as `model.*(q_proj|k_proj|v_proj)$`. This function is mainly used to
    determine whether the suffix in the regular expression is present in the
    `expected_lora_modules`.
    """

    def is_valid_regex(pattern):
        try:
            re.compile(pattern)
            return True
        except re.error:
            return False

    def is_subset(sub_list, full_set):
        return set(sub_list).issubset(full_set)

    # Similar to PEFT's processing logic, regex-related operations are only
    # executed when the load_modules is a `str`.
    if not isinstance(load_modules, str):
        return False

    if is_valid_regex(load_modules):
        match = re.search(r"\((.*?)\)\$?$", load_modules)
        if match:
            suffix = match.group(1).split("|")
            return is_subset(suffix, expected_lora_modules)
    return False


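An illustrative check (not part of this diff), assuming PEFT-style regex target modules:

# The suffix alternation inside the regex must be a subset of the modules
# the model can actually wrap with LoRA.
is_regex_target_modules(
    "model.*(q_proj|k_proj|v_proj)$",
    {"q_proj", "k_proj", "v_proj", "o_proj"},
)  # -> True
is_regex_target_modules(["q_proj", "k_proj"], {"q_proj"})  # -> False (not a str)
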
def get_supported_lora_modules(model: nn.Module) -> list[str]:
    """
    In vLLM, all linear layers support LoRA.
    """

    supported_lora_modules: set[str] = set()
    for name, module in model.named_modules():
        # get the embedding modules if the module's embedding_modules
        # is not empty.
        embedding_modules = getattr(module, "embedding_modules", None)
        if embedding_modules is not None:
            for name in embedding_modules:
                supported_lora_modules.add(name)

        # get all the linear subfixes.
        if isinstance(module, (LinearBase,)):
            supported_lora_modules.add(name.split(".")[-1])

        if isinstance(module, (FusedMoE,)):
            supported_lora_modules.add(name.split(".")[-1])

    return list(supported_lora_modules)


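Illustration only (not in this diff): the kind of result this helper yields; the exact names depend on the model.

# Hypothetical output for a typical decoder-only model: the trailing module
# names of every LinearBase / FusedMoE / embedding module, e.g. something like
# ["qkv_proj", "o_proj", "gate_up_proj", "down_proj", "lm_head", "embed_tokens"]
supported = get_supported_lora_modules(model)
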
def get_adapter_absolute_path(lora_path: str) -> str:
    """
    Resolves the given lora_path to an absolute local path.

    If the lora_path is identified as a Hugging Face model identifier,
    it will download the model and return the local snapshot path.
    Otherwise, it treats the lora_path as a local file path and
    converts it to an absolute path.

    Parameters:
    lora_path (str): The path to the lora model, which can be an absolute path,
        a relative path, or a Hugging Face model identifier.

    Returns:
    str: The resolved absolute local path to the lora model.
    """

    # Check if the path is an absolute path. Return it no matter exists or not.
    if os.path.isabs(lora_path):
        return lora_path

    # If the path starts with ~, expand the user home directory.
    if lora_path.startswith("~"):
        return os.path.expanduser(lora_path)

    # Check if the expanded relative path exists locally.
    if os.path.exists(lora_path):
        return os.path.abspath(lora_path)

    # If the path does not exist locally, assume it's a Hugging Face repo.
    try:
        local_snapshot_path = huggingface_hub.snapshot_download(repo_id=lora_path)
    except (
        HfHubHTTPError,
        RepositoryNotFoundError,
        EntryNotFoundError,
        HFValidationError,
    ):
        # Handle errors that may occur during the download
        # Return original path instead of throwing error here
        logger.exception("Error downloading the HuggingFace model")
        return lora_path

    return local_snapshot_path


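A usage sketch (illustration only; the repo id below is hypothetical):

# Absolute paths are returned as-is, "~" is expanded, existing relative paths
# are made absolute, and anything else is treated as a Hugging Face repo id
# and resolved via snapshot_download.
local_path = get_adapter_absolute_path("~/adapters/my-lora")
hub_path = get_adapter_absolute_path("some-org/some-lora-adapter")
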
def process_packed_modules_mapping(model: nn.Module) -> dict[str, list[str]]:
    if is_moe_model(model):
        if moe_packed_mapping := get_moe_expert_mapping(model):
            # This method generates and returns a dictionary mapping packed module
            # names to lists of their corresponding submodule names. It includes
            # both static mappings and dynamic mappings for expert layers, where
            # the expert indices are expanded based on the configured number
            # of routed experts.
            packed_modules_mapping = get_packed_modules_mapping(model)
            if not model.is_3d_moe_weight:
                # 3D MoE LoRA does not need `packed_modules_mapping`
                packed_modules_mapping["experts"] = [
                    weight_name.rstrip(".")
                    for _, weight_name, _, _ in moe_packed_mapping
                ]

            return packed_modules_mapping
        else:
            raise AttributeError(
                "To support LoRA for MoE model, "
                "'get_expert_mapping' must be implemented"
            )
    else:
        return get_packed_modules_mapping(model)

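For orientation only (not part of this diff), a hypothetical shape of the returned mapping; the real keys and expert weight names come from the model's own packed-modules and expert mappings.

# Static packed mappings plus an "experts" entry added for MoE models whose
# LoRA weights are not stored as 3D tensors (all names illustrative).
example_mapping = {
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    "gate_up_proj": ["gate_proj", "up_proj"],
    "experts": ["experts.w1", "experts.w2", "experts.w3"],
}
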