[Bugfix] Follow vLLM Qwen-Moe/VL and KV Connector change to fix broken CI (#2181)
### What this PR does / why we need it? This PR fixes the broken CI: 1. Fix the e2eb6ecd8 changes — in that commit, the gate and up projections in the vision MLP were fused, which can improve performance by reducing one matrix multiplication. So this PR does the following: - Specify that the two linear layers are fused as `mlp.gate_up_proj` when loading the weights. - Use a SiluAndMul activation function. 2. Fix aefeea0fde: update the ModelRunnerOutput parameters to adapt to its changes. 3. Fix [vllm-commit](https://github.com/vllm-project/vllm/pull/20815/files#diff-3ffb829a39ab2b3e4706aa28f5e476815f36c3a87b98d6a66514ebedc8f3ffb4R354-R356): fix Qwen-MoE. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.10.0 - vLLM main: fed5849d3f --------- Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -18,7 +18,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
from functools import partial
|
||||
from typing import Callable, Iterable, Optional, Set, Tuple
|
||||
from typing import Callable, Iterable, Optional, Set, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@@ -30,7 +30,8 @@ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed import parallel_state
|
||||
from vllm.distributed import utils as dist_utils
|
||||
from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
|
||||
from vllm.model_executor.layers.activation import (_ACTIVATION_REGISTRY,
|
||||
get_act_and_mul_fn)
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
@@ -42,6 +43,8 @@ from vllm.model_executor.models.qwen2_5_vl import (
|
||||
from vllm.model_executor.models.utils import maybe_prefix
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
MIN_PAD_SIZE = 64 # min_size to pad weight
|
||||
MAX_PAD_SIZE = 128 # max_size to pad weight
|
||||
|
||||
@@ -197,12 +200,16 @@ class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer):
|
||||
in_channels=vision_config.in_channels,
|
||||
hidden_size=self.hidden_size,
|
||||
)
|
||||
|
||||
act_fn = get_act_and_mul_fn(vision_config.hidden_act)
|
||||
if vllm_version_is("0.10.0"):
|
||||
act_fn = _ACTIVATION_REGISTRY[vision_config.hidden_act]
|
||||
self.blocks = nn.ModuleList([
|
||||
AscendQwen2_5_VisionBlock(
|
||||
dim=self.hidden_size,
|
||||
num_heads=self.num_heads,
|
||||
mlp_hidden_dim=vision_config.intermediate_size,
|
||||
act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act],
|
||||
act_fn=act_fn,
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.blocks.{layer_idx}")
|
||||
@@ -291,12 +298,17 @@ class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer):
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
stacked_params_mapping = [
|
||||
stacked_params_mapping: list[tuple[str, str, Union[str, int]]] = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
("qkv_proj", "k_proj", "k"),
|
||||
("qkv_proj", "v_proj", "v"),
|
||||
]
|
||||
if not vllm_version_is("0.10.0"):
|
||||
stacked_params_mapping.extend([
|
||||
("mlp.gate_up_proj.", "mlp.gate_proj.", 0),
|
||||
("mlp.gate_up_proj.", "mlp.up_proj.", 1),
|
||||
])
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
loaded_params: Set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
|
||||
@@ -30,7 +30,8 @@ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed import parallel_state
|
||||
from vllm.distributed import utils as dist_utils
|
||||
from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
|
||||
from vllm.model_executor.layers.activation import (_ACTIVATION_REGISTRY,
|
||||
get_act_and_mul_fn)
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.models.qwen2_5_vl import (
|
||||
@@ -42,6 +43,7 @@ from vllm.model_executor.models.utils import maybe_prefix
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from vllm_ascend.models.qwen2_5_vl import AscendQwen2_5_VisionRotaryEmbedding
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class AscendQwen2_5_VisionAttention_Without_Padding(Qwen2_5_VisionAttention):
|
||||
@@ -171,12 +173,16 @@ class AscendQwen2_5_VisionTransformer_Without_Padding(Qwen2_5_VisionTransformer
|
||||
in_channels=vision_config.in_channels,
|
||||
hidden_size=self.hidden_size,
|
||||
)
|
||||
|
||||
act_fn = get_act_and_mul_fn(vision_config.hidden_act)
|
||||
if vllm_version_is("0.10.0"):
|
||||
act_fn = _ACTIVATION_REGISTRY[vision_config.hidden_act]
|
||||
self.blocks = nn.ModuleList([
|
||||
AscendQwen2_5_VisionBlock_Without_Padding(
|
||||
dim=self.hidden_size,
|
||||
num_heads=self.num_heads,
|
||||
mlp_hidden_dim=vision_config.intermediate_size,
|
||||
act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act],
|
||||
act_fn=act_fn,
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.blocks.{layer_idx}")
|
||||
|
||||
@@ -100,6 +100,8 @@ class CustomQwen3MoeModel(Qwen3MoeModel):
|
||||
cache_config = vllm_config.cache_config
|
||||
quant_config = vllm_config.quant_config
|
||||
|
||||
parallel_config = vllm_config.parallel_config
|
||||
self.num_redundant_experts = parallel_config.num_redundant_experts
|
||||
self.padding_idx = config.pad_token_id
|
||||
self.vocab_size = config.vocab_size
|
||||
self.config = config
|
||||
|
||||
Reference in New Issue
Block a user