[Refactor] MLP weight prefetch to consistency with MoE Model's prefetching in terms of code and usage (#6442)
### What this PR does / why we need it?
Refactor MLP weight prefetch to consistency with MoE Model's prefetching
in terms of code and usage.
Environments VLLM_ASCEND_ENABLE_PREFETCH_MLP,
VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE and
VLLM_ASCEND_MLP_GATE_UP_PREFETCH_SIZE is removed, usage as following:
--additional-config '{"weight_prefetch_config": { "enabled": true,
"prefetch_ratio": {"mlp": { "gate_up": 1.0, "down": 1.0} }}}'
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.14.1
- vLLM main:
dc917cceb8
---------
Signed-off-by: leo-pony <nengjunma@outlook.com>
This commit is contained in:
@@ -2,15 +2,18 @@ from dataclasses import dataclass, field
|
||||
|
||||
import torch
|
||||
import torch_npu
|
||||
from vllm.forward_context import get_forward_context
|
||||
from vllm.forward_context import ForwardContext, get_forward_context
|
||||
from vllm.config import get_current_vllm_config
|
||||
from vllm.logger import logger
|
||||
|
||||
from vllm_ascend.ascend_config import WeightPrefetchConfig
|
||||
from vllm_ascend.ops.linear import (AscendQKVParallelLinear,
|
||||
AscendRowParallelLinear)
|
||||
from vllm_ascend.utils import is_moe_model
|
||||
|
||||
SUPPORTED_MODULES = ["attn", "mlp", "moe"]
|
||||
MOE_PREFETCH_TOKEN_THRESHOLD = 96
|
||||
|
||||
MAX_PREFETCH_WEIGHT_SIZE = 18 * 1024 * 1024
|
||||
|
||||
@dataclass
|
||||
class ModuleWeightPrefetchConfig:
|
||||
@@ -38,22 +41,37 @@ class WeightPrefetchMethod:
|
||||
"""
|
||||
Unified weight prefetch method.
|
||||
"""
|
||||
is_moe: bool = True
|
||||
MLP_GATE_UP: str = "gate_up"
|
||||
MLP_DOWN: str = "down"
|
||||
|
||||
# backward compatibility: delete in future versions
|
||||
mlp_pre_version_compatibale_config: dict = {}
|
||||
|
||||
def __init__(self, weight_prefetch_config: WeightPrefetchConfig) -> None:
|
||||
self.is_moe = is_moe_model(get_current_vllm_config())
|
||||
|
||||
self.attn = ModuleWeightPrefetchConfig(
|
||||
module_name="attn",
|
||||
enable=weight_prefetch_config.enabled,
|
||||
prefetch_ratio=weight_prefetch_config.prefetch_ratio.get(
|
||||
"attn", {}),
|
||||
"attn", {}) or {'qkv': 1.0, 'o': 1.0},
|
||||
linear_prefix_map={
|
||||
AscendQKVParallelLinear.__name__: "qkv",
|
||||
AscendRowParallelLinear.__name__: "o",
|
||||
})
|
||||
self.moe = ModuleWeightPrefetchConfig(
|
||||
module_name="moe",
|
||||
enable=weight_prefetch_config.enabled,
|
||||
enable=weight_prefetch_config.enabled and self.is_moe,
|
||||
prefetch_ratio=weight_prefetch_config.prefetch_ratio.get(
|
||||
"moe", {}))
|
||||
"moe", {}) or {'gate_up': 0.8})
|
||||
|
||||
self.mlp = ModuleWeightPrefetchConfig(
|
||||
module_name="mlp",
|
||||
enable=weight_prefetch_config.enabled and not self.is_moe,
|
||||
prefetch_ratio=weight_prefetch_config.prefetch_ratio.get(
|
||||
"mlp", {}) or {'gate_up': 1.0, 'down': 1.0})
|
||||
self.mlp_pre_version_compatibale_config = weight_prefetch_config.mlp_pre_version_compatibale_config
|
||||
|
||||
def maybe_prefetch_attn_weight_preprocess(
|
||||
self, layer_cls_name: str, weight: torch.Tensor,
|
||||
@@ -97,6 +115,82 @@ class WeightPrefetchMethod:
|
||||
|
||||
torch.ops.vllm.prefetch_postprocess(stop_flag)
|
||||
|
||||
# x_dependency only eager mode can pass None
|
||||
def maybe_prefetch_mlp_weight_preprocess(self, prefetch_layer_name: str, x_dependency: torch.Tensor | None, curr_layer_prefix: str | None = None):
|
||||
if not self.mlp.enable and not self.mlp_pre_version_compatibale_config:
|
||||
self.mlp.is_active_this_forward = False
|
||||
return
|
||||
|
||||
try:
|
||||
forward_context = get_forward_context()
|
||||
except AssertionError:
|
||||
return
|
||||
self.mlp.is_active_this_forward = (
|
||||
forward_context.layer_idx is not None
|
||||
and forward_context.num_tokens is not None
|
||||
and forward_context.num_tokens < 500
|
||||
)
|
||||
if not self.mlp.is_active_this_forward:
|
||||
return
|
||||
|
||||
if prefetch_layer_name == self.MLP_GATE_UP:
|
||||
self._maybe_prefetch_mlp_gate_up_weight_preprocess(x_dependency, forward_context, curr_layer_prefix)
|
||||
elif prefetch_layer_name == self.MLP_DOWN:
|
||||
self._maybe_prefetch_mlp_down_weight_preprocess(x_dependency, forward_context)
|
||||
else:
|
||||
raise ValueError(f"Unsupported prefetch weight name: {prefetch_layer_name}")
|
||||
|
||||
def _maybe_prefetch_mlp_gate_up_weight_preprocess(self, x_dependency: torch.Tensor, forward_context: ForwardContext, curr_layer_prefix: str | None):
|
||||
if not curr_layer_prefix:
|
||||
raise ValueError("curr_layer_prefix must been specified when prefetching mlp gate_up_proj weight")
|
||||
|
||||
# start point of gate_up_proj weight prefetch
|
||||
if curr_layer_prefix.split('.')[-2] == "self_attn":
|
||||
model_instance = forward_context.model_instance
|
||||
layer_idx = int(curr_layer_prefix.split('.')[2])
|
||||
weight = model_instance.model.layers[layer_idx].mlp.gate_up_proj.weight
|
||||
if self.mlp_pre_version_compatibale_config:
|
||||
weight_size = self.mlp_pre_version_compatibale_config.get(self.MLP_GATE_UP, 0)
|
||||
else:
|
||||
weight_size = weight.data.element_size() * weight.data.numel() * self.mlp.prefetch_ratio.get(self.MLP_GATE_UP, 0)
|
||||
if weight_size > MAX_PREFETCH_WEIGHT_SIZE:
|
||||
weight_size = MAX_PREFETCH_WEIGHT_SIZE
|
||||
torch.ops.vllm.prefetch_preprocess(weight=weight,
|
||||
start_flag=x_dependency,
|
||||
max_weight_size=int(weight_size))
|
||||
forward_context.prefetch_mlp_gate_up_proj = True
|
||||
|
||||
def _maybe_prefetch_mlp_down_weight_preprocess(self, x_dependency: torch.Tensor, forward_context: ForwardContext):
|
||||
layer_idx = forward_context.layer_idx
|
||||
model_instance = forward_context.model_instance
|
||||
weight = model_instance.model.layers[layer_idx].mlp.down_proj.weight
|
||||
if self.mlp_pre_version_compatibale_config:
|
||||
weight_size = self.mlp_pre_version_compatibale_config.get(self.MLP_DOWN, 0)
|
||||
else:
|
||||
weight_size = weight.data.element_size() * weight.data.numel() * self.mlp.prefetch_ratio.get(self.MLP_DOWN, 0)
|
||||
if weight_size > MAX_PREFETCH_WEIGHT_SIZE:
|
||||
weight_size = MAX_PREFETCH_WEIGHT_SIZE
|
||||
torch.ops.vllm.prefetch_preprocess(weight=weight,
|
||||
start_flag=x_dependency,
|
||||
max_weight_size=int(weight_size))
|
||||
forward_context.prefetch_mlp_down_proj = True
|
||||
forward_context.layer_idx += 1
|
||||
|
||||
def maybe_prefetch_mlp_weight_postprocess(self, stop_flag: torch.Tensor):
|
||||
if not self.mlp.is_active_this_forward:
|
||||
return
|
||||
|
||||
try:
|
||||
forward_context = get_forward_context()
|
||||
except AssertionError:
|
||||
return
|
||||
|
||||
if forward_context.prefetch_mlp_gate_up_proj or \
|
||||
forward_context.prefetch_mlp_down_proj:
|
||||
torch.ops.vllm.prefetch_postprocess(stop_flag)
|
||||
forward_context.prefetch_mlp_gate_up_proj = False
|
||||
forward_context.prefetch_mlp_down_proj = False
|
||||
|
||||
|
||||
def maybe_npu_prefetch(inputs: torch.Tensor,
|
||||
dependency: torch.Tensor,
|
||||
|
||||
Reference in New Issue
Block a user