Add minimal vLLM 0.16.1 build repo for BI-V150
2  vllm/model_executor/layers/fused_moe/runner/__init__.py  Normal file
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
@@ -0,0 +1,760 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import nullcontext

import torch
import torch.nn.functional as F

import vllm.envs as envs
from vllm.distributed import (
    get_ep_group,
    get_pcp_group,
    tensor_model_parallel_all_reduce,
)
from vllm.forward_context import (
    ForwardContext,
    get_forward_context,
    is_forward_context_available,
)
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,
)
from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
    FusedMoEMethodBase,
)
from vllm.model_executor.layers.fused_moe.router.fused_moe_router import (
    FusedMoERouter,
)
from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner
from vllm.platforms import current_platform
from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import (
    aux_stream,
    current_stream,
    direct_register_custom_op,
)
from vllm.v1.worker.ubatching import dbo_current_ubatch_id

logger = init_logger(__name__)


def get_layer_from_name(layer_name: str) -> torch.nn.Module:
    forward_context: ForwardContext = get_forward_context()
    if layer_name == "from_forward_context":
        all_moe_layers = forward_context.all_moe_layers
        assert all_moe_layers is not None
        moe_layer_index = forward_context.moe_layer_index
        if moe_layer_index >= len(all_moe_layers):
            raise AssertionError(
                "We expected the number of MOE layers in `all_moe_layers` "
                "to be equal to the number of "
                "{vllm.moe_forward, vllm.moe_forward_shared} calls."
            )
        layer_name = all_moe_layers[moe_layer_index]
        forward_context.moe_layer_index += 1
    return forward_context.no_compile_layers[layer_name]
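# NOTE: the moe_forward custom ops below only carry tensors and a layer-name
# string through the dispatcher, so get_layer_from_name() is how the concrete
# FusedMoE module is recovered at call time. The sentinel "from_forward_context"
# resolves the layer positionally: each call consumes the next entry of
# forward_context.all_moe_layers, which relies on the ops being invoked once per
# MoE layer, in layer order. Any other value is used directly as a key into
# forward_context.no_compile_layers.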


def _moe_forward(
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    shared_experts_input: torch.Tensor | None,
    layer_name: str,
) -> torch.Tensor:
    layer = get_layer_from_name(layer_name)
    # TODO(bnell): this can be removed after MK migration is complete.
    layer.ensure_moe_quant_config_init()
    return layer.runner.forward_impl(
        layer, hidden_states, router_logits, shared_experts_input
    )


def _moe_forward_fake(
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    shared_experts_input: torch.Tensor | None,
    layer_name: str,
) -> torch.Tensor:
    return torch.empty_like(hidden_states)


def _moe_forward_shared(
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    shared_experts_input: torch.Tensor | None,
    layer_name: str,
) -> tuple[torch.Tensor, torch.Tensor]:
    layer = get_layer_from_name(layer_name)
    # TODO(bnell): this can be removed after MK migration is complete.
    layer.ensure_moe_quant_config_init()
    return layer.runner.forward_impl(
        layer, hidden_states, router_logits, shared_experts_input
    )


def _moe_forward_shared_fake(
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    shared_experts_input: torch.Tensor | None,
    layer_name: str,
) -> tuple[torch.Tensor, torch.Tensor]:
    # Output shapes:
    # - fused_out: same as hidden_states (routed experts use transformed size)
    # - shared_out: same as shared_experts_input if provided, else same as
    #   hidden_states
    #   (For latent MoE: shared experts use original hidden_size, not latent size)
    fused_out = torch.empty_like(hidden_states)

    if shared_experts_input is not None:
        shared_out = torch.empty_like(shared_experts_input)
    else:
        shared_out = torch.empty_like(hidden_states)

    return shared_out, fused_out


direct_register_custom_op(
    op_name="moe_forward",
    op_func=_moe_forward,
    mutates_args=["hidden_states"],
    fake_impl=_moe_forward_fake,
    tags=(torch.Tag.needs_fixed_stride_order,),
)


direct_register_custom_op(
    op_name="moe_forward_shared",
    op_func=_moe_forward_shared,
    mutates_args=["hidden_states"],
    fake_impl=_moe_forward_shared_fake,
    tags=(torch.Tag.needs_fixed_stride_order,),
)
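# NOTE: registering the two wrappers as custom ops keeps the MoE body opaque to
# torch.compile: the traced graph only sees e.g.
#   torch.ops.vllm.moe_forward(hidden_states, router_logits, None, layer_name)
# while the fake impls above supply output shapes/dtypes for tracing without
# running any expert kernels. mutates_args=["hidden_states"] declares that the
# real op may write into its input in place, and needs_fixed_stride_order is
# intended to keep the compiler from reordering the strides of the op's inputs.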


class DefaultMoERunner(MoERunner):
    """
    Default implementation of the MoE runner for executing Mixture of Experts layers.

    This class provides a comprehensive implementation for running MoE computations
    with support for:
    - Expert routing and token dispatching
    - Shared experts computation with optional parallel execution using CUDA streams
    - Data parallel (DP) chunking for large batch processing
    - Tensor model parallel and expert parallel operations
    - Various quantization methods and custom operators
    - Both monolithic and decomposed expert execution paths

    The runner handles the complete MoE forward pass including routing tokens to
    experts, executing expert computations, and combining results. It supports
    advanced features like overlapped execution of shared experts and optimized
    kernels for different parallel execution modes.

    Eventually, this class will be split up and specialized for different
    configurations, e.g. the presence or absence of shared experts, a gate, etc.
    """

    def __init__(
        self,
        layer: torch.nn.Module,
        moe_config: FusedMoEConfig,
        router: FusedMoERouter,
        routed_input_transform: torch.nn.Module | None,
        gate: torch.nn.Module | None,
        shared_experts: torch.nn.Module | None,
        quant_method: FusedMoEMethodBase,
        reduce_results: bool,
        enable_dbo: bool,
    ):
        super().__init__()
        self.moe_config = moe_config
        self.router = router
        self.routed_input_transform = routed_input_transform
        self.gate = gate
        self.shared_experts = shared_experts
        self.quant_method = quant_method
        self.reduce_results = reduce_results
        self.enable_dbo = enable_dbo

        # Allow disabling of the separate shared experts stream for
        # debug purposes.
        # TODO: Remove this after more extensive testing with TP/DP
        # and other execution modes.
        if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM:
            logger.debug_once("Disabling MoE shared_experts cuda stream", scope="local")
            self.shared_experts_stream = None
        else:
            # TODO(rob): enable shared expert overlap with non-cuda-alike.
            # aux_stream() returns None on non-cuda-alike platforms.
            self.shared_experts_stream = aux_stream()
            if self.shared_experts_stream is not None:
                logger.debug_once(
                    "Enabled separate cuda stream for MoE shared_experts", scope="local"
                )

        # Needed for string -> FusedMoE layer lookup in custom ops.
        self.layer_name = layer.layer_name

        if current_platform.is_tpu() or current_platform.is_cpu():
            # TODO: Once the OOM issue for the TPU backend is resolved, we
            # will switch to using the moe_forward custom op.
            # Note: CPU doesn't require wrapped forward_impl.
            if self.shared_experts is None:
                self.moe_forward = _moe_forward
            else:
                self.moe_forward = _moe_forward_shared
        else:
            if self.shared_experts is None:
                self.moe_forward = torch.ops.vllm.moe_forward
            else:
                self.moe_forward = torch.ops.vllm.moe_forward_shared

        # Chunked all2all staging tensors
        self.batched_hidden_states: torch.Tensor | None = None
        self.batched_router_logits: torch.Tensor | None = None

    @property
    def use_dp_chunking(self) -> bool:
        return (
            self.moe_config.moe_parallel_config.use_pplx_kernels
            or self.moe_config.moe_parallel_config.use_deepep_ll_kernels
            or self.moe_config.moe_parallel_config.use_mori_kernels
            or self.moe_config.moe_parallel_config.use_fi_all2allv_kernels
        ) and envs.VLLM_ENABLE_MOE_DP_CHUNK
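    # NOTE: DP chunking is only taken when one of the all2all dispatch backends
    # above (pplx, DeepEP low-latency, MoRI, or FlashInfer all2allv) is active
    # and VLLM_ENABLE_MOE_DP_CHUNK is enabled; forward_impl() then splits the
    # batch into chunks of at most moe_config.max_num_tokens and stages each
    # chunk through the buffers allocated in ensure_dp_chunking_init().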

    def _maybe_setup_shared_experts_stream(
        self,
        hidden_states: torch.Tensor,
        shared_input: torch.Tensor | None,
        has_separate_shared_experts: bool,
        use_chunked_impl: bool,
    ) -> tuple[bool, torch.Tensor | None]:
        use_shared_experts_stream = (
            current_platform.is_cuda()
            and has_separate_shared_experts
            and not use_chunked_impl
            and self.shared_experts_stream is not None
            and (
                hidden_states.shape[0]
                <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD
            )
        )

        shared_experts_input: torch.Tensor | None = None
        if use_shared_experts_stream:
            assert self.shared_experts_stream is not None
            assert self.moe_config.disable_inplace

            shared_experts_input = (
                shared_input if shared_input is not None else hidden_states
            )

            # Record that the shared_experts_input will be used in the
            # shared_experts_stream to avoid a gc issue from
            # deallocation. For more details:
            # https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501
            # NOTE: We don't need shared_output.record_stream(current_stream())
            # because we sync the streams before using shared_output.
            shared_experts_input.record_stream(self.shared_experts_stream)

            # Mark sync start point for the separate shared experts
            # stream here since we want to run in parallel with the
            # router/gate (next op below)
            assert self.shared_experts_stream is not None
            self.shared_experts_stream.wait_stream(current_stream())

        return use_shared_experts_stream, shared_experts_input
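    # NOTE on the stream handshake above: wait_stream() makes the side stream
    # wait for everything already queued on the current stream, so the shared
    # experts start from a consistent view of hidden_states, and record_stream()
    # tells the caching allocator that shared_experts_input is still in use on
    # the side stream so its memory is not recycled early. The matching
    # current_stream().wait_stream(self.shared_experts_stream) happens in
    # forward_impl() right after the shared experts are launched.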

    def ensure_dp_chunking_init(self):
        if not self.use_dp_chunking or self.batched_hidden_states is not None:
            return

        states_shape: tuple[int, ...]
        logits_shape: tuple[int, ...]

        moe = self.moe_config

        if self.enable_dbo:
            states_shape = (2, moe.max_num_tokens, self.moe_config.hidden_dim)
            logits_shape = (2, moe.max_num_tokens, self.moe_config.num_logical_experts)
        else:
            states_shape = (moe.max_num_tokens, self.moe_config.hidden_dim)
            logits_shape = (moe.max_num_tokens, self.moe_config.num_logical_experts)

        self.batched_hidden_states = torch.zeros(
            states_shape, dtype=moe.in_dtype, device=torch.cuda.current_device()
        )

        self.batched_router_logits = torch.zeros(
            logits_shape,
            dtype=moe.router_logits_dtype,
            device=torch.cuda.current_device(),
        )
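    # NOTE: the staging buffers are allocated lazily, on the first forward that
    # actually uses DP chunking, and are sized for the worst case
    # (max_num_tokens x hidden_dim / num_logical_experts). With dual-batch
    # overlap (enable_dbo) a leading dimension of 2 gives each micro-batch its
    # own slice, selected via dbo_current_ubatch_id() in forward_impl_chunked().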

    def must_reduce_shared_expert_outputs(self) -> bool:
        """
        The shared_experts are typically computed using the RowParallelLinear
        layer. The result of this function is typically used as
        the reduce_results argument to that module.
        When just tensor-parallel is used, it is not required to reduce
        the shared_experts results immediately. Instead, we reduce once at the
        end of the MoE op. (Refer to the DeepSeekV2MoE module.)
        With EP and all2all kernels this is no longer viable, as all
        GPU ranks in DP produce the complete set of hidden_states.
        Therefore it is required that we reduce the shared_experts output
        early.
        """
        assert self.quant_method is not None
        return (
            self.quant_method.moe_mk is not None
            and self.quant_method.moe_mk.output_is_reduced()
        )

    def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tensor):
        """
        Some combine kernels reduce across GPU ranks by default.
        """
        if self.must_reduce_shared_expert_outputs():
            return final_hidden_states
        else:
            return tensor_model_parallel_all_reduce(final_hidden_states)

    def apply_routed_input_transform(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply transform for routed experts (e.g., latent projection).

        This is called by FusedMoE.forward_native. The original hidden_states
        is saved separately so shared experts get [S, hidden_size] while
        routed experts get the transformed [S, moe_latent_size].

        TODO: For latent MoE bandwidth optimization, fc2_latent_proj could be
        moved inside SharedFusedMoE to all-reduce on the smaller latent
        dimension.
        """
        if self.routed_input_transform is not None:
            result = self.routed_input_transform(hidden_states)
            # ReplicatedLinear returns (output, extra_bias) tuple.
            # We only need the output tensor; extra_bias is not used here.
            if isinstance(result, tuple):
                return result[0]
            return result
        return hidden_states

    def _reduce_output(
        self,
        states: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
        trunc_sizes: list[int],
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        def trunc(x: torch.Tensor, trunc_size: int) -> torch.Tensor:
            return x[..., :trunc_size]

        def reduce_and_trunc(x: torch.Tensor, trunc_size: int) -> torch.Tensor:
            return trunc(self.maybe_all_reduce_tensor_model_parallel(x), trunc_size)

        if (
            not self.moe_config.is_sequence_parallel
            and not self.use_dp_chunking
            and self.reduce_results
            and (self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1)
        ):
            func = reduce_and_trunc
        else:
            func = trunc

        if isinstance(states, tuple):
            return tuple(
                [func(s, trunc_size) for s, trunc_size in zip(states, trunc_sizes)]
            )
        else:
            assert len(trunc_sizes) == 1
            return func(states, trunc_sizes[0])

    def _encode_layer_name(self) -> str:
        # Can be unavailable or None in unittests
        if (
            is_forward_context_available()
            and get_forward_context().all_moe_layers is not None
        ):
            return "from_forward_context"
        return self.layer_name
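    # NOTE: the string returned here is the fourth argument of the custom op
    # call in forward(). When a forward context with all_moe_layers is present,
    # the positional "from_forward_context" lookup in get_layer_from_name() is
    # used; otherwise (e.g. in unit tests without a forward context) the
    # concrete layer_name is passed through and looked up by name.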

    def forward(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        # For latent MoE: save ORIGINAL hidden_states before transform
        # (shared_experts need original dimension, routed experts use transformed)
        if self.shared_experts is not None:
            original_hidden_states = hidden_states
            original_hidden_dim = hidden_states.shape[-1]
        else:
            original_hidden_states = None

        # Apply transform for routed experts (e.g., latent projection for latent MoE)
        hidden_states = self.apply_routed_input_transform(hidden_states)

        # This is the dimension after transform (for routed expert output slicing)
        transformed_hidden_dim = hidden_states.shape[-1]
        if self.moe_config.hidden_dim != transformed_hidden_dim:
            hidden_states = F.pad(
                hidden_states,
                (0, self.moe_config.hidden_dim - transformed_hidden_dim),
                mode="constant",
                value=0.0,
            )

        fused_output = self.moe_forward(
            hidden_states,
            router_logits,
            original_hidden_states,
            self._encode_layer_name(),
        )

        if self.shared_experts is not None:
            orig_hidden_dims = [original_hidden_dim, transformed_hidden_dim]
        else:
            orig_hidden_dims = [transformed_hidden_dim]

        return self._reduce_output(fused_output, orig_hidden_dims)
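    # NOTE on the padding above: when a routed_input_transform shrinks the
    # activation (latent MoE), the transformed states are zero-padded up to
    # moe_config.hidden_dim before the fused op, and _reduce_output() slices the
    # outputs back down, using original_hidden_dim for the shared-expert output
    # and transformed_hidden_dim for the routed-expert output.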

    def forward_impl_chunked(
        self,
        layer: torch.nn.Module,
        full_hidden_states: torch.Tensor,
        full_router_logits: torch.Tensor,
        full_shared_input: torch.Tensor | None,
        has_separate_shared_experts: bool,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        assert self.batched_hidden_states is not None
        assert self.batched_router_logits is not None
        assert self.batched_hidden_states.dtype == full_hidden_states.dtype, (
            f"{self.batched_hidden_states.dtype} == {full_hidden_states.dtype}"
        )
        assert self.batched_router_logits.dtype == full_router_logits.dtype, (
            f"{self.batched_router_logits.dtype} == {full_router_logits.dtype}"
        )
        # Check size compatibility.
        assert self.batched_hidden_states.size(-1) == full_hidden_states.size(-1)
        assert self.batched_router_logits.size(-1) == full_router_logits.size(-1)

        # TODO(bnell): Fix shared_expert_inputs w/chunking.
        # assert shared_input is None, (
        #     "Routed input transform is not currently supported with DP chunking."
        # )

        full_fused_final_hidden_states = torch.empty_like(full_hidden_states)
        if self.shared_experts is not None:
            full_shared_final_hidden_states = torch.empty_like(full_hidden_states)

        def process_chunk(chunk_start, chunk_end, skip_result_store=False):
            chunk_size = chunk_end - chunk_start
            hidden_states = full_hidden_states[chunk_start:chunk_end, :]
            router_logits = full_router_logits[chunk_start:chunk_end, :]
            shared_input = (
                full_shared_input[chunk_start:chunk_end, :]
                if full_shared_input is not None
                else None
            )

            assert self.batched_hidden_states is not None
            assert self.batched_router_logits is not None
            # This is only true when DBO has been enabled in the config.
            # Both tensors will have an outer dimension for the ubatch id.
            if self.batched_hidden_states.dim() == 3:
                assert self.batched_router_logits.dim() == 3
                batch_buffer_idx = dbo_current_ubatch_id()
                batched_hidden_states = self.batched_hidden_states[batch_buffer_idx, :]
                batched_router_logits = self.batched_router_logits[batch_buffer_idx, :]
            else:
                batched_hidden_states = self.batched_hidden_states
                batched_router_logits = self.batched_router_logits

            assert (
                batched_hidden_states.size(0)  # type: ignore
                >= chunk_size
            )
            assert (
                batched_router_logits.size(0)  # type: ignore
                >= chunk_size
            )
            staged_hidden_states = batched_hidden_states[:chunk_size, :]  # type: ignore
            staged_router_logits = batched_router_logits[:chunk_size, :]  # type: ignore
            staged_hidden_states.copy_(hidden_states, non_blocking=True)
            staged_router_logits.copy_(router_logits, non_blocking=True)

            shared_input = (
                shared_input if shared_input is not None else staged_hidden_states
            )

            # Matrix multiply.
            if self.quant_method.is_monolithic:
                assert has_separate_shared_experts or self.shared_experts is None
                final_hidden_states = self.quant_method.apply_monolithic(
                    layer=layer,
                    x=staged_hidden_states,
                    router_logits=staged_router_logits,
                )
            else:
                topk_weights, topk_ids = self.router.select_experts(
                    hidden_states=staged_hidden_states,
                    router_logits=staged_router_logits,
                )

                final_hidden_states = self.quant_method.apply(
                    layer=layer,
                    x=staged_hidden_states,
                    topk_weights=topk_weights,
                    topk_ids=topk_ids,
                    shared_experts_input=shared_input,
                )

            if has_separate_shared_experts:
                assert not isinstance(final_hidden_states, tuple)
                assert self.shared_experts is not None

                shared_output = self.shared_experts(shared_input)

                final_hidden_states = (
                    shared_output,
                    final_hidden_states,
                )

            if not skip_result_store:
                if self.shared_experts is None:
                    full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_(
                        final_hidden_states, non_blocking=True
                    )
                else:
                    full_shared_final_hidden_states[chunk_start:chunk_end, :].copy_(
                        final_hidden_states[0], non_blocking=True
                    )
                    full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_(
                        final_hidden_states[1], non_blocking=True
                    )

        ctx = get_forward_context()
        # flashinfer_cutlass_kernels can handle: optional DP + TP/EP
        max_tokens_across_dispatchers = ctx.dp_metadata.max_tokens_across_dp_cpu
        moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens

        # If the input to the MoE is sequence parallel then divide by sp_size
        # to find the maximum number of tokens for any individual dispatcher.
        if self.moe_config.is_sequence_parallel:
            max_tokens_across_dispatchers = cdiv(
                max_tokens_across_dispatchers, self.moe_config.sp_size
            )

        num_tokens = full_hidden_states.size(0)
        for chunk_idx, chunk_start_ in enumerate(
            range(0, max_tokens_across_dispatchers, moe_dp_chunk_size_per_rank)
        ):
            chunk_start = chunk_start_
            chunk_end = min(
                chunk_start + moe_dp_chunk_size_per_rank, max_tokens_across_dispatchers
            )
            # clamp start and end
            chunk_start = min(chunk_start, num_tokens - 1)
            chunk_end = min(chunk_end, num_tokens)
            with ctx.dp_metadata.chunked_sizes(
                self.moe_config.sp_size, moe_dp_chunk_size_per_rank, chunk_idx
            ):
                process_chunk(
                    chunk_start, chunk_end, skip_result_store=chunk_start_ >= num_tokens
                )

        if self.shared_experts is None:
            return full_fused_final_hidden_states
        else:
            return (full_shared_final_hidden_states, full_fused_final_hidden_states)
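    # NOTE: the chunk loop above runs over max_tokens_across_dispatchers, not the
    # local token count, so every DP rank executes the same number of chunks and
    # the collective dispatch/combine calls inside process_chunk() stay matched
    # across ranks. Ranks that have run out of local tokens clamp the chunk to
    # the last local token and pass skip_result_store=True, so the dummy work is
    # not written back into the output buffers.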

    def forward_impl(
        self,
        layer: torch.nn.Module,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        shared_input: torch.Tensor | None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        assert self.quant_method is not None

        self.ensure_dp_chunking_init()

        has_separate_shared_experts = (
            not self.quant_method.mk_owns_shared_expert
            and self.shared_experts is not None
        )

        use_chunked_impl = self.use_dp_chunking

        use_shared_experts_stream, shared_experts_input = (
            self._maybe_setup_shared_experts_stream(
                hidden_states,
                shared_input,
                has_separate_shared_experts,
                use_chunked_impl,
            )
        )

        # If a router/gate is provided, then apply it here.
        # (Note: this code runs only when "overlapped mode" is on, to allow
        # parallel execution of the shared experts with the FusedMoE via a
        # separate cuda stream.)
        if self.gate is not None:
            router_logits, _ = self.gate(hidden_states)

        if use_chunked_impl:
            return self.forward_impl_chunked(
                layer,
                hidden_states,
                router_logits,
                shared_input,
                has_separate_shared_experts,
            )

        # NOTE(rob): once we finish migrating all the quant methods to use
        # MKs, we can remove the naive dispatch/combine path from here.
        do_naive_dispatch_combine = (
            self.moe_config.dp_size > 1 and not self.quant_method.supports_internal_mk
        )

        ctx = get_forward_context()
        sp_ctx = (
            ctx.dp_metadata.sp_local_sizes(self.moe_config.sp_size)
            if ctx.dp_metadata
            else nullcontext()
        )

        with sp_ctx:
            extra_tensors = None
            if do_naive_dispatch_combine:
                post_quant_allgather = (
                    self.quant_method is not None
                    and self.moe_config.dp_size > 1
                    and self.moe_config.use_ep
                    and getattr(self.quant_method, "do_post_quant_allgather", False)
                )
                if post_quant_allgather:
                    hidden_states_to_dispatch, extra_tensors = (
                        self.quant_method.prepare_dp_allgather_tensor(
                            layer, hidden_states, router_logits
                        )
                    )
                else:
                    hidden_states_to_dispatch = hidden_states

                dispatch_res = get_ep_group().dispatch_router_logits(
                    hidden_states_to_dispatch,
                    router_logits,
                    self.moe_config.is_sequence_parallel,
                    extra_tensors=extra_tensors,
                )
                if extra_tensors is not None:
                    (
                        orig_hidden_states,
                        router_logits,
                        extra_tensors_combined,
                    ) = dispatch_res
                    hidden_states_combined = (
                        orig_hidden_states,
                        extra_tensors_combined[0],
                    )
                else:
                    hidden_states_combined, router_logits = dispatch_res
                    orig_hidden_states = hidden_states_combined
            else:
                orig_hidden_states = hidden_states

            # Run shared experts before the matrix multiply,
            # because the matrix multiply may modify hidden_states.
            if has_separate_shared_experts and not use_shared_experts_stream:
                assert self.shared_experts is not None
                shared_input = (
                    shared_input if shared_input is not None else hidden_states
                )
                shared_output = self.shared_experts(shared_input)

            # NOTE: Similar to DP, PCP also needs dispatch and combine. For
            # simplicity, AgRsAll2All was added separately for PCP here. Maybe
            # we should modify the All2AllManager abstraction to better support PCP.
            if self.moe_config.pcp_size > 1:
                hidden_states = get_pcp_group().all_gather(
                    hidden_states,
                    dim=0,
                )
                router_logits = get_pcp_group().all_gather(
                    router_logits,
                    dim=0,
                )

            # TODO(bnell): deal with fp4 flashinfer tuple hidden states hack (#30014).
            # Figure out a nicer way to do this.
            if do_naive_dispatch_combine:
                x = hidden_states_combined
                x_orig = orig_hidden_states
            else:
                x = hidden_states
                x_orig = hidden_states

            # Matrix multiply.
            if self.quant_method.is_monolithic:
                final_hidden_states = self.quant_method.apply_monolithic(
                    layer=layer,
                    x=x,
                    router_logits=router_logits,
                )
            else:
                topk_weights, topk_ids = self.router.select_experts(
                    hidden_states=x_orig,
                    router_logits=router_logits,
                )

                final_hidden_states = self.quant_method.apply(
                    layer=layer,
                    x=x,  # The type signature of this is wrong due to the hack.
                    topk_weights=topk_weights,
                    topk_ids=topk_ids,
                    shared_experts_input=shared_input,
                    router_logits=router_logits,
                    top_k=topk_ids.shape[-1],
                )

            if has_separate_shared_experts:
                assert self.shared_experts is not None

                if use_shared_experts_stream:
                    # Run shared experts in parallel on a separate stream.
                    # NOTE: We start the separate stream here and mark the
                    # sync end point immediately after it is done. This is
                    # important to avoid excessive stream allocations by the cuda
                    # graph replay later.
                    with torch.cuda.stream(self.shared_experts_stream):
                        # Note that shared_experts_input (recorded on this stream
                        # above) is used here to avoid a conflict with the main
                        # stream.
                        shared_output = self.shared_experts(shared_experts_input)
                    current_stream().wait_stream(self.shared_experts_stream)

                final_hidden_states = (
                    shared_output,
                    final_hidden_states,
                )

            def combine_output(states: torch.Tensor) -> torch.Tensor:
                if do_naive_dispatch_combine:
                    states = get_ep_group().combine(
                        states, self.moe_config.is_sequence_parallel
                    )

                if self.moe_config.pcp_size > 1:
                    states = get_pcp_group().reduce_scatter(
                        states,
                        dim=0,
                    )

                return states

            if self.shared_experts is not None:
                return (
                    final_hidden_states[0],
                    combine_output(final_hidden_states[1]),
                )
            else:
                return combine_output(final_hidden_states)
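    # NOTE: only the routed-expert output goes through combine_output(): the EP
    # combine (naive path) and the PCP reduce-scatter undo the earlier dispatch /
    # all-gather, while the shared-expert output is returned as-is because it was
    # computed on the pre-dispatch activations. Any remaining TP/EP all-reduce
    # and the latent-dim truncation are applied afterwards by _reduce_output()
    # in forward().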
34  vllm/model_executor/layers/fused_moe/runner/moe_runner.py  Normal file
@@ -0,0 +1,34 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod

import torch


class MoERunner(ABC):
    """
    Abstract base class for Mixture of Experts (MoE) runners.

    This class defines the interface that all MoE runner implementations must follow.
    MoE runners are responsible for executing the forward pass of MoE layers, handling
    expert routing, and managing tensor parallel operations.
    """

    @abstractmethod
    def forward(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        raise NotImplementedError

    @abstractmethod
    def must_reduce_shared_expert_outputs(self) -> bool:
        raise NotImplementedError

    @abstractmethod
    def maybe_all_reduce_tensor_model_parallel(
        self,
        final_hidden_states: torch.Tensor,
    ):
        raise NotImplementedError
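    # A concrete runner only has to implement the three methods above. An
    # illustrative sketch (not part of vLLM; DefaultMoERunner is the real
    # implementation):
    #
    #     class PassthroughMoERunner(MoERunner):
    #         def forward(self, hidden_states, router_logits):
    #             return hidden_states  # no experts applied
    #
    #         def must_reduce_shared_expert_outputs(self) -> bool:
    #             return False
    #
    #         def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states):
    #             return final_hidden_states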