[5/N] MoE Refactor: Update MoE parallelism arguments (#8658)
This commit is contained in:
@@ -38,6 +38,7 @@ import torch
|
||||
import triton
|
||||
import triton.language as tl
|
||||
|
||||
+ from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
|
||||
from sglang.srt.layers.dp_attention import (
|
||||
DPPaddingMode,
|
||||
get_attention_dp_rank,
|
||||
@@ -839,7 +840,7 @@ class ForwardBatch:
|
||||
|
||||
|
||||
def enable_num_token_non_padded(server_args):
|
||||
- return server_args.enable_ep_moe or server_args.enable_deepep_moe
|
||||
+ return get_moe_expert_parallel_world_size() > 1
|
||||
|
||||
|
||||
class PPProxyTensors:
|
||||
|
||||
@@ -60,6 +60,7 @@ from sglang.srt.layers.dp_attention import (
|
||||
initialize_dp_attention,
|
||||
)
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
||||
+ from sglang.srt.layers.moe.utils import DeepEPMode, MoeA2ABackend
|
||||
from sglang.srt.layers.quantization import (
|
||||
deep_gemm_wrapper,
|
||||
monkey_patch_isinstance_for_vllm_base_layer,
|
||||
@@ -217,6 +218,10 @@ class ModelRunner:
|
||||
"use_mla_backend": self.use_mla_backend,
|
||||
"speculative_algorithm": self.spec_algorithm,
|
||||
}
|
||||
+ | {
|
||||
+ "moe_a2a_backend": MoeA2ABackend(server_args.moe_a2a_backend),
|
||||
+ "deepep_mode": DeepEPMode(server_args.deepep_mode),
|
||||
+ }
|
||||
)
|
||||
|
||||
# CPU offload
|
||||
|
||||
Reference in New Issue
Block a user