[6/N] MoE Refactor: Cleanup MoE-related configs (#8849)
@@ -52,6 +52,7 @@ from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
    ScheduleBatchDisaggregationDecodeMixin,
)
from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
from sglang.srt.layers.moe import is_tbo_enabled
from sglang.srt.mem_cache.allocator import (
    BaseTokenToKVPoolAllocator,
    SWATokenToKVPoolAllocator,
@@ -84,17 +85,10 @@ GLOBAL_SERVER_ARGS_KEYS = [
    "device",
    "disable_chunked_prefix_cache",
    "disable_radix_cache",
    "enable_two_batch_overlap",
    "tbo_token_distribution_threshold",
    "enable_dp_lm_head",
    "moe_a2a_backend",
    "deepep_mode",
    "enable_flashinfer_cutlass_moe",
    "enable_flashinfer_trtllm_moe",
    "enable_flashinfer_allreduce_fusion",
    "moe_dense_tp_size",
    "ep_dispatch_algorithm",
    "deepep_config",
    "ep_num_redundant_experts",
    "enable_nan_detection",
    "flashinfer_mla_disable_ragged",
@@ -107,8 +101,6 @@ GLOBAL_SERVER_ARGS_KEYS = [
    "triton_attention_reduce_in_fp32",
    "num_reserved_decode_tokens",
    "weight_loader_disable_mmap",
    "enable_triton_kernel_moe",
    "enable_flashinfer_mxfp4_moe",
    "enable_multimodal",
    "enable_symm_mem",
    "quantization",
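Note: the two hunks above shrink GLOBAL_SERVER_ARGS_KEYS because MoE-specific flags no longer need to be broadcast as raw server args once they live in a dedicated, process-wide MoE config that is initialized once. A minimal sketch of that pattern is below; only the names initialize_moe_config and is_tbo_enabled appear in this diff, while the module layout, field names, and defaults are assumptions for illustration.

# Hypothetical sketch of a process-wide MoE config; not sglang's actual implementation.
from dataclasses import dataclass
from typing import Optional

@dataclass
class MoeConfig:
    a2a_backend: str = "none"
    deepep_mode: str = "auto"
    enable_two_batch_overlap: bool = False

_MOE_CONFIG: Optional[MoeConfig] = None

def initialize_moe_config(server_args) -> None:
    """Build the singleton MoE config once, from parsed server args."""
    global _MOE_CONFIG
    _MOE_CONFIG = MoeConfig(
        a2a_backend=getattr(server_args, "moe_a2a_backend", "none"),
        deepep_mode=getattr(server_args, "deepep_mode", "auto"),
        enable_two_batch_overlap=getattr(server_args, "enable_two_batch_overlap", False),
    )

def is_tbo_enabled() -> bool:
    """Accessor other layers call instead of re-reading server args."""
    return _MOE_CONFIG is not None and _MOE_CONFIG.enable_two_batch_overlap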
@@ -64,7 +64,7 @@ from sglang.srt.hf_transformers_utils import (
)
from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
from sglang.srt.layers.moe.utils import DeepEPMode, MoeA2ABackend
from sglang.srt.layers.moe import initialize_moe_config
from sglang.srt.managers.io_struct import (
    AbortReq,
    CloseSessionReqInput,
@@ -245,6 +245,9 @@ class Scheduler(
            )
        )

        # Init model config
        self.model_config = ModelConfig.from_server_args(server_args)

        # Init inter-process communication
        context = zmq.Context(2)
        self.idle_sleeper = None
@@ -292,6 +295,9 @@ class Scheduler(
        # Init tokenizer
        self.init_tokenizer()

        # Init moe config
        self.init_moe_config()

        # Set reasoning_parser and think_end_id if --reasoning_parser is enabled
        if self.server_args.reasoning_parser and self.tokenizer:
            reasoning_parser = ReasoningParser(
@@ -538,8 +544,6 @@ class Scheduler(

    def init_tokenizer(self):
        server_args = self.server_args

        self.model_config = ModelConfig.from_server_args(server_args)
        self.is_generation = self.model_config.is_generation

        if server_args.skip_tokenizer_init:
@@ -761,6 +765,10 @@ class Scheduler(
        # The prefill requests that are in the middle of kv sending
        self.disagg_prefill_inflight_queue: List[Req] = []

    def init_moe_config(self):
        if hasattr(self.model_config.hf_config, "num_experts_per_tok"):
            initialize_moe_config(self.server_args)

    @DynamicGradMode()
    def event_loop_normal(self):
        """A normal scheduler loop."""
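Note: the new init_moe_config only builds the MoE config for models that actually route tokens to experts, using the presence of num_experts_per_tok on the Hugging Face config as the MoE marker. A rough sanity check of that marker, assuming the transformers package is installed (the model classes here are just examples, not part of this change):

# num_experts_per_tok exists on MoE architectures and not on dense ones.
from transformers import LlamaConfig, MixtralConfig

print(hasattr(MixtralConfig(), "num_experts_per_tok"))  # True  -> MoE model
print(hasattr(LlamaConfig(), "num_experts_per_tok"))    # False -> dense model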
@@ -1823,11 +1831,6 @@ class Scheduler(
            disable_cuda_graph=self.server_args.disable_cuda_graph,
            spec_algorithm=self.spec_algorithm,
            speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
            enable_two_batch_overlap=self.server_args.enable_two_batch_overlap,
            enable_deepep_moe=MoeA2ABackend(
                self.server_args.moe_a2a_backend
            ).is_deepep(),
            deepep_mode=DeepEPMode(self.server_args.deepep_mode),
            require_mlp_tp_gather=require_mlp_tp_gather(self.server_args),
            disable_overlap_schedule=self.server_args.disable_overlap_schedule,
        )
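Note: the dropped keyword arguments show the pattern this series removes: raw server-arg strings were re-wrapped into enums such as MoeA2ABackend and DeepEPMode at each call site and then threaded through as parameters. A minimal sketch of that string-to-enum wrapper, under the assumption that the member values are illustrative (only the class and method names come from the diff):

# Illustrative sketch of the wrapping seen in the removed lines; not sglang's enum.
from enum import Enum

class MoeA2ABackend(Enum):
    NONE = "none"
    DEEPEP = "deepep"

    def is_deepep(self) -> bool:
        return self is MoeA2ABackend.DEEPEP

print(MoeA2ABackend("deepep").is_deepep())  # True: built from a plain server-arg string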
@@ -1922,9 +1925,6 @@ class Scheduler(
        disable_cuda_graph: bool,
        spec_algorithm,
        speculative_num_draft_tokens,
        enable_two_batch_overlap: bool,
        enable_deepep_moe: bool,
        deepep_mode: DeepEPMode,
        require_mlp_tp_gather: bool,
        disable_overlap_schedule: bool,
    ):
@@ -1972,9 +1972,6 @@ class Scheduler(
                is_extend_in_batch,
                *tbo_preparer.prepare_all_gather(
                    local_batch,
                    deepep_mode,
                    enable_deepep_moe,
                    enable_two_batch_overlap,
                ),
            ],
            dtype=torch.int64,