[6/N] MoE Refactor: Cleanup MoE-related configs (#8849)
This commit is contained in:
@@ -64,7 +64,7 @@ from sglang.srt.hf_transformers_utils import (
|
||||
)
|
||||
from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
|
||||
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
||||
from sglang.srt.layers.moe.utils import DeepEPMode, MoeA2ABackend
|
||||
from sglang.srt.layers.moe import initialize_moe_config
|
||||
from sglang.srt.managers.io_struct import (
|
||||
AbortReq,
|
||||
CloseSessionReqInput,
|
||||
@@ -245,6 +245,9 @@ class Scheduler(
|
||||
)
|
||||
)
|
||||
|
||||
# Init model config
|
||||
self.model_config = ModelConfig.from_server_args(server_args)
|
||||
|
||||
# Init inter-process communication
|
||||
context = zmq.Context(2)
|
||||
self.idle_sleeper = None
|
||||
@@ -292,6 +295,9 @@ class Scheduler(
|
||||
# Init tokenizer
|
||||
self.init_tokenizer()
|
||||
|
||||
# Init moe config
|
||||
self.init_moe_config()
|
||||
|
||||
# Set reasoning_parser and think_end_id if --reasoning_parser is enabled
|
||||
if self.server_args.reasoning_parser and self.tokenizer:
|
||||
reasoning_parser = ReasoningParser(
|
||||
@@ -538,8 +544,6 @@ class Scheduler(
|
||||
|
||||
def init_tokenizer(self):
|
||||
server_args = self.server_args
|
||||
|
||||
self.model_config = ModelConfig.from_server_args(server_args)
|
||||
self.is_generation = self.model_config.is_generation
|
||||
|
||||
if server_args.skip_tokenizer_init:
|
||||
@@ -761,6 +765,10 @@ class Scheduler(
|
||||
# The prefill requests that are in the middle of kv sending
|
||||
self.disagg_prefill_inflight_queue: List[Req] = []
|
||||
|
||||
def init_moe_config(self):
    """Set up the MoE configuration if the model's HF config calls for it.

    The presence of a ``num_experts_per_tok`` attribute on
    ``self.model_config.hf_config`` is used as the signal that MoE
    configuration is needed. When it is present,
    ``initialize_moe_config`` is invoked with ``self.server_args``;
    otherwise this method does nothing.
    """
    hf_config = self.model_config.hf_config
    if not hasattr(hf_config, "num_experts_per_tok"):
        # No per-token expert count on the config: nothing to initialize.
        return
    initialize_moe_config(self.server_args)
|
||||
|
||||
@DynamicGradMode()
|
||||
def event_loop_normal(self):
|
||||
"""A normal scheduler loop."""
|
||||
@@ -1823,11 +1831,6 @@ class Scheduler(
|
||||
disable_cuda_graph=self.server_args.disable_cuda_graph,
|
||||
spec_algorithm=self.spec_algorithm,
|
||||
speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
|
||||
enable_two_batch_overlap=self.server_args.enable_two_batch_overlap,
|
||||
enable_deepep_moe=MoeA2ABackend(
|
||||
self.server_args.moe_a2a_backend
|
||||
).is_deepep(),
|
||||
deepep_mode=DeepEPMode(self.server_args.deepep_mode),
|
||||
require_mlp_tp_gather=require_mlp_tp_gather(self.server_args),
|
||||
disable_overlap_schedule=self.server_args.disable_overlap_schedule,
|
||||
)
|
||||
@@ -1922,9 +1925,6 @@ class Scheduler(
|
||||
disable_cuda_graph: bool,
|
||||
spec_algorithm,
|
||||
speculative_num_draft_tokens,
|
||||
enable_two_batch_overlap: bool,
|
||||
enable_deepep_moe: bool,
|
||||
deepep_mode: DeepEPMode,
|
||||
require_mlp_tp_gather: bool,
|
||||
disable_overlap_schedule: bool,
|
||||
):
|
||||
@@ -1972,9 +1972,6 @@ class Scheduler(
|
||||
is_extend_in_batch,
|
||||
*tbo_preparer.prepare_all_gather(
|
||||
local_batch,
|
||||
deepep_mode,
|
||||
enable_deepep_moe,
|
||||
enable_two_batch_overlap,
|
||||
),
|
||||
],
|
||||
dtype=torch.int64,
|
||||
|
||||
Reference in New Issue
Block a user