Move files related to EPLB (#7580)
This commit is contained in:
0
python/sglang/srt/eplb/__init__.py
Normal file
0
python/sglang/srt/eplb/__init__.py
Normal file
@@ -3,7 +3,7 @@ from typing import Optional
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from sglang.srt.managers.eplb_algorithms import deepseek, deepseek_vec
|
from sglang.srt.eplb.eplb_algorithms import deepseek, deepseek_vec
|
||||||
|
|
||||||
|
|
||||||
class EplbAlgorithm(Enum):
|
class EplbAlgorithm(Enum):
|
||||||
@@ -4,10 +4,8 @@ from typing import TYPE_CHECKING, List
|
|||||||
|
|
||||||
import torch.cuda
|
import torch.cuda
|
||||||
|
|
||||||
from sglang.srt.managers.expert_distribution import (
|
from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
|
||||||
get_global_expert_distribution_recorder,
|
from sglang.srt.eplb.expert_location import ExpertLocationMetadata
|
||||||
)
|
|
||||||
from sglang.srt.managers.expert_location import ExpertLocationMetadata
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from sglang.srt.model_executor.model_runner import ModelRunner
|
from sglang.srt.model_executor.model_runner import ModelRunner
|
||||||
@@ -4,7 +4,7 @@ from pathlib import Path
|
|||||||
import torch
|
import torch
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from sglang.srt.managers.expert_distribution import (
|
from sglang.srt.eplb.expert_distribution import (
|
||||||
_convert_global_physical_count_to_logical_count,
|
_convert_global_physical_count_to_logical_count,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -24,7 +24,7 @@ import einops
|
|||||||
import torch
|
import torch
|
||||||
import torch.distributed
|
import torch.distributed
|
||||||
|
|
||||||
from sglang.srt.managers.expert_location import ExpertLocationMetadata
|
from sglang.srt.eplb.expert_location import ExpertLocationMetadata
|
||||||
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
||||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
||||||
from sglang.srt.server_args import ServerArgs
|
from sglang.srt.server_args import ServerArgs
|
||||||
@@ -23,7 +23,7 @@ import torch.distributed
|
|||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
|
|
||||||
from sglang.srt.configs.model_config import ModelConfig
|
from sglang.srt.configs.model_config import ModelConfig
|
||||||
from sglang.srt.managers import eplb_algorithms
|
from sglang.srt.eplb import eplb_algorithms
|
||||||
from sglang.srt.model_loader import get_model_architecture
|
from sglang.srt.model_loader import get_model_architecture
|
||||||
from sglang.srt.server_args import ServerArgs
|
from sglang.srt.server_args import ServerArgs
|
||||||
|
|
||||||
@@ -17,7 +17,7 @@ from typing import Literal, Optional
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from sglang.srt.managers.expert_location import get_global_expert_location_metadata
|
from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
|
||||||
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
||||||
|
|
||||||
|
|
||||||
@@ -20,7 +20,7 @@ import torch
|
|||||||
import torch.distributed
|
import torch.distributed
|
||||||
from torch.distributed import P2POp
|
from torch.distributed import P2POp
|
||||||
|
|
||||||
from sglang.srt.managers.expert_location import (
|
from sglang.srt.eplb.expert_location import (
|
||||||
ExpertLocationMetadata,
|
ExpertLocationMetadata,
|
||||||
get_global_expert_location_metadata,
|
get_global_expert_location_metadata,
|
||||||
)
|
)
|
||||||
@@ -11,6 +11,8 @@ from sglang.srt.distributed import (
|
|||||||
get_tensor_model_parallel_rank,
|
get_tensor_model_parallel_rank,
|
||||||
get_tensor_model_parallel_world_size,
|
get_tensor_model_parallel_world_size,
|
||||||
)
|
)
|
||||||
|
from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
|
||||||
|
from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
|
||||||
from sglang.srt.layers.moe.ep_moe.kernels import (
|
from sglang.srt.layers.moe.ep_moe.kernels import (
|
||||||
ep_gather,
|
ep_gather,
|
||||||
ep_scatter,
|
ep_scatter,
|
||||||
@@ -40,8 +42,6 @@ from sglang.srt.layers.quantization.fp8_kernel import (
|
|||||||
sglang_per_token_quant_fp8,
|
sglang_per_token_quant_fp8,
|
||||||
)
|
)
|
||||||
from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
|
from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
|
||||||
from sglang.srt.managers.expert_location import get_global_expert_location_metadata
|
|
||||||
from sglang.srt.managers.expert_location_dispatch import ExpertLocationDispatchInfo
|
|
||||||
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
||||||
from sglang.srt.model_executor.forward_batch_info import ForwardMode
|
from sglang.srt.model_executor.forward_batch_info import ForwardMode
|
||||||
from sglang.srt.utils import (
|
from sglang.srt.utils import (
|
||||||
|
|||||||
@@ -1,10 +1,8 @@
|
|||||||
import logging
|
import logging
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
|
||||||
from sglang.srt.layers.quantization import deep_gemm_wrapper
|
from sglang.srt.layers.quantization import deep_gemm_wrapper
|
||||||
from sglang.srt.managers.expert_distribution import (
|
|
||||||
get_global_expert_distribution_recorder,
|
|
||||||
)
|
|
||||||
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
||||||
from sglang.srt.utils import (
|
from sglang.srt.utils import (
|
||||||
DeepEPMode,
|
DeepEPMode,
|
||||||
|
|||||||
@@ -18,12 +18,12 @@ from typing import Callable, Optional
|
|||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
|
|
||||||
from sglang.srt.managers import expert_location_dispatch
|
from sglang.srt.eplb import expert_location_dispatch
|
||||||
from sglang.srt.managers.expert_distribution import (
|
from sglang.srt.eplb.expert_distribution import (
|
||||||
ExpertDistributionRecorder,
|
ExpertDistributionRecorder,
|
||||||
get_global_expert_distribution_recorder,
|
get_global_expert_distribution_recorder,
|
||||||
)
|
)
|
||||||
from sglang.srt.managers.expert_location_dispatch import (
|
from sglang.srt.eplb.expert_location_dispatch import (
|
||||||
ExpertLocationDispatchInfo,
|
ExpertLocationDispatchInfo,
|
||||||
topk_ids_logical_to_physical,
|
topk_ids_logical_to_physical,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -58,6 +58,7 @@ from sglang.srt.disaggregation.utils import (
|
|||||||
prepare_abort,
|
prepare_abort,
|
||||||
)
|
)
|
||||||
from sglang.srt.distributed import get_pp_group, get_world_group
|
from sglang.srt.distributed import get_pp_group, get_world_group
|
||||||
|
from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
|
||||||
from sglang.srt.hf_transformers_utils import (
|
from sglang.srt.hf_transformers_utils import (
|
||||||
get_processor,
|
get_processor,
|
||||||
get_tokenizer,
|
get_tokenizer,
|
||||||
@@ -65,9 +66,6 @@ from sglang.srt.hf_transformers_utils import (
|
|||||||
)
|
)
|
||||||
from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
|
from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
|
||||||
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
||||||
from sglang.srt.managers.expert_distribution import (
|
|
||||||
get_global_expert_distribution_recorder,
|
|
||||||
)
|
|
||||||
from sglang.srt.managers.io_struct import (
|
from sglang.srt.managers.io_struct import (
|
||||||
AbortReq,
|
AbortReq,
|
||||||
CloseSessionReqInput,
|
CloseSessionReqInput,
|
||||||
|
|||||||
@@ -39,6 +39,19 @@ from sglang.srt.distributed import (
|
|||||||
set_mscclpp_all_reduce,
|
set_mscclpp_all_reduce,
|
||||||
)
|
)
|
||||||
from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state
|
from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state
|
||||||
|
from sglang.srt.eplb.eplb_manager import EPLBManager
|
||||||
|
from sglang.srt.eplb.expert_distribution import (
|
||||||
|
ExpertDistributionRecorder,
|
||||||
|
get_global_expert_distribution_recorder,
|
||||||
|
set_global_expert_distribution_recorder,
|
||||||
|
)
|
||||||
|
from sglang.srt.eplb.expert_location import (
|
||||||
|
ExpertLocationMetadata,
|
||||||
|
compute_initial_expert_location_metadata,
|
||||||
|
get_global_expert_location_metadata,
|
||||||
|
set_global_expert_location_metadata,
|
||||||
|
)
|
||||||
|
from sglang.srt.eplb.expert_location_updater import ExpertLocationUpdater
|
||||||
from sglang.srt.layers.attention.tbo_backend import TboAttnBackend
|
from sglang.srt.layers.attention.tbo_backend import TboAttnBackend
|
||||||
from sglang.srt.layers.dp_attention import (
|
from sglang.srt.layers.dp_attention import (
|
||||||
get_attention_tp_group,
|
get_attention_tp_group,
|
||||||
@@ -54,18 +67,6 @@ from sglang.srt.layers.sampler import Sampler
|
|||||||
from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
|
from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
|
||||||
from sglang.srt.layers.utils import is_sm100_supported
|
from sglang.srt.layers.utils import is_sm100_supported
|
||||||
from sglang.srt.lora.lora_manager import LoRAManager
|
from sglang.srt.lora.lora_manager import LoRAManager
|
||||||
from sglang.srt.managers.eplb_manager import EPLBManager
|
|
||||||
from sglang.srt.managers.expert_distribution import (
|
|
||||||
ExpertDistributionRecorder,
|
|
||||||
get_global_expert_distribution_recorder,
|
|
||||||
set_global_expert_distribution_recorder,
|
|
||||||
)
|
|
||||||
from sglang.srt.managers.expert_location import (
|
|
||||||
ExpertLocationMetadata,
|
|
||||||
compute_initial_expert_location_metadata,
|
|
||||||
get_global_expert_location_metadata,
|
|
||||||
set_global_expert_location_metadata,
|
|
||||||
)
|
|
||||||
from sglang.srt.managers.schedule_batch import (
|
from sglang.srt.managers.schedule_batch import (
|
||||||
GLOBAL_SERVER_ARGS_KEYS,
|
GLOBAL_SERVER_ARGS_KEYS,
|
||||||
global_server_args_dict,
|
global_server_args_dict,
|
||||||
@@ -84,7 +85,6 @@ from sglang.srt.mem_cache.memory_pool import (
|
|||||||
SWAKVPool,
|
SWAKVPool,
|
||||||
)
|
)
|
||||||
from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
|
from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
|
||||||
from sglang.srt.model_executor.expert_location_updater import ExpertLocationUpdater
|
|
||||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
|
||||||
from sglang.srt.model_loader import get_model
|
from sglang.srt.model_loader import get_model
|
||||||
from sglang.srt.model_loader.loader import DefaultModelLoader, get_model_loader
|
from sglang.srt.model_loader.loader import DefaultModelLoader, get_model_loader
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ from torch import nn
|
|||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
from sglang.srt.distributed import get_tensor_model_parallel_world_size
|
from sglang.srt.distributed import get_tensor_model_parallel_world_size
|
||||||
|
from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
|
||||||
from sglang.srt.layers.layernorm import RMSNorm
|
from sglang.srt.layers.layernorm import RMSNorm
|
||||||
from sglang.srt.layers.logits_processor import LogitsProcessor
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
||||||
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
||||||
@@ -28,9 +29,6 @@ from sglang.srt.layers.vocab_parallel_embedding import (
|
|||||||
ParallelLMHead,
|
ParallelLMHead,
|
||||||
VocabParallelEmbedding,
|
VocabParallelEmbedding,
|
||||||
)
|
)
|
||||||
from sglang.srt.managers.expert_distribution import (
|
|
||||||
get_global_expert_distribution_recorder,
|
|
||||||
)
|
|
||||||
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
||||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
||||||
from sglang.srt.models.deepseek_v2 import DeepseekV2DecoderLayer, DeepseekV3ForCausalLM
|
from sglang.srt.models.deepseek_v2 import DeepseekV2DecoderLayer, DeepseekV3ForCausalLM
|
||||||
|
|||||||
@@ -32,6 +32,9 @@ from sglang.srt.distributed import (
|
|||||||
parallel_state,
|
parallel_state,
|
||||||
tensor_model_parallel_all_reduce,
|
tensor_model_parallel_all_reduce,
|
||||||
)
|
)
|
||||||
|
from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
|
||||||
|
from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
|
||||||
|
from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
|
||||||
from sglang.srt.layers.activation import SiluAndMul
|
from sglang.srt.layers.activation import SiluAndMul
|
||||||
from sglang.srt.layers.communicator import (
|
from sglang.srt.layers.communicator import (
|
||||||
LayerCommunicator,
|
LayerCommunicator,
|
||||||
@@ -77,11 +80,6 @@ from sglang.srt.layers.vocab_parallel_embedding import (
|
|||||||
ParallelLMHead,
|
ParallelLMHead,
|
||||||
VocabParallelEmbedding,
|
VocabParallelEmbedding,
|
||||||
)
|
)
|
||||||
from sglang.srt.managers.expert_distribution import (
|
|
||||||
get_global_expert_distribution_recorder,
|
|
||||||
)
|
|
||||||
from sglang.srt.managers.expert_location import ModelConfigForExpertLocation
|
|
||||||
from sglang.srt.managers.expert_location_dispatch import ExpertLocationDispatchInfo
|
|
||||||
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
||||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
||||||
from sglang.srt.model_loader.weight_utils import default_weight_loader
|
from sglang.srt.model_loader.weight_utils import default_weight_loader
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ from sglang.srt.distributed import (
|
|||||||
get_tensor_model_parallel_world_size,
|
get_tensor_model_parallel_world_size,
|
||||||
tensor_model_parallel_all_reduce,
|
tensor_model_parallel_all_reduce,
|
||||||
)
|
)
|
||||||
|
from sglang.srt.eplb.expert_distribution import ExpertDistributionRecorder
|
||||||
from sglang.srt.layers.activation import SiluAndMul
|
from sglang.srt.layers.activation import SiluAndMul
|
||||||
from sglang.srt.layers.layernorm import RMSNorm
|
from sglang.srt.layers.layernorm import RMSNorm
|
||||||
from sglang.srt.layers.linear import (
|
from sglang.srt.layers.linear import (
|
||||||
@@ -48,7 +49,6 @@ from sglang.srt.layers.vocab_parallel_embedding import (
|
|||||||
ParallelLMHead,
|
ParallelLMHead,
|
||||||
VocabParallelEmbedding,
|
VocabParallelEmbedding,
|
||||||
)
|
)
|
||||||
from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder
|
|
||||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
||||||
from sglang.srt.model_loader.weight_utils import (
|
from sglang.srt.model_loader.weight_utils import (
|
||||||
default_weight_loader,
|
default_weight_loader,
|
||||||
|
|||||||
@@ -31,6 +31,11 @@ from sglang.srt.distributed import (
|
|||||||
get_tensor_model_parallel_world_size,
|
get_tensor_model_parallel_world_size,
|
||||||
tensor_model_parallel_all_reduce,
|
tensor_model_parallel_all_reduce,
|
||||||
)
|
)
|
||||||
|
from sglang.srt.eplb.expert_distribution import (
|
||||||
|
ExpertDistributionRecorder,
|
||||||
|
get_global_expert_distribution_recorder,
|
||||||
|
)
|
||||||
|
from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
|
||||||
from sglang.srt.layers.activation import SiluAndMul
|
from sglang.srt.layers.activation import SiluAndMul
|
||||||
from sglang.srt.layers.communicator import (
|
from sglang.srt.layers.communicator import (
|
||||||
LayerCommunicator,
|
LayerCommunicator,
|
||||||
@@ -64,11 +69,6 @@ from sglang.srt.layers.vocab_parallel_embedding import (
|
|||||||
ParallelLMHead,
|
ParallelLMHead,
|
||||||
VocabParallelEmbedding,
|
VocabParallelEmbedding,
|
||||||
)
|
)
|
||||||
from sglang.srt.managers.expert_distribution import (
|
|
||||||
ExpertDistributionRecorder,
|
|
||||||
get_global_expert_distribution_recorder,
|
|
||||||
)
|
|
||||||
from sglang.srt.managers.expert_location import ModelConfigForExpertLocation
|
|
||||||
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
||||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
|
||||||
from sglang.srt.model_loader.weight_utils import default_weight_loader
|
from sglang.srt.model_loader.weight_utils import default_weight_loader
|
||||||
|
|||||||
@@ -32,6 +32,9 @@ from sglang.srt.distributed import (
|
|||||||
tensor_model_parallel_all_gather,
|
tensor_model_parallel_all_gather,
|
||||||
tensor_model_parallel_all_reduce,
|
tensor_model_parallel_all_reduce,
|
||||||
)
|
)
|
||||||
|
from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
|
||||||
|
from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
|
||||||
|
from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
|
||||||
from sglang.srt.layers.activation import SiluAndMul
|
from sglang.srt.layers.activation import SiluAndMul
|
||||||
from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
|
from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
|
||||||
from sglang.srt.layers.dp_attention import (
|
from sglang.srt.layers.dp_attention import (
|
||||||
@@ -63,11 +66,6 @@ from sglang.srt.layers.vocab_parallel_embedding import (
|
|||||||
ParallelLMHead,
|
ParallelLMHead,
|
||||||
VocabParallelEmbedding,
|
VocabParallelEmbedding,
|
||||||
)
|
)
|
||||||
from sglang.srt.managers.expert_distribution import (
|
|
||||||
get_global_expert_distribution_recorder,
|
|
||||||
)
|
|
||||||
from sglang.srt.managers.expert_location import ModelConfigForExpertLocation
|
|
||||||
from sglang.srt.managers.expert_location_dispatch import ExpertLocationDispatchInfo
|
|
||||||
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
||||||
from sglang.srt.model_executor.forward_batch_info import (
|
from sglang.srt.model_executor.forward_batch_info import (
|
||||||
ForwardBatch,
|
ForwardBatch,
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ import torch.distributed
|
|||||||
import torch.multiprocessing as mp
|
import torch.multiprocessing as mp
|
||||||
from torch.multiprocessing import Process
|
from torch.multiprocessing import Process
|
||||||
|
|
||||||
from sglang.srt.model_executor import expert_location_updater
|
from sglang.srt.eplb import expert_location_updater
|
||||||
from sglang.test.test_utils import CustomTestCase, find_available_port
|
from sglang.test.test_utils import CustomTestCase, find_available_port
|
||||||
from sglang.utils import is_in_ci
|
from sglang.utils import is_in_ci
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user