Rename triton_fused_moe -> fused_moe_triton (#2163)
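The diff below renames the package sglang.srt.layers.triton_fused_moe to sglang.srt.layers.fused_moe_triton and moves the Grok-specific copy from sglang.srt.layers.fused_moe to sglang.srt.layers.fused_moe_grok. For downstream code the change is purely the import path; a minimal before/after sketch (module paths are the ones in the diff, the alias is illustrative):

    # Old import paths, removed by this commit:
    #   from sglang.srt.layers.triton_fused_moe import FusedMoE, fused_moe
    #   from sglang.srt.layers.fused_moe import FusedMoE  # Grok variant

    # New import paths:
    from sglang.srt.layers.fused_moe_triton import FusedMoE, fused_moe
    from sglang.srt.layers.fused_moe_grok import FusedMoE as GrokFusedMoE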
python/sglang/srt/layers/fused_moe/__init__.py (deleted)
@@ -1 +0,0 @@
-from sglang.srt.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase

python/sglang/srt/layers/fused_moe_grok/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+from sglang.srt.layers.fused_moe_grok.layer import FusedMoE, FusedMoEMethodBase
@@ -20,7 +20,7 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.quantization.fp8 import Fp8Config
 from vllm.model_executor.utils import set_weight_attrs

-from sglang.srt.layers.fused_moe.fused_moe import padding_size
+from sglang.srt.layers.fused_moe_grok.fused_moe import padding_size
 from sglang.srt.utils import is_hip

 logger = init_logger(__name__)
@@ -123,7 +123,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         num_expert_group: Optional[int],
         topk_group: Optional[int],
     ) -> torch.Tensor:
-        from sglang.srt.layers.fused_moe.fused_moe import fused_moe
+        from sglang.srt.layers.fused_moe_grok.fused_moe import fused_moe

         return fused_moe(
             x,
@@ -609,7 +609,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         topk_group: Optional[int] = None,
     ) -> torch.Tensor:

-        from sglang.srt.layers.fused_moe.fused_moe import fused_moe
+        from sglang.srt.layers.fused_moe_grok.fused_moe import fused_moe

         return fused_moe(
             x,
@@ -1,14 +1,14 @@
 from contextlib import contextmanager
 from typing import Any, Dict, Optional

-import sglang.srt.layers.triton_fused_moe.fused_moe  # noqa
-from sglang.srt.layers.triton_fused_moe.fused_moe import (
+import sglang.srt.layers.fused_moe_triton.fused_moe  # noqa
+from sglang.srt.layers.fused_moe_triton.fused_moe import (
     fused_experts,
     fused_topk,
     get_config_file_name,
     grouped_topk,
 )
-from sglang.srt.layers.triton_fused_moe.layer import (
+from sglang.srt.layers.fused_moe_triton.layer import (
     FusedMoE,
     FusedMoEMethodBase,
     FusedMoeWeightScaleSupported,
@@ -376,7 +376,7 @@ def try_get_optimal_moe_config(
     M: int,
     is_marlin: bool = False,
 ):
-    from sglang.srt.layers.triton_fused_moe import get_config
+    from sglang.srt.layers.fused_moe_triton import get_config

     override_config = get_config()
     if override_config:
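The get_config() call above (re-exported from fused_moe_triton/__init__.py, whose imports appear in the @@ -1,14 +1,14 @@ hunk) lets callers override the tuned Triton kernel config. The diff does not show its implementation; the following is a hypothetical sketch of the pattern those imports suggest, not the code from this commit:

    from contextlib import contextmanager
    from typing import Any, Dict, Optional

    _config: Optional[Dict[str, Any]] = None  # module-level override slot (assumed)

    @contextmanager
    def override_config(config: Dict[str, Any]):
        # Temporarily install a user-supplied MoE kernel config.
        global _config
        old, _config = _config, config
        try:
            yield
        finally:
            _config = old

    def get_config() -> Optional[Dict[str, Any]]:
        # try_get_optimal_moe_config() checks this before its tuned defaults.
        return _config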
@@ -20,7 +20,7 @@ from sglang.srt.layers.quantization.base_config import (
 from sglang.srt.utils import set_weight_attrs

 if torch.cuda.is_available() or torch.hip.is_available():
-    from sglang.srt.layers.triton_fused_moe.fused_moe import fused_experts
+    from sglang.srt.layers.fused_moe_triton.fused_moe import fused_experts
 else:
     fused_experts = None  # type: ignore

@@ -514,7 +514,7 @@ class FusedMoE(torch.nn.Module):
         num_expert_group: Optional[int] = None,
         custom_routing_function: Optional[Callable] = None,
     ):
-        from sglang.srt.layers.triton_fused_moe.fused_moe import (
+        from sglang.srt.layers.fused_moe_triton.fused_moe import (
             fused_topk,
             grouped_topk,
         )
@@ -68,7 +68,7 @@ def fp8_get_quant_method(self, layer, prefix):
         is_layer_skipped,
     )

-    from sglang.srt.layers.triton_fused_moe.layer import FusedMoE
+    from sglang.srt.layers.fused_moe_triton.layer import FusedMoE

     if isinstance(layer, LinearBase):
         if is_layer_skipped(prefix, self.ignored_layers):
@@ -28,6 +28,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.transformers_utils.configs.dbrx import DbrxConfig

+from sglang.srt.layers.fused_moe_triton import fused_moe
 from sglang.srt.layers.linear import (
     QKVParallelLinear,
     ReplicatedLinear,
@@ -36,7 +37,6 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.triton_fused_moe import fused_moe
 from sglang.srt.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE,
     ParallelLMHead,
@@ -30,6 +30,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.fused_moe_triton import fused_moe
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     MergedColumnParallelLinear,
@@ -40,7 +41,6 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.triton_fused_moe import fused_moe
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
@@ -31,6 +31,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.fused_moe_triton import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
@@ -41,7 +42,6 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.triton_fused_moe import FusedMoE
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
@@ -31,7 +31,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.loader import DefaultModelLoader
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

-from sglang.srt.layers.fused_moe import FusedMoE
+from sglang.srt.layers.fused_moe_grok import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     QKVParallelLinear,
@@ -25,6 +25,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

+from sglang.srt.layers.fused_moe_triton import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     QKVParallelLinear,
@@ -35,7 +36,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
-from sglang.srt.layers.triton_fused_moe import FusedMoE
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
@@ -38,11 +38,11 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.utils import print_warning_once

 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.fused_moe_triton import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.triton_fused_moe import FusedMoE
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
@@ -30,6 +30,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.fused_moe_triton import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     MergedColumnParallelLinear,
@@ -41,7 +42,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
-from sglang.srt.layers.triton_fused_moe import FusedMoE
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
@@ -34,10 +34,10 @@ from vllm.model_executor.layers.linear import (
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

+from sglang.srt.layers.fused_moe_triton import fused_moe
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.triton_fused_moe import fused_moe
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
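For reference, after this rename a model file calls the Triton MoE entry point roughly as below. Argument names and shapes are assumptions based on the vLLM-derived kernel this package vendors; check fused_moe_triton/fused_moe.py for the exact signature:

    import torch
    from sglang.srt.layers.fused_moe_triton import fused_moe

    # Illustrative shapes: 4 tokens, hidden 128, 8 experts, intermediate 128, top-2 routing.
    x = torch.randn(4, 128, device="cuda", dtype=torch.float16)
    w1 = torch.randn(8, 256, 128, device="cuda", dtype=torch.float16)  # fused gate+up proj
    w2 = torch.randn(8, 128, 128, device="cuda", dtype=torch.float16)  # down proj
    router_logits = torch.randn(4, 8, device="cuda", dtype=torch.float16)

    out = fused_moe(x, w1, w2, router_logits, topk=2, renormalize=True)  # -> (4, 128)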