cleanup deps 1/n (#4400)

Co-authored-by: sleepcoo <sleepcoo@gmail.com>
2025-03-14 00:00:33 -07:00
parent 0e0ec70200
commit 977d7cd26a
4 changed files with 30 additions and 35 deletions
--- a/python/sglang/srt/layers/moe/ep_moe/layer.py
+++ b/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -3,7 +3,6 @@ from typing import Callable, List, Optional, Tuple

 import torch
 from torch.nn import Module
-from vllm import _custom_ops as vllm_ops

 from sglang.srt.custom_op import CustomOp
 from sglang.srt.distributed import (
@@ -32,6 +31,8 @@ _is_cuda = is_cuda()

 if _is_cuda:
    from sglang.srt.custom_op import scaled_fp8_quant as sgl_scaled_fp8_quant
+else:
+    from vllm import _custom_ops as vllm_ops


 logger = logging.getLogger(__name__)
--- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -11,7 +11,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 import torch
 import triton
 import triton.language as tl
-from vllm import _custom_ops as vllm_ops

 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
@@ -46,6 +45,8 @@ if _is_cuda:
    from sglang.srt.layers.quantization.fp8_kernel import (
        sglang_per_token_group_quant_fp8,
    )
+else:
+    from vllm import _custom_ops as vllm_ops

 if _is_cuda or _is_hip:
    from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
@@ -456,38 +457,8 @@ def moe_align_block_size(
        (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
    )
    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
-    if num_experts >= 224:
-        if enable_moe_align_block_size_triton:
-            moe_align_block_size_triton(
-                topk_ids,
-                num_experts,
-                block_size,
-                sorted_ids,
-                expert_ids,
-                num_tokens_post_pad,
-            )
-        else:
-            token_cnts_buffer = torch.zeros(
-                (num_experts + 1) * num_experts,
-                dtype=torch.int32,
-                device=topk_ids.device,
-            )
-            cumsum_buffer = torch.zeros(
-                num_experts + 1, dtype=torch.int32, device=topk_ids.device
-            )
-
-            sgl_moe_align_block_size(
-                topk_ids,
-                num_experts,
-                block_size,
-                sorted_ids,
-                expert_ids,
-                num_tokens_post_pad,
-                token_cnts_buffer,
-                cumsum_buffer,
-            )
-    else:
-        vllm_ops.moe_align_block_size(
+    if enable_moe_align_block_size_triton:
+        moe_align_block_size_triton(
            topk_ids,
            num_experts,
            block_size,
@@ -495,6 +466,26 @@ def moe_align_block_size(
            expert_ids,
            num_tokens_post_pad,
        )
+    else:
+        token_cnts_buffer = torch.zeros(
+            (num_experts + 1) * num_experts,
+            dtype=torch.int32,
+            device=topk_ids.device,
+        )
+        cumsum_buffer = torch.zeros(
+            num_experts + 1, dtype=torch.int32, device=topk_ids.device
+        )
+
+        sgl_moe_align_block_size(
+            topk_ids,
+            num_experts,
+            block_size,
+            sorted_ids,
+            expert_ids,
+            num_tokens_post_pad,
+            token_cnts_buffer,
+            cumsum_buffer,
+        )
    return sorted_ids, expert_ids, num_tokens_post_pad


--- a/python/sglang/srt/layers/rotary_embedding.py
+++ b/python/sglang/srt/layers/rotary_embedding.py
@@ -6,7 +6,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union

 import torch
 import torch.nn as nn
-from vllm import _custom_ops as ops

 from sglang.srt.custom_op import CustomOp
 from sglang.srt.utils import is_cuda_available
@@ -14,6 +13,8 @@ from sglang.srt.utils import is_cuda_available
 _is_cuda_available = is_cuda_available()
 if _is_cuda_available:
    from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
+else:
+    from vllm import _custom_ops as ops


 def _rotate_neox(x: torch.Tensor) -> torch.Tensor: