From 977d7cd26abda920b9a60399534e50bac9e589fa Mon Sep 17 00:00:00 2001
From: Yineng Zhang
Date: Fri, 14 Mar 2025 00:00:33 -0700
Subject: [PATCH] cleanup deps 1/n (#4400)

Stop importing vllm._custom_ops unconditionally in ep_moe/layer.py,
fused_moe_triton/fused_moe.py, and rotary_embedding.py; import it only
on non-CUDA platforms, where the sgl-kernel replacements are
unavailable. moe_align_block_size now always uses the Triton or
sgl-kernel path instead of falling back to vllm when num_experts < 224.
The AMD CI jobs uninstall any preinstalled sgl-kernel before building
it from source.

Co-authored-by: sleepcoo
---
 .github/workflows/pr-test-amd.yml             |  2 +
 python/sglang/srt/layers/moe/ep_moe/layer.py  |  3 +-
 .../layers/moe/fused_moe_triton/fused_moe.py  | 57 ++++++++-----------
 python/sglang/srt/layers/rotary_embedding.py  |  3 +-
 4 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index f21eaba0b..3e1f4e051 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -45,6 +45,7 @@ jobs:
       - name: Install dependencies
         run: |
           docker exec ci_sglang pip install --upgrade pip
+          docker exec ci_sglang pip uninstall sgl-kernel -y || true
           docker exec -w /sglang-checkout/sgl-kernel ci_sglang python3 setup_rocm.py install
           docker exec ci_sglang pip install -e "python[dev_hip]"
 
@@ -83,6 +84,7 @@ jobs:
       - name: Install dependencies
         run: |
           docker exec ci_sglang pip install --upgrade pip
+          docker exec ci_sglang pip uninstall sgl-kernel -y || true
           docker exec -w /sglang-checkout/sgl-kernel ci_sglang python3 setup_rocm.py install
           docker exec ci_sglang pip install -e "python[dev_hip]"
 
diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py
index c94437221..e862660d7 100644
--- a/python/sglang/srt/layers/moe/ep_moe/layer.py
+++ b/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -3,7 +3,6 @@ from typing import Callable, List, Optional, Tuple
 
 import torch
 from torch.nn import Module
-from vllm import _custom_ops as vllm_ops
 
 from sglang.srt.custom_op import CustomOp
 from sglang.srt.distributed import (
@@ -32,6 +31,8 @@
 _is_cuda = is_cuda()
 if _is_cuda:
     from sglang.srt.custom_op import scaled_fp8_quant as sgl_scaled_fp8_quant
+else:
+    from vllm import _custom_ops as vllm_ops
 
 logger = logging.getLogger(__name__)
 
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
index 2c2e213aa..fbf676b8c 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -11,7 +11,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 import torch
 import triton
 import triton.language as tl
-from vllm import _custom_ops as vllm_ops
 
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
@@ -46,6 +45,8 @@ if _is_cuda:
     from sglang.srt.layers.quantization.fp8_kernel import (
         sglang_per_token_group_quant_fp8,
     )
+else:
+    from vllm import _custom_ops as vllm_ops
 
 if _is_cuda or _is_hip:
     from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
@@ -456,38 +457,8 @@ def moe_align_block_size(
         (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
     )
     num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
-    if num_experts >= 224:
-        if enable_moe_align_block_size_triton:
-            moe_align_block_size_triton(
-                topk_ids,
-                num_experts,
-                block_size,
-                sorted_ids,
-                expert_ids,
-                num_tokens_post_pad,
-            )
-        else:
-            token_cnts_buffer = torch.zeros(
-                (num_experts + 1) * num_experts,
-                dtype=torch.int32,
-                device=topk_ids.device,
-            )
-            cumsum_buffer = torch.zeros(
-                num_experts + 1, dtype=torch.int32, device=topk_ids.device
-            )
-
-            sgl_moe_align_block_size(
-                topk_ids,
-                num_experts,
-                block_size,
-                sorted_ids,
-                expert_ids,
-                num_tokens_post_pad,
-                token_cnts_buffer,
-                cumsum_buffer,
-            )
-    else:
-        vllm_ops.moe_align_block_size(
+    if enable_moe_align_block_size_triton:
+        moe_align_block_size_triton(
             topk_ids,
             num_experts,
             block_size,
@@ -495,6 +466,26 @@
             expert_ids,
             num_tokens_post_pad,
         )
+    else:
+        token_cnts_buffer = torch.zeros(
+            (num_experts + 1) * num_experts,
+            dtype=torch.int32,
+            device=topk_ids.device,
+        )
+        cumsum_buffer = torch.zeros(
+            num_experts + 1, dtype=torch.int32, device=topk_ids.device
+        )
+
+        sgl_moe_align_block_size(
+            topk_ids,
+            num_experts,
+            block_size,
+            sorted_ids,
+            expert_ids,
+            num_tokens_post_pad,
+            token_cnts_buffer,
+            cumsum_buffer,
+        )
 
     return sorted_ids, expert_ids, num_tokens_post_pad
 
diff --git a/python/sglang/srt/layers/rotary_embedding.py b/python/sglang/srt/layers/rotary_embedding.py
index 732395765..f5608cf5a 100644
--- a/python/sglang/srt/layers/rotary_embedding.py
+++ b/python/sglang/srt/layers/rotary_embedding.py
@@ -6,7 +6,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
-from vllm import _custom_ops as ops
 
 from sglang.srt.custom_op import CustomOp
 from sglang.srt.utils import is_cuda_available
@@ -14,6 +13,8 @@ from sglang.srt.utils import is_cuda_available
 _is_cuda_available = is_cuda_available()
 if _is_cuda_available:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
+else:
+    from vllm import _custom_ops as ops
 
 
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor: