From fd71b11b1d96d385b09cb79c91a36f1f01293639 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 27 Aug 2025 03:34:29 -0700 Subject: [PATCH] move is_sm90_supported/is_sm100_supported to python/sglang/srt/utils.py (#9679) --- .../srt/layers/attention/flashinfer_backend.py | 7 +++++-- .../srt/layers/attention/flashinfer_mla_backend.py | 7 +++++-- python/sglang/srt/layers/communicator.py | 3 +-- python/sglang/srt/layers/moe/cutlass_moe.py | 8 -------- python/sglang/srt/layers/quantization/fp8.py | 3 ++- python/sglang/srt/layers/quantization/fp8_utils.py | 2 +- python/sglang/srt/layers/quantization/mxfp4.py | 3 +-- python/sglang/srt/layers/utils.py | 14 -------------- python/sglang/srt/model_executor/model_runner.py | 2 +- python/sglang/srt/models/deepseek_v2.py | 5 +++-- python/sglang/srt/models/gpt_oss.py | 3 ++- python/sglang/srt/server_args.py | 3 ++- python/sglang/srt/utils.py | 14 ++++++++++++++ 13 files changed, 37 insertions(+), 37 deletions(-) diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index d1e778e92..6e3418808 100644 --- a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -26,11 +26,14 @@ from sglang.srt.layers.attention.base_attn_backend import AttentionBackend from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton from sglang.srt.layers.dp_attention import get_attention_tp_size from sglang.srt.layers.radix_attention import AttentionType -from sglang.srt.layers.utils import is_sm100_supported from sglang.srt.mem_cache.allocator import SWATokenToKVPoolAllocator from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput -from sglang.srt.utils import is_flashinfer_available, next_power_of_2 +from sglang.srt.utils import ( + is_flashinfer_available, + is_sm100_supported, + next_power_of_2, +) if TYPE_CHECKING: from sglang.srt.layers.radix_attention import RadixAttention diff --git a/python/sglang/srt/layers/attention/flashinfer_mla_backend.py b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py index 846d83288..b3acc8b01 100644 --- a/python/sglang/srt/layers/attention/flashinfer_mla_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py @@ -28,11 +28,14 @@ from sglang.srt.layers.attention.flashinfer_backend import ( create_flashinfer_kv_indices_triton, ) from sglang.srt.layers.dp_attention import get_attention_tp_size -from sglang.srt.layers.utils import is_sm100_supported from sglang.srt.managers.schedule_batch import global_server_args_dict from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput -from sglang.srt.utils import is_flashinfer_available, next_power_of_2 +from sglang.srt.utils import ( + is_flashinfer_available, + is_sm100_supported, + next_power_of_2, +) if TYPE_CHECKING: from sglang.srt.layers.radix_attention import RadixAttention diff --git a/python/sglang/srt/layers/communicator.py b/python/sglang/srt/layers/communicator.py index 6e578afe0..4e422a360 100644 --- a/python/sglang/srt/layers/communicator.py +++ b/python/sglang/srt/layers/communicator.py @@ -40,10 +40,9 @@ from sglang.srt.layers.moe import ( get_moe_a2a_backend, should_use_flashinfer_cutlass_moe_fp4_allgather, ) -from sglang.srt.layers.utils import is_sm100_supported from sglang.srt.managers.schedule_batch import global_server_args_dict from sglang.srt.model_executor.forward_batch_info import ForwardBatch -from sglang.srt.utils import is_cuda, is_flashinfer_available +from sglang.srt.utils import is_cuda, is_flashinfer_available, is_sm100_supported _is_flashinfer_available = is_flashinfer_available() _is_sm100_supported = is_cuda() and is_sm100_supported() diff --git a/python/sglang/srt/layers/moe/cutlass_moe.py b/python/sglang/srt/layers/moe/cutlass_moe.py index 4d9868710..d0fb4e3ef 100755 --- a/python/sglang/srt/layers/moe/cutlass_moe.py +++ b/python/sglang/srt/layers/moe/cutlass_moe.py @@ -1,20 +1,12 @@ """CUTLASS based Fused MoE kernels.""" -import functools -import json -import logging -import os -from typing import Any, Callable, Dict, List, Optional, Tuple - import torch from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams -from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported from sglang.srt.utils import is_cuda _is_cuda = is_cuda() if _is_cuda: - import sgl_kernel from sgl_kernel import ( apply_shuffle_mul_sum, cutlass_fp4_group_mm, diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index 6a199c8f1..4915d4d08 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -64,7 +64,6 @@ from sglang.srt.layers.quantization.utils import ( per_tensor_dequantize, requantize_with_max_scale, ) -from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported from sglang.srt.utils import ( cpu_has_amx_support, get_bool_env_var, @@ -72,6 +71,8 @@ from sglang.srt.utils import ( is_cuda, is_hip, is_npu, + is_sm90_supported, + is_sm100_supported, log_info_on_rank0, next_power_of_2, print_warning_once, diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py index 8dcde41e8..c08cabe5e 100644 --- a/python/sglang/srt/layers/quantization/fp8_utils.py +++ b/python/sglang/srt/layers/quantization/fp8_utils.py @@ -5,7 +5,7 @@ import torch from sglang.srt.layers.quantization import deep_gemm_wrapper from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8 from sglang.srt.layers.quantization.mxfp4_tensor import MXFP4QuantizeUtil -from sglang.srt.layers.utils import is_sm100_supported +from sglang.srt.utils import is_sm100_supported try: from vllm import _custom_ops as ops diff --git a/python/sglang/srt/layers/quantization/mxfp4.py b/python/sglang/srt/layers/quantization/mxfp4.py index ed667f14b..6b2d82e92 100644 --- a/python/sglang/srt/layers/quantization/mxfp4.py +++ b/python/sglang/srt/layers/quantization/mxfp4.py @@ -29,14 +29,13 @@ from sglang.srt.layers.quantization.base_config import ( QuantizeMethodBase, ) from sglang.srt.layers.quantization.utils import is_layer_skipped -from sglang.srt.layers.utils import is_sm100_supported from sglang.srt.managers.schedule_batch import global_server_args_dict from sglang.srt.utils import ( direct_register_custom_op, - get_bool_env_var, is_cuda, is_flashinfer_available, is_hip, + is_sm100_supported, is_triton_kernels_available, log_info_on_rank0, mxfp_supported, diff --git a/python/sglang/srt/layers/utils.py b/python/sglang/srt/layers/utils.py index ac0ddb65c..d79ccc663 100644 --- a/python/sglang/srt/layers/utils.py +++ b/python/sglang/srt/layers/utils.py @@ -34,17 +34,3 @@ class PPMissingLayer(torch.nn.Identity): """ input = args[0] if args else next(iter(kwargs.values())) return (input,) if self.return_tuple else input - - -@lru_cache(maxsize=1) -def is_sm100_supported(device=None) -> bool: - return (torch.cuda.get_device_capability(device)[0] == 10) and ( - torch.version.cuda >= "12.8" - ) - - -@lru_cache(maxsize=1) -def is_sm90_supported(device=None) -> bool: - return (torch.cuda.get_device_capability(device)[0] == 9) and ( - torch.version.cuda >= "12.3" - ) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 293dba061..8d5b7c715 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -66,7 +66,6 @@ from sglang.srt.layers.quantization import ( ) from sglang.srt.layers.sampler import Sampler from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model -from sglang.srt.layers.utils import is_sm100_supported from sglang.srt.lora.lora_manager import LoRAManager from sglang.srt.lora.lora_registry import LoRARef from sglang.srt.managers.schedule_batch import ( @@ -121,6 +120,7 @@ from sglang.srt.utils import ( is_hopper_with_cuda_12_3, is_no_spec_infer_or_topk_one, is_npu, + is_sm100_supported, monkey_patch_p2p_access_check, monkey_patch_vllm_gguf_config, set_cuda_arch, diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index c9305d06e..6c942fcd1 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -87,8 +87,8 @@ from sglang.srt.layers.quantization.int8_utils import ( block_dequant as int8_block_dequant, ) from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.rotary_embedding import get_rope, get_rope_wrapper -from sglang.srt.layers.utils import PPMissingLayer, get_layer_id, is_sm100_supported +from sglang.srt.layers.rotary_embedding import get_rope_wrapper +from sglang.srt.layers.utils import PPMissingLayer, get_layer_id from sglang.srt.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, @@ -114,6 +114,7 @@ from sglang.srt.utils import ( is_flashinfer_available, is_hip, is_non_idle_and_non_empty, + is_sm100_supported, log_info_on_rank0, make_layers, use_intel_amx_backend, diff --git a/python/sglang/srt/models/gpt_oss.py b/python/sglang/srt/models/gpt_oss.py index eda1ed7e7..35c42d26e 100644 --- a/python/sglang/srt/models/gpt_oss.py +++ b/python/sglang/srt/models/gpt_oss.py @@ -58,7 +58,7 @@ from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.quantization.fp8_utils import dequant_mxfp4 from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.layers.rotary_embedding import get_rope -from sglang.srt.layers.utils import PPMissingLayer, get_layer_id, is_sm100_supported +from sglang.srt.layers.utils import PPMissingLayer, get_layer_id from sglang.srt.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, @@ -71,6 +71,7 @@ from sglang.srt.utils import ( add_prefix, is_cuda, is_flashinfer_available, + is_sm100_supported, make_layers, ) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index aa973dec1..757ae295a 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -25,7 +25,6 @@ from typing import List, Literal, Optional, Union from sglang.srt.function_call.function_call_parser import FunctionCallParser from sglang.srt.hf_transformers_utils import check_gguf_file, get_config -from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported from sglang.srt.lora.lora_registry import LoRARef from sglang.srt.reasoning_parser import ReasoningParser from sglang.srt.utils import ( @@ -39,6 +38,8 @@ from sglang.srt.utils import ( is_hip, is_port_available, is_remote_url, + is_sm90_supported, + is_sm100_supported, is_triton_kernels_available, is_valid_ipv6_address, nullable_str, diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index acf011515..1ef3c8fd6 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -172,6 +172,20 @@ def is_blackwell(): return torch.cuda.get_device_capability()[0] == 10 +@lru_cache(maxsize=1) +def is_sm100_supported(device=None) -> bool: + return (torch.cuda.get_device_capability(device)[0] == 10) and ( + torch.version.cuda >= "12.8" + ) + + +@lru_cache(maxsize=1) +def is_sm90_supported(device=None) -> bool: + return (torch.cuda.get_device_capability(device)[0] == 9) and ( + torch.version.cuda >= "12.3" + ) + + _warned_bool_env_var_keys = set()