diff --git a/python/sglang/srt/utils/common.py b/python/sglang/srt/utils/common.py index 97415b280..14637d672 100644 --- a/python/sglang/srt/utils/common.py +++ b/python/sglang/srt/utils/common.py @@ -452,7 +452,15 @@ def get_available_gpu_memory( if empty_cache: torch.cuda.empty_cache() - free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id) + SHARED_SYSMEM_DEVICE_MEM_SMS = (87, 110, 121) # Orin, Thor, Spark + if get_device_sm() in SHARED_SYSMEM_DEVICE_MEM_SMS: + # On these devices, which use sysmem as device mem, torch.cuda.mem_get_info() + # only reports "free" memory, which can be lower than what is actually + # available due to not including cache memory. So we use the system available + # memory metric instead. + free_gpu_memory = psutil.virtual_memory().available + else: + free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id) elif device == "xpu": num_gpus = torch.xpu.device_count() diff --git a/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu b/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu index e18f2057b..5c9eeae80 100644 --- a/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu +++ b/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu @@ -568,7 +568,7 @@ void scaled_fp4_experts_quant_sm100a( torch::Tensor const& input_offset_by_experts, torch::Tensor const& output_scale_offset_by_experts) { auto sm_version = getSMVersion(); - TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a"); + TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+"); CHECK_INPUT(output, "output must be a CUDA tensor"); CHECK_INPUT(output_scale, "output_scale must be a CUDA tensor"); @@ -652,7 +652,7 @@ void silu_and_mul_scaled_fp4_experts_quant_sm100a( torch::Tensor const& mask, bool use_silu_and_mul) { auto sm_version = getSMVersion(); - TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a"); + TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+"); CHECK_INPUT(output, "output must be a CUDA tensor"); CHECK_INPUT(output_scale, "output_scale must be a CUDA tensor"); diff --git a/sgl-kernel/csrc/gemm/nvfp4_quant.cuh b/sgl-kernel/csrc/gemm/nvfp4_quant.cuh index b2aa5f006..9fe2d9e3a 100644 --- a/sgl-kernel/csrc/gemm/nvfp4_quant.cuh +++ b/sgl-kernel/csrc/gemm/nvfp4_quant.cuh @@ -50,8 +50,9 @@ constexpr int CVT_FP4_SF_VEC_SIZE = 16; // Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t). inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) { - // PTX instructions used here requires sm100a/sm103a. -#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED + // PTX instructions used here requires >= sm100f. +#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \ + (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ > 1000)) uint32_t val; asm volatile( "{\n" @@ -76,14 +77,17 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) { "f"(array[7])); return val; #else + printf("fp32_vec_to_e2m1 is not supported on this architecture\n"); + __trap(); return 0; #endif } // Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t). inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) { - // PTX instructions used here requires sm100a/sm103a. -#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED + // PTX instructions used here requires >= sm100f. +#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \ + (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ > 1000)) uint32_t val; asm volatile( "{\n" @@ -108,6 +112,8 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) { "f"(array[3].y)); return val; #else + printf("fp32_vec_to_e2m1 is not supported on this architecture\n"); + __trap(); return 0; #endif } diff --git a/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu b/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu index d307f5fb7..e4b1fad8f 100644 --- a/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu +++ b/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu @@ -202,7 +202,7 @@ inline int getMultiProcessorCount() { void scaled_fp4_quant_sm100a( torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf) { auto sm_version = getSMVersion(); - TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a"); + TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+"); int32_t m = input.size(0); int32_t n = input.size(1);