[NVIDIA] Add new SMs support for Spark & Thor (#11287)

Signed-off-by: Serge Panev <spanev@nvidia.com>
2025-10-21 11:02:24 -07:00
parent 97710ccd1a
commit 2b1da821b5
4 changed files with 22 additions and 8 deletions
--- a/python/sglang/srt/utils/common.py
+++ b/python/sglang/srt/utils/common.py
@@ -452,7 +452,15 @@ def get_available_gpu_memory(

        if empty_cache:
            torch.cuda.empty_cache()
-        free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
+        SHARED_SYSMEM_DEVICE_MEM_SMS = (87, 110, 121)  # Orin, Thor, Spark
+        if get_device_sm() in SHARED_SYSMEM_DEVICE_MEM_SMS:
+            # On these devices, which use sysmem as device mem, torch.cuda.mem_get_info()
+            # only reports "free" memory, which can be lower than what is actually
+            # available due to not including cache memory. So we use the system available
+            # memory metric instead.
+            free_gpu_memory = psutil.virtual_memory().available
+        else:
+            free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)

    elif device == "xpu":
        num_gpus = torch.xpu.device_count()
--- a/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu
+++ b/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu
@@ -568,7 +568,7 @@ void scaled_fp4_experts_quant_sm100a(
    torch::Tensor const& input_offset_by_experts,
    torch::Tensor const& output_scale_offset_by_experts) {
  auto sm_version = getSMVersion();
-  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+  TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+");

  CHECK_INPUT(output, "output must be a CUDA tensor");
  CHECK_INPUT(output_scale, "output_scale must be a CUDA tensor");
@@ -652,7 +652,7 @@ void silu_and_mul_scaled_fp4_experts_quant_sm100a(
    torch::Tensor const& mask,
    bool use_silu_and_mul) {
  auto sm_version = getSMVersion();
-  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+  TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+");

  CHECK_INPUT(output, "output must be a CUDA tensor");
  CHECK_INPUT(output_scale, "output_scale must be a CUDA tensor");
--- a/sgl-kernel/csrc/gemm/nvfp4_quant.cuh
+++ b/sgl-kernel/csrc/gemm/nvfp4_quant.cuh
@@ -50,8 +50,9 @@ constexpr int CVT_FP4_SF_VEC_SIZE = 16;

 // Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
 inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
-  // PTX instructions used here requires sm100a/sm103a.
-#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED
+  // PTX instructions used here requires >= sm100f.
+#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \
+    (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ > 1000))
  uint32_t val;
  asm volatile(
      "{\n"
@@ -76,14 +77,17 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
        "f"(array[7]));
  return val;
 #else
+  printf("fp32_vec_to_e2m1 is not supported on this architecture\n");
+  __trap();
  return 0;
 #endif
 }

 // Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
 inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
-  // PTX instructions used here requires sm100a/sm103a.
-#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED
+  // PTX instructions used here requires >= sm100f.
+#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \
+    (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ > 1000))
  uint32_t val;
  asm volatile(
      "{\n"
@@ -108,6 +112,8 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
        "f"(array[3].y));
  return val;
 #else
+  printf("fp32_vec_to_e2m1 is not supported on this architecture\n");
+  __trap();
  return 0;
 #endif
 }
--- a/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu
+++ b/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu
@@ -202,7 +202,7 @@ inline int getMultiProcessorCount() {
 void scaled_fp4_quant_sm100a(
    torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf) {
  auto sm_version = getSMVersion();
-  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+  TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+");

  int32_t m = input.size(0);
  int32_t n = input.size(1);