[NVIDIA] Add new SMs support for Spark & Thor (#11287)

Signed-off-by: Serge Panev <spanev@nvidia.com>
2025-10-21 11:02:24 -07:00
parent 97710ccd1a
commit 2b1da821b5
4 changed files with 22 additions and 8 deletions
--- a/sgl-kernel/csrc/gemm/nvfp4_quant.cuh
+++ b/sgl-kernel/csrc/gemm/nvfp4_quant.cuh
@@ -50,8 +50,9 @@ constexpr int CVT_FP4_SF_VEC_SIZE = 16;

 // Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
 inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
-  // PTX instructions used here requires sm100a/sm103a.
-#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED
+  // PTX instructions used here require >= sm100f.
+#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \
+    (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ > 1000))
  uint32_t val;
  asm volatile(
      "{\n"
@@ -76,14 +77,17 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
        "f"(array[7]));
  return val;
 #else
+  printf("fp32_vec_to_e2m1 is not supported on this architecture\n");
+  __trap();
  return 0;
 #endif
 }

 // Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
 inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
-  // PTX instructions used here requires sm100a/sm103a.
-#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED
+  // PTX instructions used here require >= sm100f.
+#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \
+    (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ > 1000))
  uint32_t val;
  asm volatile(
      "{\n"
@@ -108,6 +112,8 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
        "f"(array[3].y));
  return val;
 #else
+  printf("fp32_vec_to_e2m1 is not supported on this architecture\n");
+  __trap();
  return 0;
 #endif
 }