[NVIDIA] Add new SMs support for Spark & Thor (#11287)

Signed-off-by: Serge Panev <spanev@nvidia.com>
This commit is contained in:
Serge Panev
2025-10-21 11:02:24 -07:00
committed by GitHub
parent 97710ccd1a
commit 2b1da821b5
4 changed files with 22 additions and 8 deletions

View File

@@ -50,8 +50,9 @@ constexpr int CVT_FP4_SF_VEC_SIZE = 16;
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
// PTX instructions used here requires sm100a/sm103a.
#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED
// The PTX used below requires sm100a/sm103a, or any family-specific target
// at or above sm100f. An sm_100f build reports __CUDA_ARCH_FAMILY_SPECIFIC__
// as 1000, so the comparison has to be inclusive; a strict `> 1000` would send
// sm_100f builds to the unsupported-architecture fallback.
#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \
    (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ >= 1000))
uint32_t val;
asm volatile(
"{\n"
@@ -76,14 +77,17 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
"f"(array[7]));
return val;
#else
printf("fp32_vec_to_e2m1 is not supported on this architecture\n");
__trap();
return 0;
#endif
}
// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
// PTX instructions used here requires sm100a/sm103a.
#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED
// The PTX used below requires sm100a/sm103a, or any family-specific target
// at or above sm100f. An sm_100f build reports __CUDA_ARCH_FAMILY_SPECIFIC__
// as 1000, so the comparison has to be inclusive; a strict `> 1000` would send
// sm_100f builds to the unsupported-architecture fallback.
#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \
    (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ >= 1000))
uint32_t val;
asm volatile(
"{\n"
@@ -108,6 +112,8 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
"f"(array[3].y));
return val;
#else
printf("fp32_vec_to_e2m1 is not supported on this architecture\n");
__trap();
return 0;
#endif
}