diff --git a/python/sglang/srt/utils/common.py b/python/sglang/srt/utils/common.py
index 97415b280..14637d672 100644
--- a/python/sglang/srt/utils/common.py
+++ b/python/sglang/srt/utils/common.py
@@ -452,7 +452,15 @@ def get_available_gpu_memory(
 
         if empty_cache:
             torch.cuda.empty_cache()
-        free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
+        SHARED_SYSMEM_DEVICE_MEM_SMS = (87, 110, 121)  # Orin, Thor, Spark
+        if get_device_sm() in SHARED_SYSMEM_DEVICE_MEM_SMS:
+            # On these devices, which use sysmem as device mem, torch.cuda.mem_get_info()
+            # only reports "free" memory, which can be lower than what is actually
+            # available due to not including cache memory. So we use the system available
+            # memory metric instead.
+            free_gpu_memory = psutil.virtual_memory().available
+        else:
+            free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
 
     elif device == "xpu":
         num_gpus = torch.xpu.device_count()
diff --git a/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu b/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu
index e18f2057b..5c9eeae80 100644
--- a/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu
+++ b/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu
@@ -568,7 +568,7 @@ void scaled_fp4_experts_quant_sm100a(
     torch::Tensor const& input_offset_by_experts,
     torch::Tensor const& output_scale_offset_by_experts) {
   auto sm_version = getSMVersion();
-  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+  TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+");
 
   CHECK_INPUT(output, "output must be a CUDA tensor");
   CHECK_INPUT(output_scale, "output_scale must be a CUDA tensor");
@@ -652,7 +652,7 @@ void silu_and_mul_scaled_fp4_experts_quant_sm100a(
     torch::Tensor const& mask,
     bool use_silu_and_mul) {
   auto sm_version = getSMVersion();
-  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+  TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+");
 
   CHECK_INPUT(output, "output must be a CUDA tensor");
   CHECK_INPUT(output_scale, "output_scale must be a CUDA tensor");
diff --git a/sgl-kernel/csrc/gemm/nvfp4_quant.cuh b/sgl-kernel/csrc/gemm/nvfp4_quant.cuh
index b2aa5f006..9fe2d9e3a 100644
--- a/sgl-kernel/csrc/gemm/nvfp4_quant.cuh
+++ b/sgl-kernel/csrc/gemm/nvfp4_quant.cuh
@@ -50,8 +50,9 @@ constexpr int CVT_FP4_SF_VEC_SIZE = 16;
 
 // Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
 inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
-  // PTX instructions used here requires sm100a/sm103a.
-#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED
+  // PTX instructions used here requires >= sm100f.
+#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \
+    (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ > 1000))
   uint32_t val;
   asm volatile(
       "{\n"
@@ -76,14 +77,17 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
         "f"(array[7]));
   return val;
 #else
+  printf("fp32_vec_to_e2m1 is not supported on this architecture\n");
+  __trap();
   return 0;
 #endif
 }
 
 // Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
 inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
-  // PTX instructions used here requires sm100a/sm103a.
-#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED
+  // PTX instructions used here requires >= sm100f.
+#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED || \
+    (defined(__CUDA_ARCH_FAMILY_SPECIFIC__) && (__CUDA_ARCH_FAMILY_SPECIFIC__ > 1000))
   uint32_t val;
   asm volatile(
       "{\n"
@@ -108,6 +112,8 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
         "f"(array[3].y));
   return val;
 #else
+  printf("fp32_vec_to_e2m1 is not supported on this architecture\n");
+  __trap();
   return 0;
 #endif
 }
diff --git a/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu b/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu
index d307f5fb7..e4b1fad8f 100644
--- a/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu
+++ b/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu
@@ -202,7 +202,7 @@ inline int getMultiProcessorCount() {
 void scaled_fp4_quant_sm100a(
     torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf) {
   auto sm_version = getSMVersion();
-  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+  TORCH_CHECK(sm_version >= 100, "fp4_quant is only supported on sm100+");
 
   int32_t m = input.size(0);
   int32_t n = input.size(1);