diff --git a/benchmark/kernels/quantization/tuning_block_wise_fp8.py b/benchmark/kernels/quantization/tuning_block_wise_fp8.py
index 07bdb4bf1..b2df304f0 100644
--- a/benchmark/kernels/quantization/tuning_block_wise_fp8.py
+++ b/benchmark/kernels/quantization/tuning_block_wise_fp8.py
@@ -23,8 +23,13 @@ import torch
 import triton
 from tqdm import tqdm
 
-from sglang.srt.layers.quantization.fp8_kernel import _w8a8_block_fp8_matmul
-from sglang.srt.utils import get_device_name
+from sglang.srt.layers.quantization.fp8_kernel import (
+    _w8a8_block_fp8_matmul,
+    _w8a8_block_fp8_matmul_unrolledx4,
+)
+from sglang.srt.utils import get_device_core_count, get_device_name, is_hip
+
+is_hip_ = is_hip()
 
 DTYPE_MAP = {
     "float32": torch.float32,
@@ -80,7 +85,19 @@ def w8a8_block_fp8_matmul(
             triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
         )
 
-    _w8a8_block_fp8_matmul[grid](
+    # Use manually unrolledx4 kernel on AMD GPU when the grid size is small.
+    # Empirical testing shows the sweet spot is when the number of workgroups
+    # does not exceed the number of compute units available on the device.
+    num_workgroups = triton.cdiv(M, config["BLOCK_SIZE_M"]) * triton.cdiv(
+        N, config["BLOCK_SIZE_N"]
+    )
+    kernel = (
+        _w8a8_block_fp8_matmul_unrolledx4
+        if (is_hip_ == True and num_workgroups <= get_device_core_count())
+        else _w8a8_block_fp8_matmul
+    )
+
+    kernel[grid](
         A,
         B,
         C,
@@ -107,14 +124,15 @@ def w8a8_block_fp8_matmul(
     return C
 
 
-def get_configs_compute_bound():
+def get_rocm_configs_compute_bound():
     configs = []
-    for num_stages in [2, 3, 4, 5]:
-        for block_m in [16, 32, 64, 128, 256]:
-            for block_k in [64, 128]:
-                for block_n in [32, 64, 128, 256]:
+    waves_per_eu_range = 0
+    for num_stages in [2]:
+        for block_m in [32, 64, 128, 256]:
+            for block_k in [32, 64, 128, 256]:
+                for block_n in [16, 32, 64, 128, 256]:
                     for num_warps in [4, 8]:
-                        for group_size in [1, 16, 32, 64]:
+                        for group_size in [1, 4, 8, 16, 32]:
                             configs.append(
                                 {
                                     "BLOCK_SIZE_M": block_m,
@@ -123,11 +141,36 @@ def get_configs_compute_bound():
                                     "GROUP_SIZE_M": group_size,
                                     "num_warps": num_warps,
                                     "num_stages": num_stages,
+                                    "waves_per_eu": waves_per_eu_range,
                                 }
                             )
     return configs
 
 
+def get_configs_compute_bound():
+    """Return the kernel-config search space as a list of dicts.
+
+    Delegates to get_rocm_configs_compute_bound() when is_hip_ is truthy."""
+    if is_hip_:
+        return get_rocm_configs_compute_bound()
+    return [
+        {
+            "BLOCK_SIZE_M": block_m,
+            "BLOCK_SIZE_N": block_n,
+            "BLOCK_SIZE_K": block_k,
+            "GROUP_SIZE_M": group_size,
+            "num_warps": num_warps,
+            "num_stages": num_stages,
+        }
+        for num_stages in [2, 3, 4, 5]
+        for block_m in [16, 32, 64, 128, 256]
+        for block_k in [64, 128]
+        for block_n in [32, 64, 128, 256]
+        for num_warps in [4, 8]
+        for group_size in [1, 16, 32, 64]
+    ]
+
+
 def get_weight_shapes(tp_size):
     # NOTE(HandH1998): The weight shapes only works for DeepSeek-V3. Modify them, if you tune for another different model.
     # cannot TP
@@ -190,14 +233,18 @@ def benchmark_config(
 
 def tune(M, N, K, block_size, out_dtype, search_space):
     factor_for_scale = 1e-2
-    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_info = torch.finfo(torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn)
     fp8_max, fp8_min = fp8_info.max, fp8_info.min
 
     A_fp32 = (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
-    A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(
+        torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn
+    )
 
     B_fp32 = (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
-    B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(
+        torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn
+    )
 
     block_n, block_k = block_size[0], block_size[1]
     n_tiles = (N + block_n - 1) // block_n
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
index a7be90051..e25d0492d 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -72,10 +72,10 @@
         "waves_per_eu": 0
     },
     "64": {
-        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
index c098ef2db..3633e363f 100644
--- a/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -1,15 +1,6 @@
 {
     "1": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
-    "2": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 8,
@@ -17,35 +8,44 @@
         "num_stages": 2,
         "waves_per_eu": 0
     },
-    "4": {
-        "BLOCK_SIZE_M": 64,
+    "2": {
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "8": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "16": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "24": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 32,
@@ -54,10 +54,10 @@
         "waves_per_eu": 0
     },
     "32": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -66,7 +66,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -75,7 +75,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -102,43 +102,43 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "512": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "1024": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "1536": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 8,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 8,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -147,16 +147,16 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "4096": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
diff --git a/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
index 6f5adbb93..80d23e969 100644
--- a/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
+++ b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -1,33 +1,15 @@
 {
     "1": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "2": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
-    "4": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
-    "8": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 32,
@@ -35,29 +17,47 @@
         "num_stages": 2,
         "waves_per_eu": 0
     },
-    "16": {
-        "BLOCK_SIZE_M": 64,
+    "4": {
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "24": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "32": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -75,7 +75,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -93,23 +93,23 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "256": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "512": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
@@ -118,27 +118,27 @@
     },
     "1024": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "1536": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "2048": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -147,7 +147,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -156,7 +156,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
diff --git a/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
index 4225c78eb..a5518d979 100644
--- a/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
+++ b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -1,63 +1,63 @@
 {
     "1": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "2": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "4": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "8": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "16": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "24": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "32": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -66,43 +66,43 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "64": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
-    "96": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
     "128": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "256": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -117,17 +117,17 @@
         "waves_per_eu": 0
     },
     "1024": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "1536": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 4,
@@ -138,7 +138,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
diff --git a/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
index 5e6789d00..44c67c008 100644
--- a/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
+++ b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -1,63 +1,63 @@
 {
     "1": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "2": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "4": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "8": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "16": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "24": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "32": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -66,7 +66,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -75,32 +75,32 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 8,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "96": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "128": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "256": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
@@ -109,7 +109,7 @@
     },
     "512": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
@@ -120,16 +120,16 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "1536": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -138,7 +138,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -147,7 +147,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -156,7 +156,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
diff --git a/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
index 49ac14d2a..09502f05c 100644
--- a/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
+++ b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -1,15 +1,15 @@
 {
     "1": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "2": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
@@ -18,7 +18,7 @@
         "waves_per_eu": 0
     },
     "4": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
@@ -27,16 +27,16 @@
         "waves_per_eu": 0
     },
     "8": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "16": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
@@ -45,7 +45,7 @@
         "waves_per_eu": 0
     },
     "24": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
@@ -54,28 +54,28 @@
         "waves_per_eu": 0
     },
     "32": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "48": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "64": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -84,7 +84,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -93,7 +93,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -117,8 +117,8 @@
         "waves_per_eu": 0
     },
     "1024": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 16,
         "num_warps": 4,
@@ -126,8 +126,8 @@
         "waves_per_eu": 0
     },
     "1536": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 8,
         "num_warps": 4,
@@ -136,27 +136,27 @@
     },
     "2048": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
-    "3072": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
     "4096": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
diff --git a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
index dcbb0efc5..7fd942e3e 100644
--- a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
+++ b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -1,15 +1,15 @@
 {
     "1": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "2": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
@@ -18,7 +18,7 @@
         "waves_per_eu": 0
     },
     "4": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
@@ -27,34 +27,34 @@
         "waves_per_eu": 0
     },
     "8": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "16": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "24": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "32": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
@@ -63,19 +63,19 @@
         "waves_per_eu": 0
     },
     "48": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "64": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -84,7 +84,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -93,7 +93,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -102,7 +102,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -111,32 +111,32 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 8,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "1024": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "1536": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "2048": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 16,
         "num_warps": 4,
@@ -144,19 +144,19 @@
         "waves_per_eu": 0
     },
     "3072": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "4096": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
index dfe5c1e43..72ab3dff2 100644
--- a/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -1,33 +1,33 @@
 {
     "1": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "2": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "4": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "8": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
@@ -36,7 +36,7 @@
         "waves_per_eu": 0
     },
     "16": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
@@ -45,19 +45,19 @@
         "waves_per_eu": 0
     },
     "24": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "32": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -66,14 +66,14 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "64": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
@@ -81,8 +81,8 @@
         "waves_per_eu": 0
     },
     "96": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
@@ -90,17 +90,17 @@
         "waves_per_eu": 0
     },
     "128": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 8,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "256": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
@@ -118,7 +118,7 @@
     },
     "1024": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
@@ -127,7 +127,7 @@
     },
     "1536": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
index a87f5de1b..37208577d 100644
--- a/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -1,6 +1,33 @@
 {
     "1": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 16,
@@ -8,17 +35,26 @@
         "num_stages": 2,
         "waves_per_eu": 0
     },
-    "2": {
-        "BLOCK_SIZE_M": 64,
+    "16": {
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
-    "4": {
-        "BLOCK_SIZE_M": 64,
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 16,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 4,
@@ -26,45 +62,9 @@
         "num_stages": 2,
         "waves_per_eu": 0
     },
-    "8": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
-    "16": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
-    "24": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
-    "32": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
     "48": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
@@ -72,8 +72,8 @@
         "waves_per_eu": 0
     },
     "64": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
@@ -81,8 +81,8 @@
         "waves_per_eu": 0
     },
     "96": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
@@ -91,7 +91,7 @@
     },
     "128": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
@@ -99,8 +99,8 @@
         "waves_per_eu": 0
     },
     "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
@@ -111,14 +111,14 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "1024": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
@@ -127,9 +127,9 @@
     },
     "1536": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -138,7 +138,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -147,7 +147,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -156,7 +156,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
index 468f9e78d..d8cc0f896 100644
--- a/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -1,61 +1,61 @@
 {
     "1": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "2": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
-    "4": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
     "8": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 4,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "16": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "24": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 8,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "32": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 16,
         "num_warps": 4,
@@ -64,59 +64,14 @@
     },
     "48": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_N": 32,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "64": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
-    "96": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 16,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
-    "128": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
-    "256": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
-    "512": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 4,
-        "num_stages": 2,
-        "waves_per_eu": 0
-    },
-    "1024": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
@@ -125,20 +80,65 @@
         "num_stages": 2,
         "waves_per_eu": 0
     },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
     "1536": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
     },
     "2048": {
-        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0
@@ -156,7 +156,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 4,
         "num_warps": 4,
         "num_stages": 2,
         "waves_per_eu": 0