@@ -1,12 +1,10 @@
 import itertools
-import os
 import time
 from functools import partial
 from pathlib import Path
 
 import torch
 import triton
-from sgl_kernel.test_utils import create_per_token_group_quant_test_data
 
 from sglang.srt.bench_utils import bench_kineto
 from sglang.srt.layers.quantization.fp8_kernel import (
@@ -21,231 +19,78 @@ from sglang.srt.utils import is_hip
 _is_hip = is_hip()
 fp8_type_ = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
 
-mode_concentrated = os.environ.get("SGLANG_BENCH_MODE", "") == "concentrated"
-
-if int(os.environ.get("SGLANG_NSYS_PROFILING", "0")):
-    # configs = [[
-    #     768,
-    #     16384,
-    #     128,
-    #     None,
-    #     fp8_type_,
-    #     dict(
-    #         column_major_scales=True,
-    #         scale_tma_aligned=True,
-    #         scale_ue8m0=True,
-    #         fuse_silu_and_mul=False,
-    #         masked_layout_mode=None,
-    #     ),
-    # ]]
-    configs = [
-        [
-            768 * 8,
-            2048,
-            128,
-            48,
-            fp8_type_,
-            dict(
-                column_major_scales=True,
-                scale_tma_aligned=True,
-                scale_ue8m0=True,
-                fuse_silu_and_mul=True,
-                # masked_layout_mode=None,
-                masked_layout_mode="balanced",
-                # masked_layout_mode="extreme",
-            ),
-        ]
-    ]
-elif mode_concentrated:
-    configs = list(
-        itertools.product(
-            [768],
-            [1536, 7168, 16384],
-            [128],
-            [None],
-            [fp8_type_],
-            [
-                dict(
-                    column_major_scales=True,
-                    scale_tma_aligned=True,
-                    scale_ue8m0=True,
-                    fuse_silu_and_mul=False,
-                    masked_layout_mode=None,
-                ),
-            ],
-        )
-    ) + list(
-        itertools.product(
-            [768 * 8],
-            [2048],
-            [128],
-            [48],
-            [fp8_type_],
-            [
-                dict(
-                    column_major_scales=True,
-                    scale_tma_aligned=True,
-                    scale_ue8m0=True,
-                    fuse_silu_and_mul=True,
-                    masked_layout_mode=None,
-                ),
-                dict(
-                    column_major_scales=True,
-                    scale_tma_aligned=True,
-                    scale_ue8m0=True,
-                    fuse_silu_and_mul=True,
-                    masked_layout_mode="balanced",
-                ),
-                dict(
-                    column_major_scales=True,
-                    scale_tma_aligned=True,
-                    scale_ue8m0=True,
-                    fuse_silu_and_mul=True,
-                    masked_layout_mode="imbalanced",
-                ),
-                dict(
-                    column_major_scales=True,
-                    scale_tma_aligned=True,
-                    scale_ue8m0=True,
-                    fuse_silu_and_mul=True,
-                    masked_layout_mode="extreme",
-                ),
-            ],
-        )
-    )
-else:
-    configs = list(
-        itertools.product(
-            [1, 4, 16, 64, 256, 768, 2048, 8192, 16384],
-            [1536, 7168, 16384],
-            [128],
-            [None],
-            [fp8_type_],
-            [
-                dict(
-                    column_major_scales=False,
-                    scale_tma_aligned=False,
-                    scale_ue8m0=False,
-                    fuse_silu_and_mul=False,
-                    masked_layout_mode=None,
-                ),
-                dict(
-                    column_major_scales=True,
-                    scale_tma_aligned=False,
-                    scale_ue8m0=False,
-                    fuse_silu_and_mul=False,
-                    masked_layout_mode=None,
-                ),
-                dict(
-                    column_major_scales=True,
-                    scale_tma_aligned=True,
-                    scale_ue8m0=False,
-                    fuse_silu_and_mul=False,
-                    masked_layout_mode=None,
-                ),
-                dict(
-                    column_major_scales=True,
-                    scale_tma_aligned=True,
-                    scale_ue8m0=True,
-                    fuse_silu_and_mul=False,
-                    masked_layout_mode=None,
-                ),
-            ],
-        )
-    ) + list(
-        itertools.product(
-            [1 * 8, 4 * 8, 64 * 8, 256 * 8, 768 * 8],
-            [2048],
-            [128],
-            [8, 16, 32, 48],
-            [fp8_type_],
-            [
-                dict(
-                    column_major_scales=True,
-                    scale_tma_aligned=True,
-                    scale_ue8m0=True,
-                    fuse_silu_and_mul=True,
-                    masked_layout_mode=None,
-                ),
-                dict(
-                    column_major_scales=True,
-                    scale_tma_aligned=True,
-                    scale_ue8m0=True,
-                    fuse_silu_and_mul=True,
-                    masked_layout_mode="balanced",
-                ),
-                dict(
-                    column_major_scales=True,
-                    scale_tma_aligned=True,
-                    scale_ue8m0=True,
-                    fuse_silu_and_mul=True,
-                    masked_layout_mode="imbalanced",
-                ),
-                dict(
-                    column_major_scales=True,
-                    scale_tma_aligned=True,
-                    scale_ue8m0=True,
-                    fuse_silu_and_mul=True,
-                    masked_layout_mode="extreme",
-                ),
-            ],
-        )
-    )
+num_tokens_range = [1, 4, 16, 64, 256, 768, 2048, 8192, 16384]
+hidden_dim_range = [1536, 7168, 18432]  # For DeepSeek V3/R1
+group_size_range = [128]  # For DeepSeek V3/R1
+# TODO test int8
+dst_dtype_range = [fp8_type_]
+flags_range = [
+    dict(
+        column_major_scales=False,
+        scale_tma_aligned=False,
+        scale_ue8m0=False,
+    ),
+    dict(
+        column_major_scales=True,
+        scale_tma_aligned=False,
+        scale_ue8m0=False,
+    ),
+    dict(
+        column_major_scales=True,
+        scale_tma_aligned=True,
+        scale_ue8m0=False,
+    ),
+    dict(
+        column_major_scales=True,
+        scale_tma_aligned=True,
+        scale_ue8m0=True,
+    ),
+]
+
+configs = list(
+    itertools.product(
+        num_tokens_range,
+        hidden_dim_range,
+        group_size_range,
+        dst_dtype_range,
+        flags_range,
+    )
+)
 
 
 @triton.testing.perf_report(
     triton.testing.Benchmark(
-        x_names=[
-            "num_tokens",
-            "hidden_dim",
-            "group_size",
-            "num_ranks",
-            "dst_dtype",
-            "flags",
-        ],
+        x_names=["num_tokens", "hidden_dim", "group_size", "dst_dtype", "flags"],
         x_vals=configs,
         line_arg="provider",
         line_vals=["triton", "sglang"],
-        # Triton has multi kernels and we only report the time for the core one
-        line_names=["Triton (Inaccurate)", "SGL Kernel"],
+        line_names=["Triton", "SGL Kernel"],
         styles=[("blue", "-"), ("green", "-")],
         ylabel="us",
         plot_name="per-token-group-quant-8bit-performance",
         args={},
     )
 )
-def benchmark(
-    num_tokens, hidden_dim, group_size, num_ranks, dst_dtype, flags, provider
-):
-    print(
-        f"Testing: {num_tokens=} {hidden_dim=} {group_size=} {num_ranks=} {dst_dtype=} {flags=} {provider=}"
-    )
+def benchmark(num_tokens, hidden_dim, group_size, dst_dtype, flags, provider):
     if flags["scale_ue8m0"] and group_size != 128:
         return
 
-    x, masked_m = create_per_token_group_quant_test_data(
-        num_tokens=num_tokens, hidden_dim=hidden_dim, num_ranks=num_ranks, flags=flags
-    )
+    device = torch.device("cuda")
+
+    x = torch.randn(num_tokens, hidden_dim, device=device, dtype=torch.bfloat16)
 
     fn, kernel_names = {
-        "triton": (
-            triton_per_token_group_quant_8bit,
-            "_per_token_group_quant_8bit|_silu_and_mul_post_quant_kernel",
-        ),
+        "triton": (triton_per_token_group_quant_8bit, "_per_token_group_quant_fp8"),
         "sglang": (
            sglang_per_token_group_quant_8bit,
            "per_token_group_quant_8bit_kernel",
        ),
     }[provider]
-    bench_fn = lambda: fn(
-        x=x,
-        masked_m=masked_m,
-        group_size=group_size,
-        dst_dtype=dst_dtype,
-        **{k: v for k, v in flags.items() if k not in ["masked_layout_mode"]},
-    )
+    bench_fn = lambda: fn(x=x, group_size=group_size, dst_dtype=dst_dtype, **flags)
 
-    time_s = bench_kineto(
-        bench_fn, kernel_names=kernel_names, num_tests=300 if mode_concentrated else 30
-    )
+    time_s = bench_kineto(bench_fn, kernel_names=kernel_names)
     return time_s * 1e6
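
Note: the file's `__main__` entry point is not part of this diff, so the commit does not show how the benchmark is launched. As a sketch only, a `triton.testing.perf_report` benchmark like the one above is normally driven through the `run` method the decorator attaches to `benchmark`; the output directory name below is illustrative, not taken from the repository.

# Minimal driver sketch (not part of the diff): `benchmark.run` sweeps every
# tuple in `x_vals` and reports the returned per-provider timings in microseconds.
if __name__ == "__main__":
    benchmark.run(
        print_data=True,  # print the timing table to stdout
        show_plots=False,  # skip interactive plots
        save_path="bench_per_token_group_quant_8bit_res",  # illustrative output dir
    )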