From e34cf6ad75dd310b5f8a75d82b7325cc51da0f0f Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Sun, 27 Jul 2025 15:30:24 +0800 Subject: [PATCH] Fix bench script making input data on L2 cache (#7739) --- sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py b/sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py index c83172997..5a9248982 100644 --- a/sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py +++ b/sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py @@ -205,9 +205,9 @@ def benchmark(batch_size, seq_len, group_size, dst_dtype, provider): quantiles = [0.5, 0.2, 0.8] if provider == "triton": - fn = lambda: triton_per_token_group_quant_8bit(x.clone(), group_size, dst_dtype) + fn = lambda: triton_per_token_group_quant_8bit(x, group_size, dst_dtype) elif provider == "sglang": - fn = lambda: sglang_per_token_group_quant_8bit(x.clone(), group_size, dst_dtype) + fn = lambda: sglang_per_token_group_quant_8bit(x, group_size, dst_dtype) ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles)