Fix sgl-kernel benchmark dead code (#11022)

This commit is contained in:
Xiaoyu Zhang
2025-09-29 15:06:40 +08:00
committed by GitHub
parent 71959545df
commit 11965b0daf
25 changed files with 1019 additions and 260 deletions

View File

@@ -1,13 +1,32 @@
import itertools
import os
import pytest
import torch
import triton
from sgl_kernel import topk_softmax
from vllm import _custom_ops as vllm_custom_ops
# Optional vLLM import
try:
from vllm import _custom_ops as vllm_custom_ops
VLLM_AVAILABLE = True
except ImportError:
vllm_custom_ops = None
VLLM_AVAILABLE = False
# Treat the run as CI when either the generic CI flag or the GitHub Actions
# flag is set to "true" (compared case-insensitively; unset counts as "false").
IS_CI = any(
    os.getenv(flag, "false").lower() == "true" for flag in ("CI", "GITHUB_ACTIONS")
)
def vllm_topk_softmax(gating_output, topk):
if not VLLM_AVAILABLE:
# Fall back to the SGLang implementation if vLLM is not available
return sglang_topk_softmax(gating_output, topk)
num_tokens, num_experts = gating_output.shape
topk_weights = torch.empty(
@@ -54,6 +73,10 @@ def calculate_diff(num_tokens, num_experts, topk):
weights_diff = torch.abs(weights_vllm - weights_sglang).mean().item()
indices_match = torch.equal(indices_vllm, indices_sglang)
if not VLLM_AVAILABLE:
print("⚠️ vLLM not available, skipping comparison")
return
if (
torch.allclose(weights_vllm, weights_sglang, atol=1e-3, rtol=1e-3)
and indices_match
@@ -65,21 +88,38 @@ def calculate_diff(num_tokens, num_experts, topk):
)
num_tokens_range = [128, 512, 1024, 2048, 4096, 8192, 16384, 32768]
num_experts_range = [32, 64, 128, 256, 12, 512]
topk_range = [1, 2, 4, 8]
# Choose the benchmark sweep: under CI each axis collapses to a single value,
# otherwise the full grid of token counts, expert counts and top-k is used.
num_tokens_range, num_experts_range, topk_range = (
    ([128], [32], [2])
    if IS_CI
    else (
        [128, 512, 1024, 2048, 4096, 8192, 16384, 32768],
        [32, 64, 128, 256, 12, 512],
        [1, 2, 4, 8],
    )
)

# Cartesian product of the three axes, in (num_tokens, num_experts, topk) order.
configs = list(
    itertools.product(
        num_tokens_range,
        num_experts_range,
        topk_range,
    )
)
# SGLang is always benchmarked; the vLLM series is appended only when the
# optional vLLM import succeeded.
line_vals = ["sglang"]
line_names = ["SGLang"]
styles = [("blue", "-")]
if VLLM_AVAILABLE:
    line_vals.append("vllm")
    line_names.append("VLLM")
    styles.append(("green", "-"))
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["num_tokens", "num_experts", "topk"],
x_vals=configs,
line_arg="provider",
line_vals=["sglang", "vllm"],
line_names=["SGLang", "VLLM"],
styles=[("blue", "-"), ("green", "-")],
line_vals=line_vals,
line_names=line_names,
styles=styles,
ylabel="Latency (us)",
plot_name="topk-softmax-performance",
args={},
@@ -92,6 +132,8 @@ def benchmark(num_tokens, num_experts, topk, provider):
)
if provider == "vllm" or provider == "vllm1":
if not VLLM_AVAILABLE:
return (0, 0, 0)
fn = lambda: vllm_topk_softmax(gating_output, topk)
elif provider == "sglang" or provider == "sglang1":
fn = lambda: sglang_topk_softmax(gating_output, topk)
@@ -103,14 +145,19 @@ def benchmark(num_tokens, num_experts, topk, provider):
if __name__ == "__main__":
configs = [
(20, 256, 4),
(20, 256, 8),
(20, 12, 4),
(20, 12, 1),
(20, 512, 4),
(20, 512, 1),
]
for num_tokens, num_experts, topk in configs:
# Correctness check inputs: a single (num_tokens, num_experts, topk) case
# under CI, the full list otherwise.
test_configs = (
    [(20, 32, 2)]
    if IS_CI
    else [
        (20, 256, 4),
        (20, 256, 8),
        (20, 12, 4),
        (20, 12, 1),
        (20, 512, 4),
        (20, 512, 1),
    ]
)
for case in test_configs:
    num_tokens, num_experts, topk = case
    calculate_diff(num_tokens, num_experts, topk)
benchmark.run(print_data=True)