Fix per-token CUDA kernel when hidden dim is not divisible by 16 (#8543)

This commit is contained in:
Stefan He
2025-08-01 09:27:18 -07:00
committed by GitHub
parent 533cb5b274
commit db7343c992
3 changed files with 167 additions and 47 deletions

View File

@@ -36,7 +36,7 @@ def sglang_per_token_quant_fp8(
@pytest.mark.parametrize(
"num_tokens,hidden_dim",
list(itertools.product([128, 256, 512], [512, 2048, 4096])),
list(itertools.product([128, 256, 512], [512, 1368, 2048, 4096])),
)
def test_per_token_quant_compare_implementations(
num_tokens: int,