feat: add DeepGEMM build warning (#5176)

Co-authored-by: grimoire <streetyao@live.com>
2025-04-08 21:16:23 -07:00
parent f2b70afde0
commit 6669d12707
1 changed files with 26 additions and 2 deletions
--- a/python/sglang/srt/layers/quantization/fp8_kernel.py
+++ b/python/sglang/srt/layers/quantization/fp8_kernel.py
@@ -16,6 +16,7 @@ import functools
 import json
 import logging
 import os
 from contextlib import contextmanager
 from typing import Any, Dict, List, Optional, Tuple
 import torch
@@ -59,6 +60,9 @@ if supports_custom_op():
        Bs: torch.Tensor,
        C: torch.Tensor,
    ) -> None:
        M, K = A.shape
        N, _ = B.shape
        with _log_jit_build(M, N, K):
            deep_gemm.gemm_fp8_fp8_bf16_nt((A, As), (B, Bs), C)
    def deep_gemm_fp8_fp8_bf16_nt_fake(
@@ -708,6 +712,25 @@ def get_w8a8_block_fp8_configs(
    return None
@contextmanager
 def _log_jit_build(M: int, N: int, K: int):
    from deep_gemm.jit.runtime import RuntimeCache
    origin_func = RuntimeCache.__getitem__
    def __patched_func(self, *args, **kwargs):
        ret = origin_func(self, *args, **kwargs)
        if ret is None:
            logger.warning(
                f"DeepGEMM JIT code generation <gemm_fp8_fp8_bf16_nt>: M={M}, N={N}, K={K}. Please wait."
            )
        return ret
    RuntimeCache.__getitem__ = __patched_func
    yield
    RuntimeCache.__getitem__ = origin_func
 def w8a8_block_fp8_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
@@ -782,6 +805,7 @@ def w8a8_block_fp8_matmul(
        if supports_custom_op():
            torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
        else:
            with _log_jit_build(M, N, K):
                deep_gemm.gemm_fp8_fp8_bf16_nt((A, As), (B, Bs), C)
    else:
        kernel = (