Add GemLite caching after each capture (#2669)

2024-12-30 14:27:29 +01:00
parent 9c6ba2484f
commit a29dd9501d
2 changed files with 21 additions and 3 deletions
--- a/python/sglang/srt/layers/torchao_utils.py
+++ b/python/sglang/srt/layers/torchao_utils.py
@@ -11,6 +11,22 @@ import torch
 logger = logging.getLogger(__name__)
 def get_gemlite_cache_path() -> str:
    return f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
 def save_gemlite_cache(print_error: bool = False) -> bool:
    try:
        from gemlite.core import GemLiteLinearTriton
        GemLiteLinearTriton.cache_config(get_gemlite_cache_path())
    except Exception:
        if print_error:
            logger.error("Failed to save the GemLite cache.")
        return False
    return True
 def apply_torchao_config_to_model(
    model: torch.nn.Module, torchao_config: str, filter_fn=None
 ):
@@ -74,9 +90,7 @@ def apply_torchao_config_to_model(
        )
        # try to load gemlite kernel config
-        GemLiteLinearTriton.load_config(
+        GemLiteLinearTriton.load_config(get_gemlite_cache_path())
            f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
        )
    elif "fp8wo" in torchao_config:
        # this requires newer hardware
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -31,6 +31,7 @@ from sglang.srt.layers.logits_processor import (
    LogitsProcessorOutput,
 )
 from sglang.srt.layers.moe.fused_moe_native import fused_moe_forward_native
 from sglang.srt.layers.torchao_utils import save_gemlite_cache
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.utils import maybe_torch_compile, monkey_patch_vllm_all_gather
@@ -276,6 +277,9 @@ class CudaGraphRunner:
                    self.graphs[bs] = graph
                    self.output_buffers[bs] = output_buffers
                # Save gemlite cache after each capture
                save_gemlite_cache()
    def capture_one_batch_size(self, bs: int, forward: Callable):
        graph = torch.cuda.CUDAGraph()
        stream = self.stream