Add PDL support for quant kernel and rope kernel (#9106)

2025-08-20 16:56:29 +08:00
parent c9bf3877a0
commit 42c8704560
7 changed files with 80 additions and 33 deletions
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -635,6 +635,8 @@ def _set_envs_and_config(server_args: ServerArgs):
        os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
    os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
    os.environ["CUDA_MODULE_LOADING"] = "AUTO"
+    # FlashInfer reads this environment variable to enable PDL in various kernels, from MoE to quant kernels.
+    os.environ["TRTLLM_ENABLE_PDL"] = "1"

    # Set prometheus env vars
    if server_args.enable_metrics:
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -550,7 +550,6 @@ class ServerArgs:
            assert (
                self.quantization == "modelopt_fp4"
            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
-            os.environ["TRTLLM_ENABLE_PDL"] = "1"
            assert self.ep_size in [
                1,
                self.tp_size,