Allow disabling flashinfer sampling kernel (#778)

This commit is contained in:
Lianmin Zheng
2024-07-27 20:18:56 -07:00
committed by GitHub
parent 30db99b3d9
commit 752e643007
6 changed files with 41 additions and 26 deletions

View File

@@ -52,13 +52,14 @@ class ServerArgs:
# Optimization/debug options
disable_flashinfer: bool = False
disable_flashinfer_sampling: bool = False
disable_radix_cache: bool = False
disable_regex_jump_forward: bool = False
disable_cuda_graph: bool = False
disable_disk_cache: bool = False
enable_torch_compile: bool = False
attention_reduce_in_fp32: bool = False
enable_p2p_check: bool = False
attention_reduce_in_fp32: bool = False
efficient_weight_load: bool = False
# Distributed args
@@ -303,7 +304,12 @@ class ServerArgs:
parser.add_argument(
"--disable-flashinfer",
action="store_true",
help="Disable flashinfer inference kernels.",
help="Disable flashinfer attention kernels.",
)
parser.add_argument(
"--disable-flashinfer-sampling",
action="store_true",
help="Disable flashinfer sampling kernels.",
)
parser.add_argument(
"--disable-radix-cache",
@@ -330,17 +336,17 @@ class ServerArgs:
action="store_true",
help="Optimize the model with torch.compile, experimental feature.",
)
parser.add_argument(
"--enable-p2p-check",
action="store_true",
help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
)
parser.add_argument(
"--attention-reduce-in-fp32",
action="store_true",
help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16. "
"This only affects Triton attention kernels.",
)
parser.add_argument(
"--enable-p2p-check",
action="store_true",
help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
)
parser.add_argument(
"--efficient-weight-load",
action="store_true",