Support MLA for DeepSeek-V2 with Triton - step 1 (#905)

2024-08-05 01:40:33 +08:00
parent f4d9953d9d
commit e1eae1fd15
10 changed files with 439 additions and 78 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -80,6 +80,7 @@ class ServerArgs:
    disable_disk_cache: bool = False
    enable_torch_compile: bool = False
    enable_p2p_check: bool = False
+    enable_mla: bool = False
    attention_reduce_in_fp32: bool = False
    efficient_weight_load: bool = False

@@ -393,6 +394,11 @@ class ServerArgs:
            action="store_true",
            help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
        )
+        parser.add_argument(
+            "--enable-mla",
+            action="store_true",
+            help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2",
+        )
        parser.add_argument(
            "--attention-reduce-in-fp32",
            action="store_true",