Support MLA for DeepSeek-V2 with Triton - step 1 (#905)
This commit is contained in:
@@ -80,6 +80,7 @@ class ServerArgs:
|
||||
disable_disk_cache: bool = False
|
||||
enable_torch_compile: bool = False
|
||||
enable_p2p_check: bool = False
|
||||
enable_mla: bool = False
|
||||
attention_reduce_in_fp32: bool = False
|
||||
efficient_weight_load: bool = False
|
||||
|
||||
@@ -393,6 +394,11 @@ class ServerArgs:
|
||||
action="store_true",
|
||||
help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-mla",
|
||||
action="store_true",
|
||||
help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--attention-reduce-in-fp32",
|
||||
action="store_true",
|
||||
|
||||
Reference in New Issue
Block a user