Enable MLA by default (#1447)

This commit is contained in:
Ke Bao
2024-09-17 19:42:48 +08:00
committed by GitHub
parent 90a26be31c
commit c6b6d2e71b
8 changed files with 16 additions and 18 deletions

View File

@@ -108,12 +108,12 @@ class ServerArgs:
disable_cuda_graph_padding: bool = False
disable_disk_cache: bool = False
disable_custom_all_reduce: bool = False
disable_mla: bool = False
enable_mixed_chunk: bool = False
enable_torch_compile: bool = False
max_torch_compile_bs: int = 32
torchao_config: str = ""
enable_p2p_check: bool = False
enable_mla: bool = False
triton_attention_reduce_in_fp32: bool = False
# LoRA
@@ -173,7 +173,7 @@ class ServerArgs:
self.sampling_backend = "pytorch"
# Default kernel backends
if self.enable_mla:
if not self.disable_mla:
            logger.info("MLA optimization is turned on. Use triton backend.")
self.attention_backend = "triton"
@@ -514,6 +514,11 @@ class ServerArgs:
default=False,
help="Disable the custom all-reduce kernel and fall back to NCCL.",
)
parser.add_argument(
"--disable-mla",
action="store_true",
help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
)
parser.add_argument(
"--enable-mixed-chunk",
action="store_true",
@@ -541,11 +546,6 @@ class ServerArgs:
action="store_true",
help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
)
parser.add_argument(
"--enable-mla",
action="store_true",
help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
)
parser.add_argument(
"--triton-attention-reduce-in-fp32",
action="store_true",