Fix triton head num (#1482)

2024-09-21 10:25:20 +08:00
parent 014982b5e0
commit a68cb201dd
4 changed files with 54 additions and 1 deletions
--- a/python/sglang/srt/layers/attention_backend.py
+++ b/python/sglang/srt/layers/attention_backend.py
@@ -346,7 +346,9 @@ class TritonAttnBackend(AttentionBackend):

        self.decode_attention_fwd = decode_attention_fwd
        self.extend_attention_fwd = extend_attention_fwd
-        self.num_head = model_runner.model_config.num_attention_heads
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // model_runner.tp_size
+        )

        if global_server_args_dict.get("triton_attention_reduce_in_fp32", False):
            self.reduce_dtype = torch.float32
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -25,6 +25,7 @@ from sglang.utils import get_exception_traceback
 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"