[Feat] Add custom Embedding tensor model parallel (#2616)

Similar to #2309, this PR introduces Embedding tensor model parallelism to reduce memory consumption. It supports both eager mode and graph mode.
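
For intuition on the memory saving, here is a back-of-the-envelope sketch assuming the embedding weight is sharded along the vocab dimension across the embedding TP group (all sizes below are illustrative, not taken from this PR):

```python
# Illustrative arithmetic only: an fp16 embedding table split across
# `etp` ranks along the vocab dimension.
vocab_size, hidden_size, etp = 151_936, 4096, 4  # hypothetical sizes
bytes_per_param = 2  # fp16

full_mib = vocab_size * hidden_size * bytes_per_param / 2**20
per_rank_mib = full_mib / etp
print(f"full table: {full_mib:.0f} MiB; per rank at etp={etp}: {per_rank_mib:.0f} MiB")
```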

This PR also refactors the per-module tensor parallel configurations introduced in #2309, #2167, and #2120, merging them all into a single `finegrained_tp_config` inside `additional_config`. It covers the following keys (a usage sketch follows the list):

- `lmhead_tensor_parallel_size`
- `oproj_tensor_parallel_size`
- `embedding_tensor_parallel_size`
- `mlp_tensor_parallel_size`
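
A minimal sketch of how the merged config might be passed, assuming `additional_config` is forwarded to the engine as in recent vLLM releases; the model name and the size values are illustrative, and each size must divide the overall world size:

```python
from vllm import LLM

# Hypothetical example: all four finegrained TP knobs set through the
# single merged `finegrained_tp_config` key in `additional_config`.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # illustrative model choice
    tensor_parallel_size=8,
    additional_config={
        "finegrained_tp_config": {
            "lmhead_tensor_parallel_size": 2,
            "oproj_tensor_parallel_size": 2,
            "embedding_tensor_parallel_size": 2,
            "mlp_tensor_parallel_size": 2,
        },
    },
)
```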

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: zzhx1 <zzh_201018@outlook.com>
Signed-off-by: zzhxx <zhangzihang23@mails.ucas.ac.cn>
Co-authored-by: zzhx1 <zzh_201018@outlook.com>
Co-authored-by: chenxiao <Jaychou1620@Gmail.com>
Co-authored-by: zzhxx <zhangzihang23@mails.ucas.ac.cn>
Co-authored-by: Jade Zheng <zheng.shoujian@outlook.com>


```diff
@@ -715,15 +715,23 @@ def get_ascend_device_type():
 def lmhead_tp_enable() -> bool:
-    return get_ascend_config().lmhead_tensor_parallel_size is not None
+    return get_ascend_config(
+    ).finegrained_tp_config.lmhead_tensor_parallel_size > 0
+
+
+def embedding_tp_enable() -> bool:
+    return get_ascend_config(
+    ).finegrained_tp_config.embedding_tensor_parallel_size > 0
 
 
 def oproj_tp_enable() -> bool:
-    return get_ascend_config().oproj_tensor_parallel_size is not None
+    return get_ascend_config(
+    ).finegrained_tp_config.oproj_tensor_parallel_size > 0
 
 
 def mlp_tp_enable() -> bool:
-    return envs_ascend.VLLM_ASCEND_ENABLE_MLP_OPTIMIZE
+    return get_ascend_config(
+    ).finegrained_tp_config.mlp_tensor_parallel_size > 0
 
 
 def matmul_allreduce_enable() -> bool:
```
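
Each helper above now keys off an integer size in `finegrained_tp_config` rather than an `Optional` field or an environment variable. A standalone sketch of that gating pattern, assuming the config dataclass defaults every size to 0 when unset (the dataclass itself is outside this hunk):

```python
from dataclasses import dataclass


@dataclass
class FinegrainedTPConfig:
    # Assumed defaults: 0 means "disabled", which is what makes the
    # `> 0` checks in the diff above work as enable flags.
    lmhead_tensor_parallel_size: int = 0
    oproj_tensor_parallel_size: int = 0
    embedding_tensor_parallel_size: int = 0
    mlp_tensor_parallel_size: int = 0


def embedding_tp_enable(cfg: FinegrainedTPConfig) -> bool:
    # Mirrors the new helper: enabled iff the user set a positive size.
    return cfg.embedding_tensor_parallel_size > 0
```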
```diff
@@ -971,7 +979,7 @@ def get_flashcomm2_config_and_validate(ascend_config, vllm_config):
         logger.warning_once(
             "It is recommended to enable FLASHCOMM1 simultaneously when starting FLASHCOMM2 for optimal performance."
         )
-    if ascend_config.oproj_tensor_parallel_size is not None:
+    if ascend_config.finegrained_tp_config.oproj_tensor_parallel_size > 0:
         raise AssertionError(
             "flashcomm2_oproj_tensor_parallel_size cannot be enabled simultaneously with oproj_tensor_parallel_size"
         )
```
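
Since FLASHCOMM2 carries its own o_proj tensor parallel size, the two o_proj TP paths are mutually exclusive. The guard in this hunk, extracted into a standalone sketch (the function name and arguments are hypothetical; only the check itself mirrors the diff):

```python
def validate_oproj_tp(flashcomm2_oproj_tp_size: int, oproj_tp_size: int) -> None:
    # Hypothetical standalone form of the validation above: reject configs
    # that enable both FLASHCOMM2's o_proj TP and the finegrained o_proj TP.
    if flashcomm2_oproj_tp_size > 0 and oproj_tp_size > 0:
        raise AssertionError(
            "flashcomm2_oproj_tensor_parallel_size cannot be enabled "
            "simultaneously with oproj_tensor_parallel_size")
```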