[Feat] Add custom Embedding tensor model parallel (#2616)
Similar to #2309 , this PR introduces Embedding tensor model parallel to
reduce memory consumption. It supports both eager mode and
graph mode.
This PR also refactors the module tensor parallel configurations supported in
#2309, #2167, #2120, merging all of those configs into `finegrained_tp_config` in
`additional_config`, including:
`lmhead_tensor_parallel_size`
`oproj_tensor_parallel_size`
`embedding_tensor_parallel_size`
`mlp_tensor_parallel_size`
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: zzhx1 <zzh_201018@outlook.com>
Signed-off-by: zzhxx <zhangzihang23@mails.ucas.ac.cn>
Co-authored-by: zzhx1 <zzh_201018@outlook.com>
Co-authored-by: chenxiao <Jaychou1620@Gmail.com>
Co-authored-by: zzhxx <zhangzihang23@mails.ucas.ac.cn>
Co-authored-by: Jade Zheng <zheng.shoujian@outlook.com>
This commit is contained in:
@@ -715,15 +715,23 @@ def get_ascend_device_type():
|
||||
|
||||
|
||||
def lmhead_tp_enable() -> bool:
    """Return True when LM-head tensor model parallelism is enabled.

    Enabled iff ``finegrained_tp_config.lmhead_tensor_parallel_size`` in the
    ascend config is a positive integer (unset/disabled is represented as a
    non-positive value — presumably 0; confirm against the config defaults).
    """
    return get_ascend_config().finegrained_tp_config.lmhead_tensor_parallel_size > 0
def embedding_tp_enable() -> bool:
    """Return True when embedding tensor model parallelism is enabled.

    Enabled iff ``finegrained_tp_config.embedding_tensor_parallel_size`` in
    the ascend config is a positive integer.
    """
    return get_ascend_config().finegrained_tp_config.embedding_tensor_parallel_size > 0
def oproj_tp_enable() -> bool:
    """Return True when o-proj tensor model parallelism is enabled.

    Enabled iff ``finegrained_tp_config.oproj_tensor_parallel_size`` in the
    ascend config is a positive integer.
    """
    return get_ascend_config().finegrained_tp_config.oproj_tensor_parallel_size > 0
def mlp_tp_enable() -> bool:
    """Return True when MLP tensor model parallelism is enabled.

    Enabled iff ``finegrained_tp_config.mlp_tensor_parallel_size`` in the
    ascend config is a positive integer.

    NOTE(review): this replaces the former
    ``envs_ascend.VLLM_ASCEND_ENABLE_MLP_OPTIMIZE`` environment-variable
    switch — callers now opt in through ``additional_config`` instead.
    """
    return get_ascend_config().finegrained_tp_config.mlp_tensor_parallel_size > 0
def matmul_allreduce_enable() -> bool:
|
||||
@@ -971,7 +979,7 @@ def get_flashcomm2_config_and_validate(ascend_config, vllm_config):
|
||||
logger.warning_once(
|
||||
"It is recommended to enable FLASHCOMM1 simultaneously when starting FLASHCOMM2 for optimal performance."
|
||||
)
|
||||
if ascend_config.oproj_tensor_parallel_size is not None:
|
||||
if ascend_config.finegrained_tp_config.oproj_tensor_parallel_size > 0:
|
||||
raise AssertionError(
|
||||
"flashcomm2_oproj_tensor_parallel_size cannot be enabled simultaneously with oproj_tensor_parallel_size"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user