[Feat] Add custom Embedding tensor model parallel (#2616)
Similar to #2309, this PR introduces Embedding tensor model parallelism to reduce memory consumption. It supports both eager mode and graph mode.
This PR also refactors the module tensor parallel configurations introduced in #2309, #2167, and #2120, merging them all into `finegrained_tp_config` under `additional_config`. The consolidated keys are listed below (a usage sketch follows the list):
`lmhead_tensor_parallel_size`
`oproj_tensor_parallel_size`
`embedding_tensor_parallel_size`
`mlp_tensor_parallel_size`
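A minimal sketch of the consolidated configuration, assuming vLLM's `additional_config` engine argument (which vllm-ascend reads for platform-specific options); the model name and parallel sizes are illustrative assumptions, not recommended defaults:

```python
# Sketch: passing the consolidated fine-grained TP config through
# additional_config. All concrete values below are illustrative assumptions.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # assumed example model
    tensor_parallel_size=8,            # assumed global TP size
    additional_config={
        "finegrained_tp_config": {
            "lmhead_tensor_parallel_size": 2,
            "oproj_tensor_parallel_size": 2,
            "embedding_tensor_parallel_size": 2,
            "mlp_tensor_parallel_size": 2,
        },
    },
)
```

Per the PR description, the intent is to reduce memory consumption by sharding modules such as the embedding table at a finer granularity than the global tensor parallel group.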
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: zzhx1 <zzh_201018@outlook.com>
Signed-off-by: zzhxx <zhangzihang23@mails.ucas.ac.cn>
Co-authored-by: zzhx1 <zzh_201018@outlook.com>
Co-authored-by: chenxiao <Jaychou1620@Gmail.com>
Co-authored-by: zzhxx <zhangzihang23@mails.ucas.ac.cn>
Co-authored-by: Jade Zheng <zheng.shoujian@outlook.com>
```diff
@@ -12,15 +12,17 @@ from vllm_ascend.distributed.parallel_state import (
 
 @pytest.fixture
 def parallel_config():
-    return ParallelConfig(data_parallel_size=2,
-                          tensor_parallel_size=2,
-                          pipeline_parallel_size=2)
+    return ParallelConfig(
+        data_parallel_size=2,
+        tensor_parallel_size=4,
+        pipeline_parallel_size=2,
+    )
 
 
 @pytest.fixture
 def mock_distributed():
     with patch('torch.distributed.is_initialized', return_value=True), \
-            patch('torch.distributed.get_world_size', return_value=8), \
+            patch('torch.distributed.get_world_size', return_value=16), \
             patch('torch.distributed.get_backend', return_value='nccl'), \
             patch('vllm_ascend.distributed.parallel_state.get_world_group') as mock_group, \
             patch('vllm_ascend.distributed.parallel_state.get_tp_group') as mock_tp_group, \
@@ -36,8 +38,9 @@ def mock_distributed():
 
 def test_init_ascend_model_parallel(mock_distributed, parallel_config):
     mock_ascend_config = MagicMock()
-    mock_ascend_config.lmhead_tensor_parallel_size = 2
-    mock_ascend_config.oproj_tensor_parallel_size = 2
+    mock_ascend_config.finegrained_tp_config.lmhead_tensor_parallel_size = 2
+    mock_ascend_config.finegrained_tp_config.oproj_tensor_parallel_size = 2
+    mock_ascend_config.finegrained_tp_config.embedding_tensor_parallel_size = 2
     mock_ascend_config.flashcomm2_oproj_tensor_parallel_size = 2
     mock_ascend_config.pd_tp_ratio = 2
     mock_ascend_config.num_head_replica = 0
```