[Feat] Add custom Embedding tensor model parallel (#2616)

Similar to #2309, this PR introduces Embedding tensor model parallelism to reduce memory consumption. It supports both eager mode and graph mode; a conceptual sketch of the sharding scheme follows.
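
Conceptually, the embedding table is sharded along the vocabulary dimension across a dedicated (and possibly smaller) process group, so each rank stores only a slice of the weights; token ids owned by other ranks are masked locally and the partial outputs are all-reduced. Below is a minimal PyTorch sketch of that scheme. The class name, group handling, and sizes are illustrative assumptions, not the code added by this PR.

```python
import torch
import torch.nn as nn
import torch.distributed as dist


class VocabShardedEmbedding(nn.Module):
    """Hedged sketch of a vocab-parallel embedding on an embedding-TP group."""

    def __init__(self, num_embeddings, embedding_dim, tp_rank, tp_size, group=None):
        super().__init__()
        self.group = group  # assumed process-group handle for the embedding TP group
        # Each rank stores only its contiguous slice of the vocabulary.
        per_rank = num_embeddings // tp_size
        self.vocab_start = tp_rank * per_rank
        self.vocab_end = self.vocab_start + per_rank
        self.weight = nn.Parameter(torch.empty(per_rank, embedding_dim))

    def forward(self, input_ids):
        # Token ids that live on other ranks are masked to a valid local index,
        # their outputs are zeroed, and the partial results are all-reduced so
        # every rank in the group ends up with the full embedding output.
        mask = (input_ids < self.vocab_start) | (input_ids >= self.vocab_end)
        local_ids = (input_ids - self.vocab_start).masked_fill(mask, 0)
        out = nn.functional.embedding(local_ids, self.weight)
        out = out.masked_fill(mask.unsqueeze(-1), 0.0)
        if dist.is_initialized():
            dist.all_reduce(out, group=self.group)
        return out
```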

This PR also refactors the per-module tensor parallel options introduced in #2309, #2167, and #2120, merging them into a single `finegrained_tp_config` entry under `additional_config` (a usage sketch follows the list):
- `lmhead_tensor_parallel_size`
- `oproj_tensor_parallel_size`
- `embedding_tensor_parallel_size`
- `mlp_tensor_parallel_size`
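
For reference, a hedged sketch of how these options would be passed through `additional_config` with the offline `LLM` API; the model name and the individual parallel sizes are illustrative placeholders, not values taken from this PR:

```python
from vllm import LLM

# Illustrative values only; choose sub-group sizes that fit your deployment.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model
    tensor_parallel_size=4,
    additional_config={
        "finegrained_tp_config": {
            "lmhead_tensor_parallel_size": 2,
            "oproj_tensor_parallel_size": 2,
            "embedding_tensor_parallel_size": 2,
            "mlp_tensor_parallel_size": 2,
        },
    },
)
```

The same dictionary can also be supplied on the command line via `--additional-config`.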

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: zzhx1 <zzh_201018@outlook.com>
Signed-off-by: zzhxx <zhangzihang23@mails.ucas.ac.cn>
Co-authored-by: zzhx1 <zzh_201018@outlook.com>
Co-authored-by: chenxiao <Jaychou1620@Gmail.com>
Co-authored-by: zzhxx <zhangzihang23@mails.ucas.ac.cn>
Co-authored-by: Jade Zheng <zheng.shoujian@outlook.com>
Author: lidenghui1110 (committed via GitHub), 2025-12-12 14:41:20 +08:00
Parent: b8a317caac
Commit: d65fb194d9
9 changed files with 301 additions and 162 deletions


@@ -12,15 +12,17 @@ from vllm_ascend.distributed.parallel_state import (
 @pytest.fixture
 def parallel_config():
-    return ParallelConfig(data_parallel_size=2,
-                          tensor_parallel_size=2,
-                          pipeline_parallel_size=2)
+    return ParallelConfig(
+        data_parallel_size=2,
+        tensor_parallel_size=4,
+        pipeline_parallel_size=2,
+    )
 @pytest.fixture
 def mock_distributed():
     with patch('torch.distributed.is_initialized', return_value=True), \
-            patch('torch.distributed.get_world_size', return_value=8), \
+            patch('torch.distributed.get_world_size', return_value=16), \
             patch('torch.distributed.get_backend', return_value='nccl'), \
             patch('vllm_ascend.distributed.parallel_state.get_world_group') as mock_group, \
             patch('vllm_ascend.distributed.parallel_state.get_tp_group') as mock_tp_group, \
@@ -36,8 +38,9 @@ def mock_distributed():
 def test_init_ascend_model_parallel(mock_distributed, parallel_config):
     mock_ascend_config = MagicMock()
-    mock_ascend_config.lmhead_tensor_parallel_size = 2
-    mock_ascend_config.oproj_tensor_parallel_size = 2
+    mock_ascend_config.finegrained_tp_config.lmhead_tensor_parallel_size = 2
+    mock_ascend_config.finegrained_tp_config.oproj_tensor_parallel_size = 2
+    mock_ascend_config.finegrained_tp_config.embedding_tensor_parallel_size = 2
     mock_ascend_config.flashcomm2_oproj_tensor_parallel_size = 2
     mock_ascend_config.pd_tp_ratio = 2
     mock_ascend_config.num_head_replica = 0


@@ -1,4 +1,3 @@
-import os
 import unittest
 from unittest import mock
 from unittest.mock import MagicMock, patch
@@ -26,7 +25,8 @@ class BaseLinearTest(unittest.TestCase):
         parallel_state._OTP = self.mock_group
         self.mock_ascend_config = MagicMock()
-        self.mock_ascend_config.oproj_tensor_parallel_size = 2
+        self.mock_ascend_config.finegrained_tp_config.oproj_tensor_parallel_size = 2
+        self.mock_ascend_config.finegrained_tp_config.mlp_tensor_parallel_size = 2
         self.patches = [
             patch("vllm_ascend.ascend_config.get_ascend_config",
@@ -81,7 +81,11 @@ class TestAscendUnquantizedLinearMethod(TestBase):
 class TestAscendRowParallelLinear(BaseLinearTest):
     def test_mlp_optimize(self):
-        os.environ["VLLM_ASCEND_ENABLE_MLP_OPTIMIZE"] = "1"
+        ascend_config._ASCEND_CONFIG = MagicMock()
+        ascend_config._ASCEND_CONFIG.recompute_scheduler_enable = False
+        ascend_config._ASCEND_CONFIG.finegrained_tp_config.mlp_tensor_parallel_size = 2
+        ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False
         linear = AscendRowParallelLinear(
             input_size=16,
@@ -98,8 +102,9 @@ class TestAscendRowParallelLinear(BaseLinearTest):
         config._current_vllm_config = MagicMock()
         ascend_config._ASCEND_CONFIG = MagicMock()
-        ascend_config._ASCEND_CONFIG.oproj_tensor_parallel_size = 2
+        ascend_config._ASCEND_CONFIG.finegrained_tp_config.oproj_tensor_parallel_size = 2
         ascend_config._ASCEND_CONFIG.recompute_scheduler_enable = False
+        ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False
         linear = AscendRowParallelLinear(
             input_size=16,
@@ -115,7 +120,11 @@ class TestAscendRowParallelLinear(BaseLinearTest):
 class TestAscendMergedColumnParallelLinear(BaseLinearTest):
     def test_merged_mlp_tp_init(self):
-        os.environ["VLLM_ASCEND_ENABLE_MLP_OPTIMIZE"] = "1"
+        ascend_config._ASCEND_CONFIG = MagicMock()
+        ascend_config._ASCEND_CONFIG.recompute_scheduler_enable = False
+        ascend_config._ASCEND_CONFIG.finegrained_tp_config.mlp_tensor_parallel_size = 2
+        ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False
         linear = AscendMergedColumnParallelLinear(
             input_size=16,


@@ -14,11 +14,12 @@
 # Adapted from vllm/tests/lora/test_layers.py
 import unittest
 from unittest import mock
 from unittest.mock import MagicMock, patch
 import torch
+from vllm_ascend.ascend_config import init_ascend_config
 from vllm_ascend.distributed import parallel_state
 from vllm_ascend.ops.vocab_parallel_embedding import (
     AscendLogitsProcessor, AscendParallelLMHead, AscendVocabParallelEmbedding)
@@ -32,9 +33,33 @@ class TestCustomVocabParallelEmbedding(unittest.TestCase):
         self.embedding_dim = 10
         self.org_num_embeddings = 40
         self.padding_size = 8
         self.mock_group = mock.MagicMock()
         self.mock_group.world_size = 2
         self.mock_group.rank_in_group = 0
+        parallel_state._MLP_TP = self.mock_group
+        parallel_state._OTP = self.mock_group
+        mock_vllm_config = MagicMock()
+        mock_vllm_config.additional_config = {}
+        init_ascend_config(mock_vllm_config)
+        self.mock_ascend_config = MagicMock()
+        self.mock_ascend_config.finegrained_tp_config.lmhead_tensor_parallel_size = 2
+        self.mock_ascend_config.finegrained_tp_config.embedding_tensor_parallel_size = 2
+        self.patches = [
+            patch("vllm_ascend.utils.get_ascend_config",
+                  return_value=self.mock_ascend_config),
+            patch("vllm_ascend.distributed.parallel_state.get_lmhead_tp_group",
+                  return_value=self.mock_group),
+            patch(
+                "vllm.distributed.parallel_state.get_tp_group",
+                return_value=self.mock_group,
+            ),
+        ]
+        for p in self.patches:
+            p.start()
     def _create_layer(self):
         # Patch methods and dependencies for VocabParallelEmbedding