[Feat] Add custom Embedding tensor model parallel (#2616)
Similar to #2309, this PR introduces embedding tensor model parallelism to
reduce memory consumption. It supports both eager mode and
graph mode.
This PR also refactors the per-module tensor parallel configurations supported in
#2309, #2167, and #2120, merging all of them into `finegrained_tp_config` in
`additional_config`, including:
`lmhead_tensor_parallel_size`
`oproj_tensor_parallel_size`
`embedding_tensor_parallel_size`
`mlp_tensor_parallel_size`
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: zzhx1 <zzh_201018@outlook.com>
Signed-off-by: zzhxx <zhangzihang23@mails.ucas.ac.cn>
Co-authored-by: zzhx1 <zzh_201018@outlook.com>
Co-authored-by: chenxiao <Jaychou1620@Gmail.com>
Co-authored-by: zzhxx <zhangzihang23@mails.ucas.ac.cn>
Co-authored-by: Jade Zheng <zheng.shoujian@outlook.com>
This commit is contained in:
@@ -1,4 +1,3 @@
|
||||
import os
|
||||
import unittest
|
||||
from unittest import mock
|
||||
from unittest.mock import MagicMock, patch
|
||||
@@ -26,7 +25,8 @@ class BaseLinearTest(unittest.TestCase):
|
||||
parallel_state._OTP = self.mock_group
|
||||
|
||||
self.mock_ascend_config = MagicMock()
|
||||
self.mock_ascend_config.oproj_tensor_parallel_size = 2
|
||||
self.mock_ascend_config.finegrained_tp_config.oproj_tensor_parallel_size = 2
|
||||
self.mock_ascend_config.finegrained_tp_config.mlp_tensor_parallel_size = 2
|
||||
|
||||
self.patches = [
|
||||
patch("vllm_ascend.ascend_config.get_ascend_config",
|
||||
@@ -81,7 +81,11 @@ class TestAscendUnquantizedLinearMethod(TestBase):
|
||||
class TestAscendRowParallelLinear(BaseLinearTest):
|
||||
|
||||
def test_mlp_optimize(self):
|
||||
os.environ["VLLM_ASCEND_ENABLE_MLP_OPTIMIZE"] = "1"
|
||||
|
||||
ascend_config._ASCEND_CONFIG = MagicMock()
|
||||
ascend_config._ASCEND_CONFIG.recompute_scheduler_enable = False
|
||||
ascend_config._ASCEND_CONFIG.finegrained_tp_config.mlp_tensor_parallel_size = 2
|
||||
ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False
|
||||
|
||||
linear = AscendRowParallelLinear(
|
||||
input_size=16,
|
||||
@@ -98,8 +102,9 @@ class TestAscendRowParallelLinear(BaseLinearTest):
|
||||
config._current_vllm_config = MagicMock()
|
||||
|
||||
ascend_config._ASCEND_CONFIG = MagicMock()
|
||||
ascend_config._ASCEND_CONFIG.oproj_tensor_parallel_size = 2
|
||||
ascend_config._ASCEND_CONFIG.recompute_scheduler_enable = False
|
||||
ascend_config._ASCEND_CONFIG.finegrained_tp_config.oproj_tensor_parallel_size = 2
|
||||
ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False
|
||||
|
||||
linear = AscendRowParallelLinear(
|
||||
input_size=16,
|
||||
@@ -115,7 +120,11 @@ class TestAscendRowParallelLinear(BaseLinearTest):
|
||||
class TestAscendMergedColumnParallelLinear(BaseLinearTest):
|
||||
|
||||
def test_merged_mlp_tp_init(self):
|
||||
os.environ["VLLM_ASCEND_ENABLE_MLP_OPTIMIZE"] = "1"
|
||||
|
||||
ascend_config._ASCEND_CONFIG = MagicMock()
|
||||
ascend_config._ASCEND_CONFIG.recompute_scheduler_enable = False
|
||||
ascend_config._ASCEND_CONFIG.finegrained_tp_config.mlp_tensor_parallel_size = 2
|
||||
ascend_config._ASCEND_CONFIG.ascend_scheduler_config.enabled = False
|
||||
|
||||
linear = AscendMergedColumnParallelLinear(
|
||||
input_size=16,
|
||||
|
||||
@@ -14,11 +14,12 @@
|
||||
# Adapted from vllm/tests/lora/test_layers.py
|
||||
|
||||
import unittest
|
||||
from unittest import mock
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import torch
|
||||
|
||||
from vllm_ascend.ascend_config import init_ascend_config
|
||||
from vllm_ascend.distributed import parallel_state
|
||||
from vllm_ascend.ops.vocab_parallel_embedding import (
|
||||
AscendLogitsProcessor, AscendParallelLMHead, AscendVocabParallelEmbedding)
|
||||
|
||||
@@ -32,9 +33,33 @@ class TestCustomVocabParallelEmbedding(unittest.TestCase):
|
||||
self.embedding_dim = 10
|
||||
self.org_num_embeddings = 40
|
||||
self.padding_size = 8
|
||||
|
||||
self.mock_group = mock.MagicMock()
|
||||
self.mock_group.world_size = 2
|
||||
self.mock_group.rank_in_group = 0
|
||||
|
||||
parallel_state._MLP_TP = self.mock_group
|
||||
parallel_state._OTP = self.mock_group
|
||||
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.additional_config = {}
|
||||
init_ascend_config(mock_vllm_config)
|
||||
self.mock_ascend_config = MagicMock()
|
||||
self.mock_ascend_config.finegrained_tp_config.lmhead_tensor_parallel_size = 2
|
||||
self.mock_ascend_config.finegrained_tp_config.embedding_tensor_parallel_size = 2
|
||||
|
||||
self.patches = [
|
||||
patch("vllm_ascend.utils.get_ascend_config",
|
||||
return_value=self.mock_ascend_config),
|
||||
patch("vllm_ascend.distributed.parallel_state.get_lmhead_tp_group",
|
||||
return_value=self.mock_group),
|
||||
patch(
|
||||
"vllm.distributed.parallel_state.get_tp_group",
|
||||
return_value=self.mock_group,
|
||||
),
|
||||
]
|
||||
|
||||
for p in self.patches:
|
||||
p.start()
|
||||
|
||||
def _create_layer(self):
|
||||
# Patch methods and dependencies for VocabParallelEmbedding
|
||||
|
||||
Reference in New Issue
Block a user