[Feat]: Add custom lmhead tensor model parallel (#2309)
### What this PR does / why we need it?
This PR introduces LM-head tensor model parallelism to reduce memory
consumption and improve TPOT performance. It supports both eager mode
and graph mode.
In a DeepSeek R1 w8a8 PD-disaggregated Decode instance using pure DP with
lmhead_tensor_parallel_size = 8, we observe a 1 ms TPOT improvement and
save 1.48 GB of NPU memory per rank.
performance data:
<img width="1444" height="438" alt="image"
src="https://github.com/user-attachments/assets/3c5ef0d3-a7c7-46fd-9797-4de728eb0cb0"
/>
### Does this PR introduce _any_ user-facing change?
This PR introduces one new config in `additional_config`.
| Name | Effect | Required | Type | Constraints |
| :---------------------------- | :--------------------------------------- | :------- | :--- | :----------------- |
| lmhead_tensor_parallel_size | Split the lm_head matrix along the column dimension (vocab_size) into lmhead_tensor_parallel_size pieces | No | int | The default value is None; once this value is set, the feature is enabled. vocab_size must be divisible by this value. |
Example:
`--additional_config={"lmhead_tensor_parallel_size": 8}`
### How was this patch tested?
- vLLM version: v0.10.1.1
- vLLM main:
de533ab2a1
---------
Signed-off-by: zzhx1 <zzh_201018@outlook.com>
Co-authored-by: zhangzihang <zzh_201018@outlook.com>
This commit is contained in:
@@ -33,6 +33,7 @@ from torch_npu.npu.streams import Event
|
||||
from vllm.logger import logger
|
||||
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import VllmConfig
|
||||
@@ -489,6 +490,9 @@ def register_ascend_customop():
|
||||
AscendMlpRowParallelLinear)
|
||||
from vllm_ascend.ops.rotary_embedding import (
|
||||
AscendDeepseekScalingRotaryEmbedding, AscendRotaryEmbedding)
|
||||
from vllm_ascend.ops.vocab_parallel_embedding import (
|
||||
AscendLogitsProcessor, AscendParallelLMHead,
|
||||
AscendVocabParallelEmbedding)
|
||||
CustomOp.register_oot(_decorated_op_cls=AscendQuickGELU, name="QuickGELU")
|
||||
CustomOp.register_oot(_decorated_op_cls=AscendSiluAndMul,
|
||||
name="SiluAndMul")
|
||||
@@ -497,6 +501,12 @@ def register_ascend_customop():
|
||||
CustomOp.register_oot(
|
||||
_decorated_op_cls=AscendDeepseekScalingRotaryEmbedding,
|
||||
name="DeepseekScalingRotaryEmbedding")
|
||||
CustomOp.register_oot(_decorated_op_cls=AscendVocabParallelEmbedding,
|
||||
name="VocabParallelEmbedding")
|
||||
CustomOp.register_oot(_decorated_op_cls=AscendParallelLMHead,
|
||||
name="ParallelLMHead")
|
||||
CustomOp.register_oot(_decorated_op_cls=AscendLogitsProcessor,
|
||||
name="LogitsProcessor")
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_MLP_OPTIMIZE:
|
||||
CustomOp.register_oot(_decorated_op_cls=AscendMlpColumnParallelLinear,
|
||||
name="ColumnParallelLinear")
|
||||
@@ -512,11 +522,6 @@ def register_ascend_customop():
|
||||
from vllm_ascend.ops.common_fused_moe import AscendFusedMoE
|
||||
CustomOp.register_oot(_decorated_op_cls=AscendFusedMoE, name="FusedMoE")
|
||||
|
||||
from vllm_ascend.ops.vocab_parallel_embedding import \
|
||||
AscendVocabParallelEmbedding
|
||||
CustomOp.register_oot(_decorated_op_cls=AscendVocabParallelEmbedding,
|
||||
name="VocabParallelEmbedding")
|
||||
|
||||
# NOTE: Keep this at last to ensure all custom actions are registered
|
||||
_ASCEND_CUSTOMOP_IS_REIGISTERED = True
|
||||
|
||||
@@ -547,3 +552,7 @@ def get_ascend_soc_version():
|
||||
global _ascend_soc_version
|
||||
assert _ascend_soc_version is not None
|
||||
return _ascend_soc_version
|
||||
|
||||
|
||||
def lmhead_tp_enable() -> bool:
    """Report whether LM-head tensor parallelism has been requested.

    Reads ``lmhead_tensor_parallel_size`` from the Ascend config returned
    by ``get_ascend_config()``.

    Returns:
        True when ``lmhead_tensor_parallel_size`` is anything other than
        ``None``; False when it is ``None``.
    """
    configured_size = get_ascend_config().lmhead_tensor_parallel_size
    # Only an explicit None means "off"; any set value enables the feature.
    return configured_size is not None
|
||||
|
||||
Reference in New Issue
Block a user