[Bugfix] Add constraints for sequence parallelism (#4014)
### What this PR does / why we need it?
Add constraints for sequence parallelism so that unsupported configurations are rejected. When sequence parallelism is enabled, the following must hold:
1. tp_size > 1
2. enable_expert_parallel must be True for MoE models
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main:
83f478bb19
---------
Signed-off-by: realliujiaxu <realliujiaxu@163.com>
This commit is contained in:
@@ -115,12 +115,10 @@ def set_ascend_forward_context(
|
|||||||
# the performance may degrade due to the switching of communication methods.
|
# the performance may degrade due to the switching of communication methods.
|
||||||
mmrs_fusion = True
|
mmrs_fusion = True
|
||||||
if is_moe_model(vllm_config):
|
if is_moe_model(vllm_config):
|
||||||
sp_enabled = enable_sp(vllm_config) and \
|
sp_enabled = enable_sp(vllm_config) and num_tokens is not None
|
||||||
tp_world_size > 1 and num_tokens is not None
|
|
||||||
mmrs_fusion = False
|
mmrs_fusion = False
|
||||||
else:
|
else:
|
||||||
sp_enabled = enable_sp(vllm_config) and \
|
sp_enabled = enable_sp(vllm_config) and \
|
||||||
tp_world_size > 1 and \
|
|
||||||
num_tokens is not None and num_tokens > 1000
|
num_tokens is not None and num_tokens > 1000
|
||||||
forward_context.mmrs_fusion = mmrs_fusion
|
forward_context.mmrs_fusion = mmrs_fusion
|
||||||
|
|
||||||
|
|||||||
@@ -659,6 +659,17 @@ def enable_sp(vllm_config=None) -> bool:
|
|||||||
# We retain the env VLLM_ASCEND_ENABLE_FLASHCOMM here for backward compatibility.
|
# We retain the env VLLM_ASCEND_ENABLE_FLASHCOMM here for backward compatibility.
|
||||||
or bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM", '0'))))
|
or bool(int(os.getenv("VLLM_ASCEND_ENABLE_FLASHCOMM", '0'))))
|
||||||
|
|
||||||
|
if not _ENABLE_SP:
|
||||||
|
return _ENABLE_SP
|
||||||
|
|
||||||
|
assert vllm_config.parallel_config.tensor_parallel_size > 1, \
|
||||||
|
"Flash Comm v1 (Sequence Parallelism) is only supported when tp_size > 1."
|
||||||
|
|
||||||
|
assert (
|
||||||
|
not is_moe_model(vllm_config)
|
||||||
|
or vllm_config.parallel_config.enable_expert_parallel
|
||||||
|
), "Flash Comm v1 (Sequence Parallelism) requires enable_expert_parallel=True for MoE models."
|
||||||
|
|
||||||
return _ENABLE_SP
|
return _ENABLE_SP
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user