[Quickfix] Fix dp+ep+tp error when sp chunked the hidden_states (#3246)
### What this PR does / why we need it?
Fix dp+ep+tp inplace copy error when sp chunked the `hidden_states`.
### How was this patch tested?
Tested locally with the following script:
```bash
python examples/offline_data_parallel.py \
--model="Qwen/Qwen3-30B-A3B" \
--dp-size=2 \
--tp-size=2 \
--enable-expert-parallel
```
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
```diff
@@ -295,6 +295,7 @@ class AscendFusedMoE(FusedMoE):
             in_dtype=params_dtype,
         )
         self.moe_config = moe
+        # TODO: The self.moe_config.tp_size here is not correct, fixme soon

         if quant_config is None:
             self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
```
```diff
@@ -16,6 +16,7 @@
 #

 import gc
+import os
 from datetime import timedelta
 from typing import TYPE_CHECKING, Optional, Tuple

```
```diff
@@ -260,6 +261,8 @@ class NPUPlatform(Platform):
             compilation_config.level = CompilationLevel.NO_COMPILATION

         if parallel_config and parallel_config.worker_cls == "auto":
+            # TODO: this is a tricky way to disable `use_sequence_parallel_moe` in vllm.
+            os.environ["VLLM_ALL2ALL_BACKEND"] = "flashinfer_all2allv"
             if ascend_config.torchair_graph_config.enabled or ascend_config.enable_shared_expert_dp:
                 parallel_config.worker_cls = "vllm_ascend.torchair.torchair_worker.NPUTorchairWorker"
             else:
```
Reference in New Issue
Block a user