[Cherry-pick] Port MoE multi-stream fix to v0.11.0-dev (#3753)
This PR moves the communication operation of the shared experts out of the extra stream, because it could cause rtMemcpy-related errors when running shared experts in multi-stream mode with aclgraph. Additionally, a global variable is used as the extra-stream object to avoid allocating a separate stream for each layer in full-graph mode.

Signed-off-by: whx-sjtu <2952154980@qq.com>
This commit is contained in:
@@ -52,6 +52,7 @@ _IS_310P = None
|
||||
_SLEEP_MODE_ENABLED = None
|
||||
_CURRENT_STREAM = None
|
||||
_PREFETCH_STREAM = None
|
||||
_SHARED_EXPERTS_COMPUTE_STREAM = None
|
||||
_ASCEND_CUSTOMOP_IS_REIGISTERED = False
|
||||
_DEFAULT_BUFFER_SIZE = 200
|
||||
_MIN_DP_BUFFER_SIZE = 50
|
||||
@@ -259,6 +260,15 @@ def prefetch_stream() -> torch.npu.Stream:
|
||||
return _PREFETCH_STREAM
|
||||
|
||||
|
||||
def shared_experts_compute_stream() -> torch.npu.Stream:
    """Return the process-wide extra stream for shared-experts compute.

    The stream is created lazily on first call and cached in the
    module-level ``_SHARED_EXPERTS_COMPUTE_STREAM`` global, so every
    layer reuses one stream instead of allocating a new one per layer
    (important in full-graph mode, per the porting commit).

    Returns:
        torch.npu.Stream: the cached shared-experts compute stream.
    """
    global _SHARED_EXPERTS_COMPUTE_STREAM
    if _SHARED_EXPERTS_COMPUTE_STREAM is None:
        # Lazily allocate a dedicated NPU stream and cache it globally.
        # NOTE(review): the previous comment claimed the *default* stream
        # is returned here, but a brand-new stream is created — the
        # comment was copied from a sibling helper and was misleading.
        _SHARED_EXPERTS_COMPUTE_STREAM = torch_npu.npu.Stream()
    return _SHARED_EXPERTS_COMPUTE_STREAM
|
||||
|
||||
|
||||
def adapt_patch(is_global_patch: bool = False):
|
||||
if is_global_patch:
|
||||
from vllm_ascend.patch import platform # noqa: F401
|
||||
|
||||
Reference in New Issue
Block a user