[Feat] Multi-stream for eplb heat collection and aggregation (#4214)
### What this PR does / why we need it?
This PR optimizes multi-stream execution for EPLB heat collection and aggregation.
- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.12.0
--------- Signed-off-by: daishixun <dsxsteven@sina.com> Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -18,6 +18,9 @@
|
||||
import types
|
||||
|
||||
import torch
|
||||
import torch_npu
|
||||
|
||||
_MOE_LOAD_ASYNC_STREAM = None
|
||||
|
||||
|
||||
def get_expert_map(self, layer_id):
|
||||
@@ -75,3 +78,12 @@ def model_register(model, model_config):
|
||||
model.num_moe_layers = config.num_hidden_layers - model.num_dense_layers
|
||||
else:
|
||||
raise NotImplementedError("EPLB is not supported.")
|
||||
|
||||
|
||||
def moe_load_async_stream() -> torch_npu.npu.Stream:
    """Return the shared NPU side stream used for async MoE load (heat) collection.

    The stream is created lazily on first call and cached in the module-level
    ``_MOE_LOAD_ASYNC_STREAM`` so every caller reuses the same side stream for
    overlapping expert-load heat collection/aggregation with the main stream.

    Returns:
        torch_npu.npu.Stream: the cached dedicated stream (never the default
        stream — a new stream object is constructed on first use).

    NOTE(review): not thread-safe — two threads racing on the first call could
    each construct a stream; confirm initialization happens single-threaded.
    """
    global _MOE_LOAD_ASYNC_STREAM
    if _MOE_LOAD_ASYNC_STREAM is None:
        # Lazily create a dedicated side stream. (The previous comment said the
        # *default* stream is returned here, which contradicted the code: a new
        # torch_npu stream is constructed and cached.)
        _MOE_LOAD_ASYNC_STREAM = torch_npu.npu.Stream()
    return _MOE_LOAD_ASYNC_STREAM
|
||||
Reference in New Issue
Block a user