[Feat] Multi-stream for eplb heat collection and aggregation (#4214)
### What this PR does / why we need it?
This PR optimizes multi-stream execution for EPLB heat collection and aggregation.
- vLLM version: v0.12.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.12.0
--------- Signed-off-by: daishixun <dsxsteven@sina.com> Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -18,6 +18,9 @@
|
||||
import types
|
||||
|
||||
import torch
|
||||
import torch_npu
|
||||
|
||||
_MOE_LOAD_ASYNC_STREAM = None
|
||||
|
||||
|
||||
def get_expert_map(self, layer_id):
|
||||
@@ -75,3 +78,12 @@ def model_register(model, model_config):
|
||||
model.num_moe_layers = config.num_hidden_layers - model.num_dense_layers
|
||||
else:
|
||||
raise NotImplementedError("EPLB is not supported.")
|
||||
|
||||
|
||||
def moe_load_async_stream() -> torch_npu.npu.Stream:
    """Return the shared NPU side stream used for async MoE load (heat) collection.

    The stream is created lazily on first call and cached in the module-level
    ``_MOE_LOAD_ASYNC_STREAM`` so every caller reuses the same side stream for
    overlapping expert-load heat collection/aggregation with the main stream.

    Returns:
        torch_npu.npu.Stream: the cached dedicated stream (never the default
        stream — a new stream object is constructed on first use).

    NOTE(review): not thread-safe — two threads racing on the first call could
    each construct a stream; confirm initialization happens single-threaded.
    """
    global _MOE_LOAD_ASYNC_STREAM
    if _MOE_LOAD_ASYNC_STREAM is None:
        # Lazily create a dedicated side stream. (The previous comment said the
        # *default* stream is returned here, which contradicted the code: a new
        # torch_npu stream is constructed and cached.)
        _MOE_LOAD_ASYNC_STREAM = torch_npu.npu.Stream()
    return _MOE_LOAD_ASYNC_STREAM
|
||||
Reference in New Issue
Block a user