From 8786412f5cbf1f12626633ffb2ff2f9ff0da2674 Mon Sep 17 00:00:00 2001 From: baxingpiaochong <771405853@qq.com> Date: Fri, 23 Jan 2026 19:47:33 +0800 Subject: [PATCH] [Bugfix]KV pool rank 0 consumes more HBM (#6113) ### What this PR does / why we need it? before add_set_device image after image ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60 --------- Signed-off-by: baxingpiaochong <771405853@qq.com> --- .../kv_pool/ascend_store/backend/mooncake_backend.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/backend/mooncake_backend.py b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/backend/mooncake_backend.py index 25a103ca..3375e741 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/backend/mooncake_backend.py +++ b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/backend/mooncake_backend.py @@ -2,6 +2,8 @@ import json import os import re +import torch + from dataclasses import dataclass from typing import Union @@ -32,6 +34,7 @@ class MooncakeBackend(Backend): "to run vLLM with MooncakeConnector.") from e self.config = MooncakeStoreConfig.load_from_env() self.store = MooncakeDistributedStore() + self.rank = parallel_config.rank if self.config.protocol == "ascend": local_hostname = get_ip() transfer_engine = global_te.get_transfer_engine(local_hostname, @@ -50,6 +53,10 @@ class MooncakeBackend(Backend): logger.error(msg) raise RuntimeError(msg) + def set_device(self): + device = torch.device(f"npu:{self.rank}") + torch.npu.set_device(device) + def register_buffer(self, ptrs: list[int], lengths: list[int]): global_te.register_buffer(ptrs, lengths)