From 8786412f5cbf1f12626633ffb2ff2f9ff0da2674 Mon Sep 17 00:00:00 2001
From: baxingpiaochong <771405853@qq.com>
Date: Fri, 23 Jan 2026 19:47:33 +0800
Subject: [PATCH] [Bugfix]KV pool rank 0 consumes more HBM (#6113)
### What this PR does / why we need it?
Before adding set_device
after
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60
---------
Signed-off-by: baxingpiaochong <771405853@qq.com>
---
.../kv_pool/ascend_store/backend/mooncake_backend.py | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/backend/mooncake_backend.py b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/backend/mooncake_backend.py
index 25a103ca..3375e741 100644
--- a/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/backend/mooncake_backend.py
+++ b/vllm_ascend/distributed/kv_transfer/kv_pool/ascend_store/backend/mooncake_backend.py
@@ -2,6 +2,8 @@
import json
import os
import re
+import torch
+
from dataclasses import dataclass
from typing import Union
@@ -32,6 +34,7 @@ class MooncakeBackend(Backend):
"to run vLLM with MooncakeConnector.") from e
self.config = MooncakeStoreConfig.load_from_env()
self.store = MooncakeDistributedStore()
+ self.rank = parallel_config.rank
if self.config.protocol == "ascend":
local_hostname = get_ip()
transfer_engine = global_te.get_transfer_engine(local_hostname,
@@ -50,6 +53,10 @@ class MooncakeBackend(Backend):
logger.error(msg)
raise RuntimeError(msg)
+ def set_device(self):  # Calls torch.npu.set_device with the device "npu:<self.rank>".
+ device = torch.device(f"npu:{self.rank}")  # NOTE(review): self.rank is parallel_config.rank; confirm it is a valid node-local device index (multi-node runs) - TODO verify
+ torch.npu.set_device(device)  # presumably meant to keep this worker off the default device (commit subject: rank 0 HBM usage) - confirm with caller
+
def register_buffer(self, ptrs: list[int], lengths: list[int]):
global_te.register_buffer(ptrs, lengths)