[Feature]cpu offload connector (#1659)

This PR implements cpu offload connector to enable NPU kv cache offload
to host DRAM.

- vLLM version: v0.10.2
- vLLM main:
5aeb925452

Signed-off-by: lidenghui <lidenghui1110@gmail.com>
Signed-off-by: AlvisGong <gwly0401@163.com>
Signed-off-by: CalvinXKY <kyxiezju@163.com>
Co-authored-by: AlvisGong <gwly0401@163.com>
This commit is contained in:
lidenghui1110
2025-09-23 14:25:05 +08:00
committed by GitHub
parent 96eb1ed408
commit 0f3939e5a9
10 changed files with 990 additions and 44 deletions

View File

@@ -554,7 +554,11 @@ class TestAscendMLAImpl(TestBase):
self.impl.num_kv_heads = self.impl.num_heads
decode_res, prefill_res = self.impl._mla_preprocess(
hidden_states, kv_cache, attn_metadata, need_gather_q_kv=False)
"mock_layer",
hidden_states,
kv_cache,
attn_metadata,
need_gather_q_kv=False)
self.assertIsNotNone(decode_res)
self.assertIsNotNone(prefill_res)