[Feature] CPU offload connector (#1659)

This PR implements a CPU offload connector that enables offloading the NPU KV cache to host DRAM. - vLLM version: v0.10.2 - vLLM main: 5aeb925452 Signed-off-by: lidenghui <lidenghui1110@gmail.com> Signed-off-by: AlvisGong <gwly0401@163.com> Signed-off-by: CalvinXKY <kyxiezju@163.com> Co-authored-by: AlvisGong <gwly0401@163.com>
2025-09-23 14:25:05 +08:00
parent 96eb1ed408
commit 0f3939e5a9
10 changed files with 990 additions and 44 deletions
--- a/tests/ut/attention/test_mla_v1.py
+++ b/tests/ut/attention/test_mla_v1.py
@@ -554,7 +554,11 @@ class TestAscendMLAImpl(TestBase):
        self.impl.num_kv_heads = self.impl.num_heads

        decode_res, prefill_res = self.impl._mla_preprocess(
-            hidden_states, kv_cache, attn_metadata, need_gather_q_kv=False)
+            "mock_layer",
+            hidden_states,
+            kv_cache,
+            attn_metadata,
+            need_gather_q_kv=False)

        self.assertIsNotNone(decode_res)
        self.assertIsNotNone(prefill_res)
--- a/tests/ut/torchair/models/test_torchair_deepseek_v2.py
+++ b/tests/ut/torchair/models/test_torchair_deepseek_v2.py
@@ -328,4 +328,4 @@ def test_torchair_deepseek_v2_for_causal_lm(mock_distributed, vllm_config):
            "vllm.model_executor.model_loader.weight_utils.default_weight_loader"
    ):
        loaded = model.load_weights(weights)
-        assert loaded is not None
+        assert loaded is not None