[Feature] CPU offload connector (#1659)
This PR implements a CPU offload connector that enables offloading the
NPU KV cache to host DRAM.
- vLLM version: v0.10.2
- vLLM main: 5aeb925452
Signed-off-by: lidenghui <lidenghui1110@gmail.com>
Signed-off-by: AlvisGong <gwly0401@163.com>
Signed-off-by: CalvinXKY <kyxiezju@163.com>
Co-authored-by: AlvisGong <gwly0401@163.com>
This commit is contained in:
@@ -554,7 +554,11 @@ class TestAscendMLAImpl(TestBase):
|
||||
self.impl.num_kv_heads = self.impl.num_heads
|
||||
|
||||
decode_res, prefill_res = self.impl._mla_preprocess(
|
||||
hidden_states, kv_cache, attn_metadata, need_gather_q_kv=False)
|
||||
"mock_layer",
|
||||
hidden_states,
|
||||
kv_cache,
|
||||
attn_metadata,
|
||||
need_gather_q_kv=False)
|
||||
|
||||
self.assertIsNotNone(decode_res)
|
||||
self.assertIsNotNone(prefill_res)
|
||||
|
||||
@@ -328,4 +328,4 @@ def test_torchair_deepseek_v2_for_causal_lm(mock_distributed, vllm_config):
|
||||
"vllm.model_executor.model_loader.weight_utils.default_weight_loader"
|
||||
):
|
||||
loaded = model.load_weights(weights)
|
||||
assert loaded is not None
|
||||
assert loaded is not None
|
||||
|
||||
Reference in New Issue
Block a user