### What this PR does / why we need it?

This PR enables separate attention backend configuration for the target and draft models in speculative decoding, decoupling the previously shared attention backend setting between the two models. It resolves the compatibility issue where some draft models do not support the attention backend used by the target model, and lets users select the optimal attention backend for each model individually to maximize inference performance. The change is fully backward compatible.

---------

Signed-off-by: SidaoY <1024863041@qq.com>
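For context, a minimal sketch of a speculative-decoding launch through vLLM's offline `LLM` API. The model names and `speculative_config` contents are illustrative placeholders taken from vLLM's documented EAGLE example, and the exact configuration field this PR adds for choosing the draft model's attention backend is not shown here; see the diff for the actual knob:

```python
from vllm import LLM, SamplingParams

# Placeholder target/draft pair; assumes a recent vLLM where LLM() accepts a
# speculative_config dict. With this PR, the draft model's attention backend
# on Ascend is no longer forced to match the target model's backend.
llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    speculative_config={
        "method": "eagle",
        "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
        "num_speculative_tokens": 2,
    },
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```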
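The new unit test below (its location in the repo is assumed; it sits alongside the other worker tests) constructs an `NPUModelRunner` by hand and verifies that KV-cache allocation and reshaping follow the per-layer spec of a GQA draft layer even when the target model is configured for MLA: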
import unittest
from types import SimpleNamespace
from unittest.mock import MagicMock

import torch
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                        KVCacheGroupSpec, KVCacheTensor)

from vllm_ascend.worker.model_runner_v1 import NPUModelRunner


class TestNPUModelRunnerKVCache(unittest.TestCase):

    def _build_runner(self):
        # Build a bare NPUModelRunner without running __init__, wiring in only
        # the attributes touched by the KV-cache allocation/reshape paths.
        runner = NPUModelRunner.__new__(NPUModelRunner)
        runner.device = torch.device("cpu")
        runner.use_sparse = False
        runner.use_sparse_c8_indexer = False
        runner.use_hybrid_blocks = False
        runner.hybrid_with_attn_and_mamba = False
        runner.runner_only_attn_layers = set()
        runner.is_kv_consumer = False
        runner.vllm_config = MagicMock()
        runner.vllm_config.kv_transfer_config = None
        runner.model_config = MagicMock()
        # The target model is configured for MLA; the draft layer under test
        # uses a plain GQA FullAttentionSpec instead.
        runner.model_config.use_mla = True
        backend = MagicMock()
        # Mocked draft backend layout: the leading dimension of 2 holds K and V.
        backend.get_kv_cache_shape.side_effect = lambda num_blocks, block_size, num_kv_heads, head_size: (
            2,
            num_blocks,
            block_size,
            num_kv_heads,
            head_size,
        )
        runner.attn_backend = backend
        return runner

    def test_allocate_kv_cache_uses_layer_spec_for_draft_gqa(self):
        runner = self._build_runner()
        kv_cache_spec = FullAttentionSpec(
            block_size=16,
            num_kv_heads=8,
            head_size=64,
            head_size_v=64,
            dtype=torch.float16,
        )
        kv_cache_config = KVCacheConfig(
            num_blocks=2,
            kv_cache_tensors=[
                KVCacheTensor(size=kv_cache_spec.page_size_bytes * 2,
                              shared_by=["draft_attn"])
            ],
            kv_cache_groups=[
                KVCacheGroupSpec(layer_names=["draft_attn"],
                                 kv_cache_spec=kv_cache_spec)
            ],
        )

        kv_cache_raw_tensors = runner._allocate_kv_cache_tensors(kv_cache_config)
        k_cache_raw, v_cache_raw = kv_cache_raw_tensors["draft_attn"]

        # The per-layer raw tensor (page_size_bytes * 2) is split into equal
        # K and V halves, sized from the draft layer's own spec.
        self.assertEqual(k_cache_raw.numel(), kv_cache_spec.page_size_bytes)
        self.assertEqual(v_cache_raw.numel(), kv_cache_spec.page_size_bytes)

    def test_reshape_kv_cache_uses_layer_spec_for_draft_gqa(self):
        runner = self._build_runner()
        kv_cache_spec = FullAttentionSpec(
            block_size=16,
            num_kv_heads=8,
            head_size=64,
            head_size_v=64,
            dtype=torch.float16,
        )
        kv_cache_config = KVCacheConfig(
            num_blocks=2,
            kv_cache_tensors=[
                KVCacheTensor(size=kv_cache_spec.page_size_bytes * 2,
                              shared_by=["draft_attn"])
            ],
            kv_cache_groups=[
                KVCacheGroupSpec(layer_names=["draft_attn"],
                                 kv_cache_spec=kv_cache_spec)
            ],
        )
        kv_cache_raw_tensors = runner._allocate_kv_cache_tensors(kv_cache_config)
        # Route the draft layer through the mocked backend and its GQA spec.
        runner._kv_cache_spec_attn_group_iterator = lambda: [
            SimpleNamespace(
                kv_cache_spec=kv_cache_spec,
                backend=runner.attn_backend,
                layer_names=["draft_attn"],
            )
        ]

        kv_caches = runner._reshape_kv_cache_tensors(kv_cache_config,
                                                     kv_cache_raw_tensors)
        k_cache, v_cache = kv_caches["draft_attn"]

        # (num_blocks, block_size, num_kv_heads, head_size) comes from the
        # draft layer's spec rather than the target model's MLA settings.
        self.assertEqual(k_cache.shape, (2, 16, 8, 64))
        self.assertEqual(v_cache.shape, (2, 16, 8, 64))


if __name__ == "__main__":
    unittest.main()
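As a quick sanity check on the expected numbers, a standalone sketch of the shape arithmetic, assuming the per-layer raw tensor of `page_size_bytes * 2` bytes is split into equal K and V halves and viewed with the mocked backend's layout:

```python
import torch

num_blocks, block_size, num_kv_heads, head_size = 2, 16, 8, 64
elem_size = torch.tensor([], dtype=torch.float16).element_size()  # 2 bytes

# Assumed page size: K plus V for one block, matching the spec in the tests.
page_size_bytes = 2 * block_size * num_kv_heads * head_size * elem_size
assert page_size_bytes == 32768

# Mocked backend layout is (2, num_blocks, block_size, num_kv_heads, head_size);
# dropping the leading K/V dimension gives the per-cache shape asserted above.
per_cache_shape = (num_blocks, block_size, num_kv_heads, head_size)
assert per_cache_shape == (2, 16, 8, 64)
assert num_blocks * block_size * num_kv_heads * head_size * elem_size == page_size_bytes
```

The test module can be run on its own with the standard library runner, e.g. `python -m unittest <path.to.this_test_module>` (the module path depends on where the file lives in the repo), or through pytest.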