xc-llm-ascend/tests/ut/worker/test_model_runner_v1.py
HongtaoYang 80a4265717 [Feat] Support separate attention backend for target and draft model. (#7342)
### What this PR does / why we need it?
This PR enables separate attention backend configuration for the target and
draft models in speculative decoding, decoupling the attention backend
settings that were previously bound together for the two models.

It resolves the compatibility issue where some draft models do not support
the attention backend used by the target model, and lets users select the
optimal attention backend for each model individually to maximize inference
performance. The change is fully backward compatible.
---------
Signed-off-by: SidaoY <1024863041@qq.com>
2026-03-21 10:48:01 +08:00
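
To picture the decoupling the tests below exercise: each KV cache group now carries its own attention backend, so the runner can resolve the draft layer's backend independently of the target model's. The following is a minimal sketch of that idea in plain Python; `target_backend`, `draft_backend`, and the group objects are illustrative stand-ins, not the actual vllm-ascend API.

```python
from types import SimpleNamespace

# Illustrative stand-ins: the target model runs an MLA-style backend while
# the draft model runs a GQA-style one (the names are hypothetical).
target_backend = SimpleNamespace(name="ascend_mla")
draft_backend = SimpleNamespace(name="ascend_gqa")

# Hypothetical per-group wiring: each KV cache group pairs its layers with
# the backend those layers should use, instead of one runner-wide backend.
attn_groups = [
    SimpleNamespace(layer_names=["target_attn"], backend=target_backend),
    SimpleNamespace(layer_names=["draft_attn"], backend=draft_backend),
]

# Allocation/reshape can then look up the backend per layer.
backend_for_layer = {
    name: group.backend
    for group in attn_groups
    for name in group.layer_names
}
assert backend_for_layer["draft_attn"].name == "ascend_gqa"
```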

import unittest
from types import SimpleNamespace
from unittest.mock import MagicMock

import torch
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                        KVCacheGroupSpec, KVCacheTensor)

from vllm_ascend.worker.model_runner_v1 import NPUModelRunner


class TestNPUModelRunnerKVCache(unittest.TestCase):

    def _build_runner(self):
        # Bypass __init__ and stub only the attributes that the KV cache
        # allocation/reshape paths read.
        runner = NPUModelRunner.__new__(NPUModelRunner)
        runner.device = torch.device("cpu")
        runner.use_sparse = False
        runner.use_sparse_c8_indexer = False
        runner.use_hybrid_blocks = False
        runner.hybrid_with_attn_and_mamba = False
        runner.runner_only_attn_layers = set()
        runner.is_kv_consumer = False
        runner.vllm_config = MagicMock()
        runner.vllm_config.kv_transfer_config = None
        # The target model is MLA; the draft layer under test uses a plain
        # GQA-style full-attention spec, so the two must not share a layout.
        runner.model_config = MagicMock()
        runner.model_config.use_mla = True
        # Mock backend reporting the conventional
        # (K/V, num_blocks, block_size, num_kv_heads, head_size) cache shape.
        backend = MagicMock()
        backend.get_kv_cache_shape.side_effect = (
            lambda num_blocks, block_size, num_kv_heads, head_size: (
                2,
                num_blocks,
                block_size,
                num_kv_heads,
                head_size,
            ))
        runner.attn_backend = backend
        return runner

    def test_allocate_kv_cache_uses_layer_spec_for_draft_gqa(self):
        runner = self._build_runner()
        kv_cache_spec = FullAttentionSpec(
            block_size=16,
            num_kv_heads=8,
            head_size=64,
            head_size_v=64,
            dtype=torch.float16,
        )
        kv_cache_config = KVCacheConfig(
            num_blocks=2,
            kv_cache_tensors=[
                KVCacheTensor(size=kv_cache_spec.page_size_bytes * 2,
                              shared_by=["draft_attn"])
            ],
            kv_cache_groups=[
                KVCacheGroupSpec(layer_names=["draft_attn"],
                                 kv_cache_spec=kv_cache_spec)
            ],
        )
        kv_cache_raw_tensors = runner._allocate_kv_cache_tensors(kv_cache_config)
        # The raw buffer must be split per the draft layer's own spec: each
        # of K and V gets one page worth of bytes, not an MLA-sized slice.
        k_cache_raw, v_cache_raw = kv_cache_raw_tensors["draft_attn"]
        self.assertEqual(k_cache_raw.numel(), kv_cache_spec.page_size_bytes)
        self.assertEqual(v_cache_raw.numel(), kv_cache_spec.page_size_bytes)

    def test_reshape_kv_cache_uses_layer_spec_for_draft_gqa(self):
        runner = self._build_runner()
        kv_cache_spec = FullAttentionSpec(
            block_size=16,
            num_kv_heads=8,
            head_size=64,
            head_size_v=64,
            dtype=torch.float16,
        )
        kv_cache_config = KVCacheConfig(
            num_blocks=2,
            kv_cache_tensors=[
                KVCacheTensor(size=kv_cache_spec.page_size_bytes * 2,
                              shared_by=["draft_attn"])
            ],
            kv_cache_groups=[
                KVCacheGroupSpec(layer_names=["draft_attn"],
                                 kv_cache_spec=kv_cache_spec)
            ],
        )
        kv_cache_raw_tensors = runner._allocate_kv_cache_tensors(kv_cache_config)
        # Route the draft layer through its own attention group/backend,
        # mirroring the per-group backend lookup this PR introduces.
        runner._kv_cache_spec_attn_group_iterator = lambda: [
            SimpleNamespace(
                kv_cache_spec=kv_cache_spec,
                backend=runner.attn_backend,
                layer_names=["draft_attn"],
            )
        ]
        kv_caches = runner._reshape_kv_cache_tensors(kv_cache_config,
                                                     kv_cache_raw_tensors)
        # Each of K and V reshapes to
        # (num_blocks, block_size, num_kv_heads, head_size).
        k_cache, v_cache = kv_caches["draft_attn"]
        self.assertEqual(k_cache.shape, (2, 16, 8, 64))
        self.assertEqual(v_cache.shape, (2, 16, 8, 64))


if __name__ == "__main__":
    unittest.main()
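
The constants the tests assert can be checked by hand. Under the usual full-attention page layout (a K plane plus a V plane of `block_size x num_kv_heads x head_size` elements per block; that formula is an assumption here, not quoted from the source), the page size for the spec above works out as:

```python
import torch

block_size, num_kv_heads, head_size = 16, 8, 64
dtype_bytes = torch.finfo(torch.float16).bits // 8  # 2 bytes for float16

# K plane + V plane per block.
elements_per_page = 2 * block_size * num_kv_heads * head_size  # 16384
page_size_bytes = elements_per_page * dtype_bytes              # 32768

# This is the value compared against k_cache_raw.numel() (assuming the raw
# buffers are byte-typed), and (num_blocks, block_size, num_kv_heads,
# head_size) = (2, 16, 8, 64) is the per-plane shape asserted after reshape.
print(page_size_bytes)
```

From the repository root the file should run standalone with something like `python -m unittest tests.ut.worker.test_model_runner_v1`, provided `vllm` and `vllm_ascend` are importable.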