[bugfix] fix deepseek rope sincoscache re-generation (#2744)

### What this PR does / why we need it?
The current implementation repeatedly regenerates the `sin_cos_cache` in
RoPE whenever `kv_seqlen` exceeds 4k, because the cache is only
initialized with a length of 4k.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
After this PR is merged, the `sin_cos_cache` will no longer grow inside the
forward function, so `test_native_rope_deepseek_forward_cache_handling` is no
longer necessary.

- vLLM version: v0.10.1.1
- vLLM main:
60f0843ef8

Signed-off-by: zzzzwwjj <1183291235@qq.com>
This commit is contained in:
zzzzwwjj
2025-09-08 22:03:34 +08:00
committed by GitHub
parent 7d6d9449a8
commit 4df8df5b94
5 changed files with 63 additions and 79 deletions

View File

@@ -157,6 +157,28 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
args, kwargs = mock_npu_rotary.call_args
self.assertFalse(args[-1])
@patch('vllm_ascend.ops.rotary_embedding._custom_rotary_embedding_enabled',
       return_value=False)
@patch('torch_npu._npu_rotary_embedding')
def test_rope_forward_oot_rotary_dim_less_than_head_size(
        self, mock_npu_rotary, mock_custom_enabled):
    """Forward with rotary_dim < head_size should still dispatch to the
    torch_npu rotary kernel and preserve the query/key shapes.

    The custom rotary embedding path is patched off so the test pins the
    out-of-tree (OOT) fallback behavior.
    """
    # Removed an unused `mock_config = MagicMock()` local: nothing in this
    # test reads it, so it was dead setup code.
    # Shrink rotary_dim below head_size to exercise the partial-rotary path;
    # save the original so the shared fixture layer is restored afterwards.
    org_rotary_dim = self.layer.rotary_dim
    self.layer.rotary_dim = self.layer.head_size // 2
    result_q, result_k = self.layer.forward(self.positions, self.query,
                                            self.key)
    # The OOT path must call torch_npu's rotary kernel exactly once.
    mock_npu_rotary.assert_called_once()
    # Applying rotary embedding must not change tensor shapes.
    self.assertEqual(result_q.shape, self.query.shape)
    self.assertEqual(result_k.shape, self.key.shape)
    # Restore rotary_dim so later tests see the original fixture state.
    self.layer.rotary_dim = org_rotary_dim
class MockRopeModule:
@@ -207,28 +229,6 @@ class TestAscendDeepseekScalingRotaryEmbedding(TestBase):
assert q_pe.shape == self.query.shape
assert k_pe.shape == self.key.shape
@patch('vllm_ascend.ops.rotary_embedding._rope_forward_oot')
@patch("vllm.platforms.current_platform.device_type",
       new=torch.device("cpu"))
@patch("vllm_ascend.ops.rotary_embedding.NPUPlatform",
       new_callable=PropertyMock)
def test_native_rope_deepseek_forward_cache_handling(
        self, mock_npuplatform, mock_rope_forward_oot):
    """Verify the cos/sin cache is regenerated when the requested
    max_seq_len (2048) exceeds the layer's cached length (1024).

    NOTE(review): this test pins the lazy cache-regrowth behavior that the
    surrounding commit removes (the cache is now fully sized at init), which
    is presumably why the diff deletes it.
    """
    # Force CPU so the layer can be built without NPU hardware.
    mock_npuplatform.device_type = torch.device("cpu")
    self.layer = self._create_layer()
    self.layer.max_seq_len = 1024
    # Test cache situation is true: spy on _set_cos_sin_cache so we can
    # assert forward() triggers a cache rebuild without doing real work.
    with patch.object(self.layer, "_set_cos_sin_cache") as mock_set_cache:
        mock_rope_forward_oot.return_value = (self.query, self.key)
        q_pe, k_pe = self.layer.forward(self.positions,
                                        self.query,
                                        self.key,
                                        max_seq_len=2048)
        # 2048 > 1024 cached length, so exactly one regeneration is expected.
        mock_set_cache.assert_called_once()
        # Shapes must pass through unchanged.
        assert q_pe.shape == self.query.shape
        assert k_pe.shape == self.key.shape
@patch('vllm_ascend.ops.rotary_embedding._rope_forward_oot')
@patch("vllm.platforms.current_platform.device_type",
new=torch.device("cpu"))

View File

@@ -5,8 +5,9 @@ import torch
from tests.ut.base import TestBase
from vllm_ascend.torchair.ops.torchair_rotary_embedding import (
custom_rotary_embedding_enabled, native_rope_deepseek_forward,
rope_forward_oot, rotate_half, yarn_find_correction_dim, yarn_get_mscale)
_set_cos_sin_cache, custom_rotary_embedding_enabled,
native_rope_deepseek_forward, rope_forward_oot, rotate_half,
yarn_find_correction_dim, yarn_get_mscale)
class TestCustomRotaryEmbeddingEnabled(TestBase):
@@ -200,6 +201,28 @@ class MockRopeModule:
self.sin_cached = None
self.rotary_dim = 1
self.base = 1
self.beta_fast = 32
self.beta_slow = 1
self.max_position_embeddings = 4096
self.mscale = 1.0
self.scaling_factor = 40
def register_buffer(self):
    """No-op stand-in for nn.Module.register_buffer; tests spy on it
    via patch.object to observe cache registration without real buffers."""
    pass
class TestSetSinCosCache(TestBase):
    """Unit tests for the standalone _set_cos_sin_cache helper."""

    def test_set_cos_sin_cache(self):
        """Building the cache must publish its tensors through the
        module's register_buffer hook."""
        rope_stub = MockRopeModule()
        # Spy on register_buffer so we can observe the helper's output
        # without attaching real buffers to the stub.
        with patch.object(rope_stub, "register_buffer") as buffer_spy:
            _set_cos_sin_cache(rope_stub,
                               1024,
                               device="cpu",
                               dtype=torch.bfloat16)
        # The helper is expected to have registered at least one buffer.
        buffer_spy.assert_called()
class TestNativeRopeDeepseekForward(TestBase):
@@ -220,30 +243,6 @@ class TestNativeRopeDeepseekForward(TestBase):
assert q_pe.shape == query.shape
assert k_pe.shape == key.shape
@patch(
    'vllm_ascend.torchair.ops.torchair_rotary_embedding._set_cos_sin_cache'
)
@patch(
    'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot')
def test_native_rope_deepseek_forward_cache_handling(
        self, mock_rope_forward_oot, mock_set_cache):
    """Forward with max_seq_len (2048) above the module's cached length
    (1024) should still succeed and keep the q/k shapes.

    NOTE(review): the deleted diff hunk removes this test; per the commit
    message the cache no longer regrows in forward, so the scenario it
    covered no longer exists. `mock_set_cache` is installed (bottom-up
    decorator order gives it the last parameter slot) only to keep the
    real cache builder from running.
    """
    # Test cache situation is true: module advertises a 1024-long cache.
    module = MockRopeModule(max_seq_len=1024)
    positions = torch.tensor([1, 2, 3])
    query = torch.randn(1, 8, 128)
    key = torch.randn(1, 8, 128)
    # Short-circuit the OOT kernel so forward returns the inputs unchanged.
    mock_rope_forward_oot.return_value = (query, key)
    q_pe, k_pe = native_rope_deepseek_forward(module,
                                              positions,
                                              query,
                                              key,
                                              max_seq_len=2048)
    # Rotary embedding must preserve tensor shapes.
    assert q_pe.shape == query.shape
    assert k_pe.shape == key.shape
@patch(
'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot')
def test_native_rope_deepseek_forward_key_reshaping(