[bugfix] fix deepseek rope sincoscache re-generation (#2744)

### What this PR does / why we need it?

The current implementation repeatedly regenerates the rope `sin_cos_cache` whenever `kv_seqlen` exceeds 4k, because the `sin_cos_cache` is initialized with a length of only 4k.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

After this PR is merged, `sin_cos_cache` no longer grows inside the forward function, so `test_native_rope_deepseek_forward_cache_handling` is no longer necessary.

- vLLM version: v0.10.1.1
- vLLM main: 60f0843ef8

Signed-off-by: zzzzwwjj <1183291235@qq.com>
2025-09-08 22:03:34 +08:00
parent 7d6d9449a8
commit 4df8df5b94
5 changed files with 63 additions and 79 deletions
--- a/tests/ut/torchair/ops/test_torchair_rotary_embedding.py
+++ b/tests/ut/torchair/ops/test_torchair_rotary_embedding.py
@@ -5,8 +5,9 @@ import torch

 from tests.ut.base import TestBase
 from vllm_ascend.torchair.ops.torchair_rotary_embedding import (
-    custom_rotary_embedding_enabled, native_rope_deepseek_forward,
-    rope_forward_oot, rotate_half, yarn_find_correction_dim, yarn_get_mscale)
+    _set_cos_sin_cache, custom_rotary_embedding_enabled,
+    native_rope_deepseek_forward, rope_forward_oot, rotate_half,
+    yarn_find_correction_dim, yarn_get_mscale)


 class TestCustomRotaryEmbeddingEnabled(TestBase):
@@ -200,6 +201,28 @@ class MockRopeModule:
        self.sin_cached = None
        self.rotary_dim = 1
        self.base = 1
+        self.beta_fast = 32
+        self.beta_slow = 1
+        self.max_position_embeddings = 4096
+        self.mscale = 1.0
+        self.scaling_factor = 40
+
+    def register_buffer(self):
+        pass
+
+
+class TestSetSinCosCache(TestBase):
+
+    def test_set_cos_sin_cache(self):
+        module = MockRopeModule()
+
+        with patch.object(module, "register_buffer") as mock_register_buffer:
+            _set_cos_sin_cache(module,
+                               1024,
+                               device="cpu",
+                               dtype=torch.bfloat16)
+
+        mock_register_buffer.assert_called()


 class TestNativeRopeDeepseekForward(TestBase):
@@ -220,30 +243,6 @@ class TestNativeRopeDeepseekForward(TestBase):
        assert q_pe.shape == query.shape
        assert k_pe.shape == key.shape

-    @patch(
-        'vllm_ascend.torchair.ops.torchair_rotary_embedding._set_cos_sin_cache'
-    )
-    @patch(
-        'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot')
-    def test_native_rope_deepseek_forward_cache_handling(
-            self, mock_rope_forward_oot, mock_set_cache):
-        # Test cache situation is true
-        module = MockRopeModule(max_seq_len=1024)
-        positions = torch.tensor([1, 2, 3])
-        query = torch.randn(1, 8, 128)
-        key = torch.randn(1, 8, 128)
-
-        mock_rope_forward_oot.return_value = (query, key)
-
-        q_pe, k_pe = native_rope_deepseek_forward(module,
-                                                  positions,
-                                                  query,
-                                                  key,
-                                                  max_seq_len=2048)
-
-        assert q_pe.shape == query.shape
-        assert k_pe.shape == key.shape
-
    @patch(
        'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot')
    def test_native_rope_deepseek_forward_key_reshaping(