[bugfix] fix deepseek rope sincoscache re-generation (#2744)

### What this PR does / why we need it? The current implementation repeatedly regenerates the `sin_cos_cache` in rope when `kv_seqlen` > 4k, because the `sin_cos_cache` is initialized with a length of only 4k. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? After this PR is merged, the `sin_cos_cache` no longer grows in the forward function, so `test_native_rope_deepseek_forward_cache_handling` is no longer necessary. - vLLM version: v0.10.1.1 - vLLM main: 60f0843ef8 Signed-off-by: zzzzwwjj <1183291235@qq.com>
2025-09-08 22:03:34 +08:00
parent 7d6d9449a8
commit 4df8df5b94
5 changed files with 63 additions and 79 deletions
--- a/tests/ut/ops/test_rotary_embedding.py
+++ b/tests/ut/ops/test_rotary_embedding.py
@@ -157,6 +157,28 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
        args, kwargs = mock_npu_rotary.call_args
        self.assertFalse(args[-1])

+    @patch('vllm_ascend.ops.rotary_embedding._custom_rotary_embedding_enabled',
+           return_value=False)
+    @patch('torch_npu._npu_rotary_embedding')
+    def test_rope_forward_oot_rotary_dim_less_than_head_size(
+            self, mock_npu_rotary, mock_custom_enabled):
+        mock_config = MagicMock()
+        mock_config.torchair_graph_config.enabled = False
+
+        # Case under test: rotary_dim smaller than head_size (head_size // 2).
+        org_rotary_dim = self.layer.rotary_dim
+        self.layer.rotary_dim = self.layer.head_size // 2
+
+        result_q, result_k = self.layer.forward(self.positions, self.query,
+                                                self.key)
+
+        mock_npu_rotary.assert_called_once()
+        self.assertEqual(result_q.shape, self.query.shape)
+        self.assertEqual(result_k.shape, self.key.shape)
+
+        # Put back the rotary_dim value saved before the override above.
+        self.layer.rotary_dim = org_rotary_dim
+

 class MockRopeModule:

@@ -207,28 +229,6 @@ class TestAscendDeepseekScalingRotaryEmbedding(TestBase):
        assert q_pe.shape == self.query.shape
        assert k_pe.shape == self.key.shape

-    @patch('vllm_ascend.ops.rotary_embedding._rope_forward_oot')
-    @patch("vllm.platforms.current_platform.device_type",
-           new=torch.device("cpu"))
-    @patch("vllm_ascend.ops.rotary_embedding.NPUPlatform",
-           new_callable=PropertyMock)
-    def test_native_rope_deepseek_forward_cache_handling(
-            self, mock_npuplatform, mock_rope_forward_oot):
-        mock_npuplatform.device_type = torch.device("cpu")
-        self.layer = self._create_layer()
-        self.layer.max_seq_len = 1024
-        # Test cache situation is true
-        with patch.object(self.layer, "_set_cos_sin_cache") as mock_set_cache:
-            mock_rope_forward_oot.return_value = (self.query, self.key)
-
-            q_pe, k_pe = self.layer.forward(self.positions,
-                                            self.query,
-                                            self.key,
-                                            max_seq_len=2048)
-        mock_set_cache.assert_called_once()
-        assert q_pe.shape == self.query.shape
-        assert k_pe.shape == self.key.shape
-
    @patch('vllm_ascend.ops.rotary_embedding._rope_forward_oot')
    @patch("vllm.platforms.current_platform.device_type",
           new=torch.device("cpu"))