[BugFix] Improve the performance of prefixcache features (#4022)

### What this PR does / why we need it?
A code bug caused an empty pipeline bubble: when the npu_paged_cache_load
operator was called, it forcibly transferred seq_len2 to the device,
which triggered a synchronization and interrupted the CPU operator's
launch stream.

- vLLM version: v0.11.0
- vLLM main:
83f478bb19

---------

Signed-off-by: underfituu <hzhucong@163.com>
This commit is contained in:
hucong
2025-11-08 18:45:31 +08:00
committed by GitHub
parent 1d81a289d0
commit 48094148f8
5 changed files with 31 additions and 10 deletions

View File

@@ -82,7 +82,8 @@ class TestAscendMLAPrefillMetadata(TestBase):
seq_tot=seq_tot,
max_seq_lens=max_seq_lens,
workspace=workspace,
chunk_seq_lens=chunk_seq_lens)
chunk_seq_lens=chunk_seq_lens,
chunk_seq_lens_npu=chunk_seq_lens)
metadata = AscendMLAPrefillMetadata(
attn_mask=torch.tensor([[1, 0], [1, 1]], dtype=torch.bool),
@@ -103,6 +104,8 @@ class TestAscendMLAPrefillMetadata(TestBase):
self.assertEqual(metadata.chunked_context.max_seq_lens, max_seq_lens)
self.assertIs(metadata.chunked_context.workspace, workspace)
self.assertIs(metadata.chunked_context.chunk_seq_lens, chunk_seq_lens)
self.assertIs(metadata.chunked_context.chunk_seq_lens_npu,
chunk_seq_lens)
class TestAscendMLADecodeMetadata(TestBase):
@@ -478,6 +481,7 @@ class TestAscendMLAImpl(TestBase):
chunk_ctx = MagicMock()
chunk_ctx.seq_tot = [8]
chunk_ctx.chunk_seq_lens = [torch.tensor([8])]
chunk_ctx.chunk_seq_lens_npu = [torch.tensor([8])]
chunk_ctx.starts = [torch.tensor([0])]
prefill_meta = MagicMock()

View File

@@ -86,7 +86,8 @@ class TestAscendMLATorchairPrefillMetadata(TestBase):
seq_tot=seq_tot,
max_seq_lens=max_seq_lens,
workspace=workspace,
chunk_seq_lens=chunk_seq_lens)
chunk_seq_lens=chunk_seq_lens,
chunk_seq_lens_npu=chunk_seq_lens)
metadata = AscendMLATorchairPrefillMetadata(
attn_mask=torch.tensor([[1, 0], [1, 1]], dtype=torch.bool),
@@ -107,6 +108,8 @@ class TestAscendMLATorchairPrefillMetadata(TestBase):
self.assertEqual(metadata.chunked_context.max_seq_lens, max_seq_lens)
self.assertIs(metadata.chunked_context.workspace, workspace)
self.assertIs(metadata.chunked_context.chunk_seq_lens, chunk_seq_lens)
self.assertIs(metadata.chunked_context.chunk_seq_lens_npu,
chunk_seq_lens)
class TestAscendMLATorchairDecodeMetadata(TestBase):
@@ -661,6 +664,7 @@ class TestAscendMLATorchairImpl(TestBase):
chunk_ctx = MagicMock()
chunk_ctx.seq_tot = [8]
chunk_ctx.chunk_seq_lens = [torch.tensor([8])]
chunk_ctx.chunk_seq_lens_npu = [torch.tensor([8])]
chunk_ctx.starts = [torch.tensor([0])]
prefill_meta = MagicMock()