Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/tests/kernels/test_cache_kernels.py
+++ b/tests/kernels/test_cache_kernels.py
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for CUDA kernels in cache_kernels.cu."""
+
+import pytest
+import torch
+
+try:
+    from vllm import _custom_ops as ops
+except ImportError:
+    pytest.skip(
+        "Could not import vllm._custom_ops. (pip install -e .)", allow_module_level=True
+    )
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Need CUDA device")
+def test_gather_cache_oob():
+    """
+    Tests for OOB read in gather_and_maybe_dequant_cache (Issue #27909).
+    This test constructs a boundary case identified in the issue where
+    seq_starts causes the block_table offset to read out of bounds.
+    """
+
+    batch_size = 1
+    block_size = 64
+    entry_size = 128
+
+    block_table = torch.tensor([[1, 2]], dtype=torch.int32, device="cuda")
+
+    # This will result in offset = 128 / block_size = 128 / 64 = 2
+    # This will cause the kernel to try to read from
+    # block_table[0, 2], but its size is only 2.
+    seq_starts = torch.tensor([128], dtype=torch.int32, device="cuda")
+
+    seq_len = 65
+    cu_seq_lens = torch.tensor([0, seq_len], dtype=torch.int32, device="cuda")
+
+    # src_cache: [num_blocks, block_size, entry_size]
+    num_blocks = 5
+    src_cache = torch.randn(
+        (num_blocks, block_size, entry_size), dtype=torch.float16, device="cuda"
+    )
+
+    dst = torch.empty((seq_len, entry_size), dtype=torch.float16, device="cuda")
+
+    scale = torch.tensor([1.0], dtype=torch.float32, device="cuda")
+
+    # Calling the C++ function gather_and_maybe_dequant_cache
+    ops.gather_and_maybe_dequant_cache(
+        src_cache,
+        dst,
+        block_table,
+        cu_seq_lens,
+        batch_size,
+        "auto",  # kv_cache_dtype
+        scale,
+        seq_starts,
+    )
+
+    torch.cuda.synchronize()
+    assert True
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])