fix issues

This commit is contained in:
2026-06-26 12:55:02 +08:00
parent 3d62430fd7
commit c84151eef9
9 changed files with 1879 additions and 5 deletions

View File

@@ -393,6 +393,20 @@ class PagedAttention:
# --------------------------------------------------------------
if ctx_len > 0:
num_ctx_blocks = (ctx_len + block_size - 1) // block_size
# Safety: if block_tables has fewer columns than num_ctx_blocks, this
# indicates a prefix_cache_hit + chunked-prefill bug in model_runner.py
# (Case 1 leaves prefix_cache_hit=True but block_table is only
# computed_block_nums, not the full context blocks).
# patch_model_runner.py fixes the root cause; this guard caps
# num_ctx_blocks to the available columns and warns on stderr, which
# prevents a zero-dim amax() crash if the bug still slips through.
if num_ctx_blocks > block_tables.shape[1]:
print(
f"[paged_attn WARNING] seq {i}: num_ctx_blocks={num_ctx_blocks} "
f"> block_tables.shape[1]={block_tables.shape[1]}, ctx_len={ctx_len}. "
"Block table is undersized (prefix_cache_hit bug). "
"Capping context to available blocks — attention may be incorrect.",
file=sys.stderr, flush=True)
num_ctx_blocks = block_tables.shape[1]
for tile_blk in range(0, num_ctx_blocks, _BLOCKS_PER_TILE):
blk_end = min(tile_blk + _BLOCKS_PER_TILE, num_ctx_blocks)
blk_ids = block_tables[i, tile_blk:blk_end]