Fix issues: guard paged attention against an undersized block table
This commit is contained in:
@@ -393,6 +393,20 @@ class PagedAttention:
|
||||
# --------------------------------------------------------------
|
||||
if ctx_len > 0:
|
||||
num_ctx_blocks = (ctx_len + block_size - 1) // block_size
|
||||
# Safety: if block_tables is too narrow, this indicates a
|
||||
# prefix_cache_hit + chunked-prefill bug in model_runner.py
|
||||
# (Case 1 leaves prefix_cache_hit=True but block_table is
|
||||
# only computed_block_nums, not the full context blocks).
|
||||
# patch_model_runner.py fixes the root cause; this guard
|
||||
# prevents a zero-dim amax() crash if it still slips through.
|
||||
if num_ctx_blocks > block_tables.shape[1]:
|
||||
print(
|
||||
f"[paged_attn WARNING] seq {i}: num_ctx_blocks={num_ctx_blocks} "
|
||||
f"> block_tables.shape[1]={block_tables.shape[1]}, ctx_len={ctx_len}. "
|
||||
"Block table is undersized (prefix_cache_hit bug). "
|
||||
"Capping context to available blocks — attention may be incorrect.",
|
||||
file=sys.stderr, flush=True)
|
||||
num_ctx_blocks = block_tables.shape[1]
|
||||
for tile_blk in range(0, num_ctx_blocks, _BLOCKS_PER_TILE):
|
||||
blk_end = min(tile_blk + _BLOCKS_PER_TILE, num_ctx_blocks)
|
||||
blk_ids = block_tables[i, tile_blk:blk_end]
|
||||
|
||||
Reference in New Issue
Block a user