fix issues

2026-06-26 12:55:02 +08:00
parent 3d62430fd7
commit c84151eef9
9 changed files with 1879 additions and 5 deletions
--- a/qwen3_6_scripts/paged_attn.py
+++ b/qwen3_6_scripts/paged_attn.py
@@ -393,6 +393,20 @@ class PagedAttention:
                # --------------------------------------------------------------
                if ctx_len > 0:
                    num_ctx_blocks = (ctx_len + block_size - 1) // block_size
+                    # Safety guard: a block table narrower than the context
+                    # indicates the prefix_cache_hit + chunked-prefill bug in
+                    # model_runner.py (Case 1 leaves prefix_cache_hit=True while
+                    # block_table holds only computed_block_nums, not the full
+                    # context blocks). patch_model_runner.py fixes the root cause;
+                    # this guard prevents a zero-dim amax() crash if it slips through.
+                    if num_ctx_blocks > block_tables.shape[1]:
+                        print(
+                            f"[paged_attn WARNING] seq {i}: num_ctx_blocks={num_ctx_blocks} "
+                            f"> block_tables.shape[1]={block_tables.shape[1]}, ctx_len={ctx_len}. "
+                            "Block table is undersized (prefix_cache_hit bug). "
+                            "Capping context to available blocks — attention may be incorrect.",
+                            file=sys.stderr, flush=True)
+                        num_ctx_blocks = block_tables.shape[1]
                    for tile_blk in range(0, num_ctx_blocks, _BLOCKS_PER_TILE):
                        blk_end = min(tile_blk + _BLOCKS_PER_TILE, num_ctx_blocks)
                        blk_ids = block_tables[i, tile_blk:blk_end]