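Fix chunked-prefill prefix-cache crash; add GDN prefix-state cache, cached-token usage reporting, n > max_num_seqs guard, and engine abort on client disconnect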

2026-06-26 12:55:02 +08:00
parent 3d62430fd7
commit c84151eef9
9 changed files with 1879 additions and 5 deletions
--- a/computility-run.yaml
+++ b/computility-run.yaml
@@ -19,7 +19,7 @@ command:
    - --disable-log-requests
    - --disable-frontend-multiprocessing
    - --max-num-batched-tokens
-    - '4096'
+    - '8192'
    - --enable-chunked-prefill
    - --max-seq-len-to-capture
    - '32768'
--- a/qwen3_6_scripts/paged_attn.py
+++ b/qwen3_6_scripts/paged_attn.py
@@ -393,6 +393,20 @@ class PagedAttention:
                # --------------------------------------------------------------
                if ctx_len > 0:
                    num_ctx_blocks = (ctx_len + block_size - 1) // block_size
+                    # Safety: a block table narrower than num_ctx_blocks indicates the
+                    # prefix_cache_hit + chunked-prefill bug in model_runner.py
+                    # (Case 1 leaves prefix_cache_hit=True but block_table is
+                    # only computed_block_nums, not the full context blocks).
+                    # patch_model_runner.py fixes the root cause; this guard prevents a
+                    # zero-dim amax() crash. NOTE(review): confirm this module imports sys.
+                    if num_ctx_blocks > block_tables.shape[1]:
+                        print(
+                            f"[paged_attn WARNING] seq {i}: num_ctx_blocks={num_ctx_blocks} "
+                            f"> block_tables.shape[1]={block_tables.shape[1]}, ctx_len={ctx_len}. "
+                            "Block table is undersized (prefix_cache_hit bug). "
+                            "Capping context to available blocks — attention may be incorrect.",
+                            file=sys.stderr, flush=True)
+                        num_ctx_blocks = block_tables.shape[1]
                    for tile_blk in range(0, num_ctx_blocks, _BLOCKS_PER_TILE):
                        blk_end = min(tile_blk + _BLOCKS_PER_TILE, num_ctx_blocks)
                        blk_ids = block_tables[i, tile_blk:blk_end]
--- a/qwen3_6_scripts/patch_model_runner.py
+++ b/qwen3_6_scripts/patch_model_runner.py
@@ -0,0 +1,114 @@
+"""Clear ``prefix_cache_hit`` on chunked-prefill chunks past the cached prefix.
+
+Root cause:
+  model_runner.py _compute_for_prefix_cache_hit has three cases:
+    Case 1: prefix_cache_len <= context_len  → "already past cache, do normal"
+    Case 2: context_len < prefix_cache_len < seq_len  → partial hit, correct
+    Case 3: seq_len <= prefix_cache_len  → full hit, reduce to 1 token
+
+  Case 1 does nothing (leaves prefix_cache_hit = True). Then in utils.py:
+    if inter_data.prefix_cache_hit:
+        block_table = computed_block_nums   ← ONLY the original prefix blocks!
+
+  But context_len > prefix_cache_len means chunk 1 tokens (between prefix_cache_len
+  and context_len) are ALSO in KV cache and need to be in block_table.
+  block_table = computed_block_nums misses all chunk-1 blocks.
+
+  In _forward_prefix_pytorch:
+    num_ctx_blocks = ceil(context_len / block_size)  # e.g. 268
+    block_tables.shape[1] = len(computed_block_nums)  # e.g. 12  <-- too small!
+    At tile_blk >= 12: blk_ids is empty → k_t shape [..., 0] → amax crash.
+
+Fix:
+  Set prefix_cache_hit = False for Case 1, so utils.py falls through to:
+    elif chunked_prefill_enabled:
+        block_table = block_tables[seq_id]   ← full block table (prefix + chunk1)
+
+Usage:
+  python3 ./patch_model_runner.py
+
+Idempotent. Exits 0 when at least one installed model_runner.py is patched or
+was already patched, 1 otherwise.
+"""
+
+import os
+import re
+import sys
+
+CANDIDATE_PATHS = [
+    "/usr/local/corex/lib64/python3/dist-packages/vllm/worker/model_runner.py",
+    "/usr/local/corex/lib/python3/dist-packages/vllm/worker/model_runner.py",
+]
+
+OLD_BLOCK = """\
+        if prefix_cache_len <= context_len:
+            # We already passed the cache hit region,
+            # so do normal computation.
+            pass"""
+
+NEW_BLOCK = """\
+        if prefix_cache_len <= context_len:
+            # We already passed the cache hit region,
+            # so do normal computation.
+            # Must clear prefix_cache_hit so _add_seq_group uses the full
+            # block_tables (prefix + previous-chunk blocks) instead of only
+            # computed_block_nums (prefix only).  Without this, block_tables
+            # passed to _forward_prefix_pytorch is too narrow for context_len,
+            # causing an empty blk_ids slice and a zero-dim amax() crash.
+            inter_data.prefix_cache_hit = False"""
+
+
+def patch_file(path):
+    """Patch one model_runner.py in place.
+
+    Returns "patched", "already-patched", or "unrecognized" (file untouched).
+    """
+    # Explicit UTF-8 so the result does not depend on the container's locale.
+    with open(path, "r", encoding="utf-8") as f:
+        src = f.read()
+    if OLD_BLOCK in src:
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(src.replace(OLD_BLOCK, NEW_BLOCK, 1))
+        return "patched"
+    if NEW_BLOCK in src:
+        return "already-patched"
+    return "unrecognized"
+
+
+def main():
+    """Patch every installed copy of model_runner.py; return the exit status."""
+    found_any = False
+    fixed_any = False
+    seen = set()
+    for path in CANDIDATE_PATHS:
+        if not os.path.exists(path):
+            continue
+        found_any = True
+        # lib64 and lib may be the same tree via a symlink; handle each file once.
+        real_path = os.path.realpath(path)
+        if real_path in seen:
+            continue
+        seen.add(real_path)
+        status = patch_file(path)
+        if status == "patched":
+            print(f"[patch_model_runner] patched Case-1 prefix_cache_hit fix in: {path}")
+            fixed_any = True
+        elif status == "already-patched":
+            print(f"[patch_model_runner] already patched: {path}")
+            fixed_any = True
+        else:
+            print(f"[patch_model_runner] WARNING: expected block not found in {path}, skipping")
+
+    if fixed_any:
+        return 0
+    if found_any:
+        print("[patch_model_runner] ERROR: model_runner.py was found but does not contain "
+              "the expected Case-1 block; nothing was patched", file=sys.stderr)
+    else:
+        print("[patch_model_runner] ERROR: could not find model_runner.py at any known path",
+              file=sys.stderr)
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/qwen3_6_scripts/patch_ops.sh
+++ b/qwen3_6_scripts/patch_ops.sh
@@ -8,8 +8,6 @@
 #     are already correct for standard Triton 2.3.1 — do NOT overwrite them.
 #   - DO NOT install BI-V150 corex Triton 2.1.0 (pkgs/triton): that causes
 #     GPU hang on BI-V100 because the Triton CUDA PTX kernels are incompatible.
-#
-# Important Note: Qwen3.6-27B must apply TP=4,PP=2 combination in order to deploy using 8 GPUs

 # Recommended server start command for TP=4 support 100K, need chunked prefill
 # CUDA_VISIBLE_DEVICES="4,5,6,7" VLLM_ENGINE_ITERATION_TIMEOUT_S=3600 python3 -m vllm.entrypoints.openai.api_server \
@@ -17,6 +15,14 @@
 #     --max-model-len 100000 --enforce-eager --trust-remote-code -tp 4 --gpu-memory-utilization 0.95 \
 #     --max-num-seqs 1 --disable-log-requests --disable-frontend-multiprocessing \
 #     --max-num-batched-tokens 4096 --enable-chunked-prefill
+#
+# With prefix caching (GDN align-mode, requires chunked prefill):
+# CUDA_VISIBLE_DEVICES="4,5,6,7" VLLM_ENGINE_ITERATION_TIMEOUT_S=3600 python3 -m vllm.entrypoints.openai.api_server \
+#     --model /workspace/models/Qwen3.6-35B-A3B --port 1111 --served-model-name llm \
+#     --max-model-len 150000 --trust-remote-code -tp 4 --gpu-memory-utilization 0.90 \
+#     --max-num-seqs 1 --disable-log-requests --disable-frontend-multiprocessing \
+#     --max-num-batched-tokens 8192 --enable-chunked-prefill --enable-prefix-caching \
+#     --max-seq-len-to-capture 32768

 # --- paged_attn.py: replace forward_prefix with pure-PyTorch fallback -------
 # The Triton context_attention_fwd kernel hangs BI-V100 GPUs permanently
@@ -26,6 +32,15 @@
 # when context length is high
 cp ./paged_attn.py /usr/local/corex/lib/python3/dist-packages/vllm/attention/ops/paged_attn.py

+# --- model_runner.py: fix prefix_cache_hit staying True on chunked-prefill chunk 2+ ---
+# Bug: _compute_for_prefix_cache_hit Case 1 (prefix_cache_len <= context_len)
+# leaves prefix_cache_hit=True. _add_seq_group then uses block_table=computed_block_nums
+# (only the original prefix blocks), dropping the KV-cache blocks written by earlier chunks.
+# _forward_prefix_pytorch then gets an undersized block_tables and crashes with
+# "amax(): Expected reduction dim -1 to have non-zero size" on the first tile past the table end.
+# Fix: set prefix_cache_hit=False for Case 1 so the full block_tables is used.
+python3 ./patch_model_runner.py
+
 # --- transformers: Qwen3_5 tokenizer / model files --------------------------
 pip install transformers==4.55.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
 cp -r ./qwen3_5 /usr/local/lib/python3.10/site-packages/transformers/models/
@@ -42,8 +57,15 @@ python3 ./patch_vllm_qwen3_5.py
 # returns _cached_all_token_ids[-0:] == [0:] (the ENTIRE prompt+output list).
 # Each prefill chunk step adds prompt_len to previous_num_tokens, so a 10K
 # prompt processed in 3 chunks inflates completion_tokens by ~30K.
+# Also adds num_cached_tokens field to RequestMetrics for prefix-cache stats.
 cp ./sequence.py /usr/local/corex/lib/python3/dist-packages/vllm/sequence.py

+# --- scheduler.py: record num_cached_tokens in RequestMetrics ----------------
+# Sets seq_group.metrics.num_cached_tokens = prefix_cache_len on first prefill
+# when --enable-prefix-caching is active, so serving_chat.py can report it in
+# usage.prompt_tokens_details.cached_tokens (OpenAI-compatible API response).
+cp ./scheduler.py /usr/local/corex/lib/python3/dist-packages/vllm/core/scheduler.py
+
 # --- xformers: bypass cudnnFlashAttnForward (head_dim=256 > 128 limit) ------
 # Injects _run_sdpa_fallback (pure matmul+softmax) into xformers.py.
 # Required because head_dim=256 > 128 and ixformer flash attention either
--- a/qwen3_6_scripts/protocol.py
+++ b/qwen3_6_scripts/protocol.py
@@ -99,11 +99,16 @@ class ModelList(OpenAIBaseModel):
    data: List[ModelCard] = Field(default_factory=list)


+class PromptTokensDetails(OpenAIBaseModel):
+    cached_tokens: int = 0  # prompt tokens reported as served from the prefix cache
+
+
 class UsageInfo(OpenAIBaseModel):
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens: Optional[int] = 0
    reasoning_tokens: Optional[int] = None
+    prompt_tokens_details: Optional[PromptTokensDetails] = None  # None when no cached-token count is available


 class RequestResponseMetadata(BaseModel):
--- a/qwen3_6_scripts/qwen3_5.py
+++ b/qwen3_6_scripts/qwen3_5.py
@@ -2,7 +2,8 @@
 # Pure-PyTorch DeltaNet (no fla / causal_conv1d dependency).
 # Text-only (no VL, no MTP).

-from typing import Iterable, List, Optional, Tuple
+from collections import OrderedDict
+from typing import Dict, Iterable, List, Optional, Tuple

 import torch
 import torch.nn.functional as F
@@ -1033,6 +1034,15 @@ class Qwen3_5ForCausalLM(nn.Module, HasInnerState, SupportsLoRA):
        # Lazy initialised in first forward call
        self.mamba_cache: Optional[MambaCacheManager] = None

+        # GDN prefix state cache (align mode): stores (conv_states, temporal_states) snapshots
+        # at KV-block boundaries so that prefix-cache-hit requests can restore correct GDN state.
+        # Key: tuple of physical block IDs covering the cached prefix
+        # Value: (conv_states_cpu, temporal_states_cpu) each of shape (num_gdn_layers, ...)
+        self._gdn_prefix_cache: OrderedDict = OrderedDict()
+        self._gdn_prefix_cache_max: int = 16   # ~16 × 16 MB ≈ 256 MB CPU RAM
+        self._block_size: int = (cache_config.block_size
+                                  if cache_config is not None else 16)
+
    def _get_mamba_cache_shape(self):
        tp_size = get_tensor_model_parallel_world_size()
        # Each sequence's state is stored in float32
@@ -1069,9 +1079,69 @@ class Qwen3_5ForCausalLM(nn.Module, HasInnerState, SupportsLoRA):
        # temporal_states: (num_linear_layers, batch, local_num_v, k_dim, v_dim)
        conv_states, temporal_states = mamba_tensors

+        # ── GDN prefix-cache align mode: inject saved state on prefix hit ─────
+        # Conditions: prefill pass, batch=1, context_len > 0 (prefix cached or
+        # previous chunk already processed), block_tables available.
+        # We always attempt a lookup: for subsequent chunked-prefill chunks the
+        # key matches our own saved state (same data already in slot → no-op).
+        # For a true cross-request prefix hit the key matches a previous request.
+        _is_single_seq_prefill = (
+            attn_metadata is not None
+            and attn_metadata.num_prefill_tokens > 0
+            and conv_states.shape[1] == 1               # batch == 1
+            and getattr(attn_metadata, 'context_lens_tensor', None) is not None
+            and getattr(attn_metadata, 'block_tables', None) is not None
+            and attn_metadata.block_tables.numel() > 0
+        )
+        if _is_single_seq_prefill:
+            context_len = int(attn_metadata.context_lens_tensor[0].item())
+            if context_len > 0:
+                num_prefix_blocks = context_len // self._block_size
+                if (num_prefix_blocks > 0
+                        and attn_metadata.block_tables.shape[1] >= num_prefix_blocks):
+                    lookup_key = tuple(
+                        attn_metadata.block_tables[0, :num_prefix_blocks]
+                        .cpu().tolist())
+                    if lookup_key in self._gdn_prefix_cache:
+                        saved_conv, saved_temporal = self._gdn_prefix_cache[lookup_key]
+                        conv_states[:, 0].copy_(
+                            saved_conv.to(conv_states.device), non_blocking=True)
+                        temporal_states[:, 0].copy_(
+                            saved_temporal.to(temporal_states.device), non_blocking=True)
+                        self._gdn_prefix_cache.move_to_end(lookup_key)
+                        logger.debug("GDN prefix cache hit: prefix_len=%d blocks=%d",
+                                     context_len, num_prefix_blocks)
+        # ── End inject ──────────────────────────────────────────────────────────
+
        hidden_states = self.model(
            input_ids, positions, kv_caches, attn_metadata,
            conv_states, temporal_states)
+
+        # ── GDN prefix-cache align mode: save state after this prefill chunk ───
+        # Save state keyed by ALL complete KV blocks processed so far, for reuse by later requests.
+        # NOTE(review): if total_processed % block_size != 0, the state includes tokens past the key.
+        if _is_single_seq_prefill:
+            context_len = int(attn_metadata.context_lens_tensor[0].item())
+            query_len = attn_metadata.num_prefill_tokens
+            total_processed = context_len + query_len
+            num_complete_blocks = total_processed // self._block_size
+            if (num_complete_blocks > 0
+                    and attn_metadata.block_tables.shape[1] >= num_complete_blocks):
+                save_key = tuple(
+                    attn_metadata.block_tables[0, :num_complete_blocks]
+                    .cpu().tolist())
+                # Move to end (LRU: most recent = last) and update value
+                if save_key in self._gdn_prefix_cache:
+                    self._gdn_prefix_cache.move_to_end(save_key)
+                self._gdn_prefix_cache[save_key] = (
+                    conv_states[:, 0].cpu().clone(),
+                    temporal_states[:, 0].cpu().clone(),
+                )
+                # Evict oldest entries beyond max
+                while len(self._gdn_prefix_cache) > self._gdn_prefix_cache_max:
+                    self._gdn_prefix_cache.popitem(last=False)
+        # ── End save ────────────────────────────────────────────────────────────
+
        return hidden_states

    def compute_logits(
--- a/qwen3_6_scripts/scheduler.py
+++ b/qwen3_6_scripts/scheduler.py
--- a/qwen3_6_scripts/sequence.py
+++ b/qwen3_6_scripts/sequence.py
@@ -119,6 +119,7 @@ class RequestMetrics:
    scheduler_time: Optional[float] = None
    model_forward_time: Optional[float] = None
    model_execute_time: Optional[float] = None
+    num_cached_tokens: Optional[int] = None


 class SequenceDataDelta(
--- a/qwen3_6_scripts/serving_chat.py
+++ b/qwen3_6_scripts/serving_chat.py
@@ -25,7 +25,7 @@ from vllm.entrypoints.openai.protocol import (
    ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
    ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage,
    DeltaToolCall, ErrorResponse, FunctionCall, RequestResponseMetadata,
-    ToolCall, UsageInfo)
+    PromptTokensDetails, ToolCall, UsageInfo)
 from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
                                                    LoRAModulePath,
                                                    OpenAIServing,
@@ -179,6 +179,16 @@ class OpenAIServingChat(OpenAIServing):
            logger.exception("Error in loading multi-modal data")
            return self.create_error_response(str(e))

+        # n > max_num_seqs deadlock guard: scheduler uses break (not continue)
+        # when can_schedule(num_new_seqs=n) fails, so an n that exceeds
+        # max_num_seqs permanently blocks the entire waiting queue with no error.
+        _sched_cfg = await self.engine_client.get_scheduler_config()
+        _max_seqs = _sched_cfg.max_num_seqs
+        if request.n is not None and request.n > _max_seqs:
+            return self.create_error_response(
+                f"n={request.n} exceeds max_num_seqs={_max_seqs}. "
+                f"Use n<={_max_seqs} or omit n.")
+
        # validation for OpenAI tools
        # tool_choice = "required" is not supported
        if request.tool_choice == "required":
@@ -318,6 +328,7 @@ class OpenAIServingChat(OpenAIServing):
        previous_num_tokens = [0] * num_choices
        finish_reason_sent = [False] * num_choices
        num_prompt_tokens = 0
+        num_cached_tokens: Optional[int] = None

        if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam):
            tool_choice_function_name = request.tool_choice.function.name
@@ -385,6 +396,10 @@ class OpenAIServingChat(OpenAIServing):
                    num_prompt_tokens = len(res.prompt_token_ids)
                    if res.encoder_prompt_token_ids is not None:
                        num_prompt_tokens += len(res.encoder_prompt_token_ids)
+                if (num_cached_tokens is None
+                        and res.metrics is not None
+                        and res.metrics.num_cached_tokens is not None):
+                    num_cached_tokens = res.metrics.num_cached_tokens

                # We need to do it here, because if there are exceptions in
                # the result_generator, it needs to be sent as the FIRST
@@ -691,6 +706,9 @@ class OpenAIServingChat(OpenAIServing):
                    completion_tokens=completion_tokens,
                    total_tokens=num_prompt_tokens + completion_tokens,
                    reasoning_tokens=total_reasoning,
+                    prompt_tokens_details=(
+                        PromptTokensDetails(cached_tokens=num_cached_tokens)
+                        if num_cached_tokens is not None else None),
                )

                final_usage_chunk = ChatCompletionStreamResponse(
@@ -713,6 +731,10 @@ class OpenAIServingChat(OpenAIServing):
                total_tokens=num_prompt_tokens + num_completion_tokens,
                reasoning_tokens=total_reasoning)

+        except asyncio.CancelledError:
+            # Client disconnected; abort the engine request so GPU is freed.
+            await self.engine_client.abort(request_id)
+            return
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            logger.error("error in chat completion stream generator: %s", e)
@@ -739,6 +761,7 @@ class OpenAIServingChat(OpenAIServing):
            async for res in result_generator:
                final_res = res
        except asyncio.CancelledError:
+            await self.engine_client.abort(request_id)
            return self.create_error_response("Client disconnected")

        assert final_res is not None
@@ -881,11 +904,16 @@ class OpenAIServingChat(OpenAIServing):
            total_reasoning_tokens = sum(
                rp.count_reasoning_tokens(list(output.token_ids))
                for output in final_res.outputs)
+        num_cached_tokens = (final_res.metrics.num_cached_tokens
+                             if final_res.metrics is not None else None)
        usage = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=num_generated_tokens,
            total_tokens=num_prompt_tokens + num_generated_tokens,
            reasoning_tokens=total_reasoning_tokens,
+            prompt_tokens_details=(
+                PromptTokensDetails(cached_tokens=num_cached_tokens)
+                if num_cached_tokens is not None else None),
        )

        request_metadata.final_usage_info = usage