some modifications to ensure 50K context input

2026-06-04 17:56:29 +08:00
parent 1c33ef1355
commit 8c047a70ea
3 changed files with 150 additions and 0 deletions
--- a/qwen3_6_scripts/paged_attn.py
+++ b/qwen3_6_scripts/paged_attn.py
@@ -85,6 +85,85 @@ class PagedAttention:
            v_scale,
        )

+    @staticmethod
+    def _forward_decode_pytorch(
+        query: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        block_tables: torch.Tensor,
+        seq_lens: torch.Tensor,
+        scale: float,
+    ) -> torch.Tensor:
+        """Pure-PyTorch decode attention for long contexts (no hardware kernel).
+
+        Fallback for paged_attention_v1, which hangs on BI-V100 when
+        max_seq_len > ~32K due to shared memory limits. Decode has q_len=1 per
+        sequence, so no Q-tiling is needed. Query heads are grouped by KV head
+        (GQA) and batched, so K/V are never replicated per query head. ALiBi,
+        KV-cache scaling and block-sparse attention are not applied here.
+
+        Shapes
+        ------
+        query       : [num_seqs, num_heads, head_dim]
+        key_cache   : [num_blocks, num_kv_heads, head_dim//x, block_size, x]
+        value_cache : [num_blocks, num_kv_heads, head_dim,    block_size]
+        block_tables: [num_seqs, max_blocks_per_seq]
+        seq_lens    : [num_seqs]
+
+        Returns a tensor like ``query``; rows with seq_len <= 0 are zero-filled.
+        Any exception is reported on stderr and then re-raised unchanged.
+        """
+        _, num_heads, head_dim = query.shape
+        num_kv_heads = key_cache.shape[1]
+        block_size = value_cache.shape[3]
+        gqa_ratio = num_heads // num_kv_heads
+        orig_dtype = query.dtype
+        output = torch.empty_like(query)
+
+        try:
+            # One host sync for all lengths instead of one .item() per sequence.
+            for i, seq_len in enumerate(map(int, seq_lens.tolist())):
+                if seq_len <= 0:
+                    output[i].zero_()  # nothing to attend to
+                    continue
+                num_blocks = (seq_len + block_size - 1) // block_size
+                blk_ids = block_tables[i, :num_blocks]
+                num_slots = blk_ids.shape[0] * block_size
+
+                # K^T per KV head: [num_kv_heads, head_dim, seq_len]
+                k_t = (key_cache[blk_ids]
+                       .permute(1, 2, 4, 0, 3)
+                       .reshape(num_kv_heads, head_dim, num_slots))
+                k_t = k_t[:, :, :seq_len].float()
+
+                # V per KV head: [num_kv_heads, seq_len, head_dim]
+                v_t = (value_cache[blk_ids]
+                       .permute(1, 0, 3, 2)
+                       .reshape(num_kv_heads, num_slots, head_dim))
+                v_t = v_t[:, :seq_len].float()
+
+                # q: [kv_heads, gqa, head_dim]; head h uses KV head h // gqa_ratio
+                q_i = (query[i].float() * scale).reshape(
+                    num_kv_heads, gqa_ratio, head_dim)
+                attn_w = torch.softmax(torch.matmul(q_i, k_t), dim=-1)
+                out_i = torch.matmul(attn_w, v_t)  # [kv_heads, gqa, head_dim]
+                output[i] = out_i.reshape(num_heads, head_dim).to(orig_dtype)
+        except Exception as e:
+            import sys
+            import traceback
+            print(f"[decode_pytorch ERROR] {type(e).__name__}: {e}",
+                  file=sys.stderr, flush=True)
+            traceback.print_exc(file=sys.stderr)
+            raise
+
+        return output
+
+    # max_seq_len above which forward_decode uses _forward_decode_pytorch (guard
+    # against paged_attention_v1 shared-memory hangs on BI-V100 past ~32K).
+    # 10M effectively disables the fallback for now: 50K decode was confirmed
+    # working on the hardware kernel. Lower to 32768 if hangs appear again.
+    _PYTORCH_DECODE_THRESHOLD = 10_000_000
+
    @staticmethod
    def forward_decode(
        query: torch.Tensor,
@@ -105,6 +184,10 @@ class PagedAttention:
        blocksparse_block_size: int = 64,
        blocksparse_head_sliding_step: int = 0,
    ) -> torch.Tensor:
+        if max_seq_len > PagedAttention._PYTORCH_DECODE_THRESHOLD:
+            return PagedAttention._forward_decode_pytorch(
+                query, key_cache, value_cache, block_tables, seq_lens, scale)
+
        if blocksparse_vert_stride is not None and blocksparse_vert_stride > 1:
            # use blocksparse paged attention
            block_size = value_cache.size(-1)
--- a/qwen3_6_scripts/patch_xformers_sdpa_seq.py
+++ b/qwen3_6_scripts/patch_xformers_sdpa_seq.py
@@ -24,6 +24,12 @@ flash attention kernel（ixformer / cudnnFlashAttnForward）。
  max-model-len=8192  → 峰值 ~800 MB
  max-model-len=16384 → 峰值 ~3.2 GB

+额外 patch（arg_utils.py）：
+  vllm 0.6.3 在 max_model_len > 32K 时会自动开启 chunked prefill（无命令行
+  关闭选项），原意是防止 profiling OOM。但 _run_sdpa_fallback 已通过 Q-tiling
+  解决了该问题，chunked prefill 反而会把推理路径从 _run_sdpa_fallback 切换到
+  _forward_prefix_pytorch，属于不必要的行为变更，因此一并禁用该自动逻辑。
+
 Deploy:
  python3 modified_scripts/patch_xformers_sdpa_seq.py
 """
@@ -33,6 +39,33 @@ XFORMERS_PATH = (
    "vllm/attention/backends/xformers.py"
 )

+ARG_UTILS_PATH = (
+    "/usr/local/corex/lib64/python3/dist-packages/"
+    "vllm/engine/arg_utils.py"
+)
+
+# Original vllm 0.6.3 block that auto-enables chunked prefill (patch target)
+_ARG_OLD_BLOCK = """\
+                if (is_gpu and not use_sliding_window and not use_spec_decode
+                        and not self.enable_lora
+                        and not self.enable_prompt_adapter):
+                    self.enable_chunked_prefill = True
+                    logger.warning(
+                        "Chunked prefill is enabled by default for models with "
+                        "max_model_len > 32K. Currently, chunked prefill might "
+                        "not work with some features or models. If you "
+                        "encounter any issues, please disable chunked prefill "
+                        "by setting --enable-chunked-prefill=False.")\
+"""
+
+_ARG_NEW_BLOCK = """\
+                if (is_gpu and not use_sliding_window and not use_spec_decode
+                        and not self.enable_lora
+                        and not self.enable_prompt_adapter):
+                    pass  # skip auto-enable: Q-tiling in _run_sdpa_fallback
+                          # handles long-context memory without chunked prefill\
+"""
+
 FALLBACK_METHOD = '''
    def _run_sdpa_fallback(
        self,
@@ -203,10 +236,35 @@ def patch_file(path):
        print(f"  Written: {path}")


+def patch_arg_utils(path):
+    """Swap ``_ARG_OLD_BLOCK`` for ``_ARG_NEW_BLOCK`` in the file at *path*.
+
+    Idempotent via the marker comment; reads and writes the file as UTF-8.
+    """
+    with open(path, "r", encoding="utf-8") as f:
+        content = f.read()
+    if "skip auto-enable: Q-tiling" in content:
+        print("  [skip] chunked-prefill auto-enable already disabled")
+        return
+    if _ARG_OLD_BLOCK not in content:
+        print("  [warn] target block not found — check arg_utils.py version")
+        return
+    content = content.replace(_ARG_OLD_BLOCK, _ARG_NEW_BLOCK, 1)
+    print("  [ok]   disabled chunked-prefill auto-enable for 32K+")
+    with open(path, "w", encoding="utf-8") as f:
+        f.write(content)
+    print(f"  Written: {path}")
+
+
 def main():
    print("=== patch_xformers_sdpa_seq (sequential, pure-math) ===")
    print(f"Target: {XFORMERS_PATH}")
    patch_file(XFORMERS_PATH)
+
+    print("\n=== patch_arg_utils (disable chunked-prefill auto-enable) ===")
+    print(f"Target: {ARG_UTILS_PATH}")
+    patch_arg_utils(ARG_UTILS_PATH)
+
    print("\nDone.")


--- a/qwen3_6_scripts/qwen3_5.py
+++ b/qwen3_6_scripts/qwen3_5.py
@@ -412,6 +412,9 @@ class GatedDeltaNet(nn.Module):

        else:
            # Decode: one token per sequence
+            with open("/tmp/vllm_decode_debug.log", "a") as _f:
+                _f.write(f"[deltanet decode] layer={self.layer_idx} num_seqs={hidden_states.shape[0]}\n")
+                _f.flush()
            num_seqs = hidden_states.shape[0]
            weight_2d = self.conv1d_weight.squeeze(1)

@@ -847,6 +850,12 @@ class Qwen3_5ForCausalLM(nn.Module, HasInnerState, SupportsLoRA):
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
+        # Non-driver TP ranks have seq_groups=None in sampling_metadata (normal
+        # TP behavior); they must still call logits_processor to participate in
+        # the NCCL gather inside lm_head. logits_processor returns None for
+        # non-driver ranks after the gather, safely skipping _apply_logits_processors.
+        # Rank 0 (driver) always has seq_groups != None given
+        # --max-num-batched-tokens >= --max-model-len (no chunked-prefill splits).
        return self.logits_processor(self.lm_head, hidden_states,
                                     sampling_metadata)