fix completion token statistic bug when input context is large

2026-06-08 15:04:34 +08:00
parent c2de1c83b0
commit d972854fb7
2 changed files with 1393 additions and 1 deletion
--- a/qwen3_6_scripts/patch_ops.sh
+++ b/qwen3_6_scripts/patch_ops.sh
@@ -24,7 +24,7 @@
 # Our paged_attn.py bypasses it entirely via _forward_prefix_pytorch, which
 # uses K-tiling, and also provides _forward_decode_pytorch to bypass the kernel
 # when the context length is high.
-cp ./paged_attn.py /usr/local/corex/lib64/python3/dist-packages/vllm/attention/ops/paged_attn.py
+cp ./paged_attn.py /usr/local/corex/lib/python3/dist-packages/vllm/attention/ops/paged_attn.py
 # --- transformers: Qwen3_5 tokenizer / model files --------------------------
 pip install transformers==4.55.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
@@ -36,6 +36,13 @@ cp ./mamba_cache.py /usr/local/corex/lib/python3/dist-packages/vllm/model_execut
 cp ./qwen3_5.py /usr/local/corex/lib/python3/dist-packages/vllm/model_executor/models/qwen3_5.py
 python3 ./patch_vllm_qwen3_5.py
+# --- sequence.py: fix completion_tokens inflation under chunked prefill ------
+# Bug: get_output_token_ids_to_return(delta=True) with num_new_tokens=0
+# returns _cached_all_token_ids[-0:] == [0:] (the ENTIRE prompt+output list).
+# Each prefill chunk step adds prompt_len to previous_num_tokens, so a 10K
+# prompt processed in 3 chunks inflates completion_tokens by ~30K.
+cp ./sequence.py /usr/local/corex/lib/python3/dist-packages/vllm/sequence.py
 # --- xformers: bypass cudnnFlashAttnForward (head_dim=256 > 128 limit) ------
 # Injects _run_sdpa_fallback (pure matmul+softmax) into xformers.py.
 # Required because head_dim=256 > 128 and ixformer flash attention either
--- a/qwen3_6_scripts/sequence.py
+++ b/qwen3_6_scripts/sequence.py