fix completion token statistic bug when input context is large

This commit is contained in:
2026-06-08 15:04:34 +08:00
parent c2de1c83b0
commit d972854fb7
2 changed files with 1393 additions and 1 deletions

View File

@@ -24,7 +24,7 @@
# Our paged_attn.py bypasses it entirely via _forward_prefix_pytorch, which # Our paged_attn.py bypasses it entirely via _forward_prefix_pytorch, which
# utilizes K-tiling techniques, and also has _forward_decode_pytorch to bypass the kernel # utilizes K-tiling techniques, and also has _forward_decode_pytorch to bypass the kernel
# when the context length is high # when the context length is high
cp ./paged_attn.py /usr/local/corex/lib64/python3/dist-packages/vllm/attention/ops/paged_attn.py cp ./paged_attn.py /usr/local/corex/lib/python3/dist-packages/vllm/attention/ops/paged_attn.py
# --- transformers: Qwen3_5 tokenizer / model files -------------------------- # --- transformers: Qwen3_5 tokenizer / model files --------------------------
pip install transformers==4.55.3 -i https://pypi.tuna.tsinghua.edu.cn/simple pip install transformers==4.55.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
@@ -36,6 +36,13 @@ cp ./mamba_cache.py /usr/local/corex/lib/python3/dist-packages/vllm/model_execut
cp ./qwen3_5.py /usr/local/corex/lib/python3/dist-packages/vllm/model_executor/models/qwen3_5.py cp ./qwen3_5.py /usr/local/corex/lib/python3/dist-packages/vllm/model_executor/models/qwen3_5.py
python3 ./patch_vllm_qwen3_5.py python3 ./patch_vllm_qwen3_5.py
# --- sequence.py: fix completion_tokens inflation under chunked prefill ------
# Bug: get_output_token_ids_to_return(delta=True) with num_new_tokens=0
# returns _cached_all_token_ids[-0:]. Because -0 == 0 in Python, that slice is
# [0:], i.e. the ENTIRE prompt+output list rather than an empty list.
# Each prefill chunk step therefore adds prompt_len to previous_num_tokens, so a
# 10K prompt processed in 3 chunks inflates completion_tokens by ~30K.
cp ./sequence.py /usr/local/corex/lib/python3/dist-packages/vllm/sequence.py
# --- xformers: bypass cudnnFlashAttnForward (head_dim=256 > 128 limit) ------ # --- xformers: bypass cudnnFlashAttnForward (head_dim=256 > 128 limit) ------
# Injects _run_sdpa_fallback (pure matmul+softmax) into xformers.py. # Injects _run_sdpa_fallback (pure matmul+softmax) into xformers.py.
# Required because head_dim=256 > 128 and ixformer flash attention either # Required because head_dim=256 > 128 and ixformer flash attention either

1385
qwen3_6_scripts/sequence.py Normal file

File diff suppressed because it is too large Load Diff