Fix completion-token statistics bug when input context is large
This commit is contained in:
@@ -24,7 +24,7 @@
|
|||||||
# Our paged_attn.py bypasses it entirely via _forward_prefix_pytorch, which
|
# Our paged_attn.py bypasses it entirely via _forward_prefix_pytorch, which
|
||||||
# utilizes K-tiling techniques, and also has _forward_decode_pytorch to bypass the kernel
|
# utilizes K-tiling techniques, and also has _forward_decode_pytorch to bypass the kernel
|
||||||
# when context length is high
|
# when context length is high
|
||||||
cp ./paged_attn.py /usr/local/corex/lib64/python3/dist-packages/vllm/attention/ops/paged_attn.py
|
cp ./paged_attn.py /usr/local/corex/lib/python3/dist-packages/vllm/attention/ops/paged_attn.py
|
||||||
|
|
||||||
# --- transformers: Qwen3_5 tokenizer / model files --------------------------
|
# --- transformers: Qwen3_5 tokenizer / model files --------------------------
|
||||||
pip install transformers==4.55.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
|
pip install transformers==4.55.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||||
@@ -36,6 +36,13 @@ cp ./mamba_cache.py /usr/local/corex/lib/python3/dist-packages/vllm/model_execut
|
|||||||
cp ./qwen3_5.py /usr/local/corex/lib/python3/dist-packages/vllm/model_executor/models/qwen3_5.py
|
cp ./qwen3_5.py /usr/local/corex/lib/python3/dist-packages/vllm/model_executor/models/qwen3_5.py
|
||||||
python3 ./patch_vllm_qwen3_5.py
|
python3 ./patch_vllm_qwen3_5.py
|
||||||
|
|
||||||
|
# --- sequence.py: fix completion_tokens inflation under chunked prefill ------
|
||||||
|
# Bug: get_output_token_ids_to_return(delta=True) with num_new_tokens=0
|
||||||
|
# returns _cached_all_token_ids[-0:], which is the same slice as [0:] (the ENTIRE prompt+output list).
|
||||||
|
# Each prefill chunk step adds prompt_len to previous_num_tokens, so a 10K
|
||||||
|
# prompt processed in 3 chunks inflates completion_tokens by ~30K.
|
||||||
|
cp ./sequence.py /usr/local/corex/lib/python3/dist-packages/vllm/sequence.py
|
||||||
|
|
||||||
# --- xformers: bypass cudnnFlashAttnForward (head_dim=256 > 128 limit) ------
|
# --- xformers: bypass cudnnFlashAttnForward (head_dim=256 > 128 limit) ------
|
||||||
# Injects _run_sdpa_fallback (pure matmul+softmax) into xformers.py.
|
# Injects _run_sdpa_fallback (pure matmul+softmax) into xformers.py.
|
||||||
# Required because head_dim=256 > 128 and ixformer flash attention either
|
# Required because head_dim=256 > 128 and ixformer flash attention either
|
||||||
|
|||||||
1385
qwen3_6_scripts/sequence.py
Normal file
1385
qwen3_6_scripts/sequence.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user