Utilize chunked prefill + K-tiling techniques to ensure 100K context
This commit is contained in:
@@ -10,18 +10,11 @@
|
||||
# GPU hang on BI-V100 because the Triton CUDA PTX kernels are incompatible.
|
||||
#
|
||||
# Important note: to deploy Qwen3.6-27B on 8 GPUs, it must use the TP=4, PP=2 combination.
|
||||
#
|
||||
# Recommended server start command for TP=4, context length: 50K, no chunked prefill mechanism:
|
||||
# CUDA_VISIBLE_DEVICES="4,5,6,7" VLLM_ENGINE_ITERATION_TIMEOUT_S=3600 python3 -m vllm.entrypoints.openai.api_server \
|
||||
# --model /workspace/models/Qwen3.6-27B --port 1111 --served-model-name llm \
|
||||
# --max-model-len 50000 --enforce-eager --trust-remote-code -tp 4 --gpu-memory-utilization 0.90 \
|
||||
# --max-num-seqs 1 --disable-log-requests --disable-frontend-multiprocessing \
|
||||
# --max-num-batched-tokens 50000
|
||||
|
||||
# Recommended server start command for TP=4, context length: 100K (requires chunked prefill):
|
||||
# CUDA_VISIBLE_DEVICES="4,5,6,7" VLLM_ENGINE_ITERATION_TIMEOUT_S=3600 python3 -m vllm.entrypoints.openai.api_server \
|
||||
# --model /workspace/models/Qwen3.6-27B --port 1111 --served-model-name llm \
|
||||
# --max-model-len 100000 --enforce-eager --trust-remote-code -tp 8 --gpu-memory-utilization 0.95 \
|
||||
# --max-model-len 100000 --enforce-eager --trust-remote-code -tp 4 --gpu-memory-utilization 0.95 \
|
||||
# --max-num-seqs 1 --disable-log-requests --disable-frontend-multiprocessing \
|
||||
# --max-num-batched-tokens 4096 --enable-chunked-prefill
|
||||
|
||||
@@ -29,8 +22,8 @@
|
||||
# The Triton context_attention_fwd kernel hangs BI-V100 GPUs permanently
|
||||
# (standard Triton 2.3.1 PTX is not supported by the corex runtime either).
|
||||
# Our paged_attn.py bypasses it entirely via _forward_prefix_pytorch, which
|
||||
# also implements query-chunking (_ATTN_Q_CHUNK=256) to keep peak attention
|
||||
# memory at O(256 × kv_len) instead of O(q_len × kv_len).
|
||||
# utilizes K-tiling techniques; there is also _forward_decode_pytorch to bypass the kernel
|
||||
# when the context length is high.
|
||||
cp ./paged_attn.py /usr/local/corex/lib64/python3/dist-packages/vllm/attention/ops/paged_attn.py
|
||||
|
||||
# --- transformers: Qwen3_5 tokenizer / model files --------------------------
|
||||
@@ -49,4 +42,5 @@ python3 ./patch_vllm_qwen3_5.py
|
||||
# crashes (is_causal=True) or produces wrong output (attn_mask path).
|
||||
# The fallback uses query_start_loc to derive actual query lengths, so it
|
||||
# works correctly during profiling runs with chunked-prefill-style batches.
|
||||
# It also bypasses that path when auto chunked prefill is on (NOTE(review): confirm intent).
|
||||
python3 ./patch_xformers_sdpa_seq.py
|
||||
|
||||
Reference in New Issue
Block a user