Files
enginex-vllm-bi100-qwen36/qwen3_6_scripts/patch_ops.sh

53 lines
3.3 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# BI-V100 patch script for Qwen3.6-27B (Qwen3_5 architecture)
#
# Triton situation on BI-V100:
# - Standard Triton 2.3.1 is already present in the image.
# - HAS_TRITON = False (hardcoded in vendor vllm), but Triton is still used
# for TP-mode cache management (custom_cache_manager / libentry).
# - The vendor's triton_utils/__init__.py, custom_cache_manager.py, libentry.py
# are already correct for standard Triton 2.3.1 — do NOT overwrite them.
# - DO NOT install BI-V150 corex Triton 2.1.0 (pkgs/triton): that causes
# GPU hang on BI-V100 because the Triton CUDA PTX kernels are incompatible.
#
# Important Note: Qwen3.6-27B must apply TP=4,PP=2 combination in order to deploy using 8 GPUs
#
# Recommended server start command for TP=4, context length: 50K, no chunked prefill mechanism:
# CUDA_VISIBLE_DEVICES="4,5,6,7" VLLM_ENGINE_ITERATION_TIMEOUT_S=3600 python3 -m vllm.entrypoints.openai.api_server \
# --model /workspace/models/Qwen3.6-27B --port 1111 --served-model-name llm \
# --max-model-len 50000 --enforce-eager --trust-remote-code -tp 4 --gpu-memory-utilization 0.90 \
# --max-num-seqs 1 --disable-log-requests --disable-frontend-multiprocessing \
# --max-num-batched-tokens 50000
# Recommended server start command for TP=4 support 100K, need chunked prefill
# CUDA_VISIBLE_DEVICES="4,5,6,7" VLLM_ENGINE_ITERATION_TIMEOUT_S=3600 python3 -m vllm.entrypoints.openai.api_server \
# --model /workspace/models/Qwen3.6-27B --port 1111 --served-model-name llm \
# --max-model-len 100000 --enforce-eager --trust-remote-code -tp 8 --gpu-memory-utilization 0.95 \
# --max-num-seqs 1 --disable-log-requests --disable-frontend-multiprocessing \
# --max-num-batched-tokens 4096 --enable-chunked-prefill
# --- paged_attn.py: replace forward_prefix with pure-PyTorch fallback -------
# The Triton context_attention_fwd kernel hangs BI-V100 GPUs permanently
# (standard Triton 2.3.1 PTX is not supported by the corex runtime either).
# Our paged_attn.py bypasses it entirely via _forward_prefix_pytorch, which
# also implements query-chunking (_ATTN_Q_CHUNK=256) to keep peak attention
# memory at O(256 × kv_len) instead of O(q_len × kv_len).
cp ./paged_attn.py /usr/local/corex/lib64/python3/dist-packages/vllm/attention/ops/paged_attn.py
# --- transformers: Qwen3_5 tokenizer / model files --------------------------
pip install transformers==4.55.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
cp -r ./qwen3_5 /usr/local/lib/python3.10/site-packages/transformers/models/
python3 ./patch_transformers_qwen3_5.py
# --- vllm model: Qwen3.6-27B (Qwen3_5 arch) --------------------------------
cp ./mamba_cache.py /usr/local/corex/lib/python3/dist-packages/vllm/model_executor/models/
cp ./qwen3_5.py /usr/local/corex/lib/python3/dist-packages/vllm/model_executor/models/qwen3_5.py
python3 ./patch_vllm_qwen3_5.py
# --- xformers: bypass cudnnFlashAttnForward (head_dim=256 > 128 limit) ------
# Injects _run_sdpa_fallback (pure matmul+softmax) into xformers.py.
# Required because head_dim=256 > 128 and ixformer flash attention either
# crashes (is_causal=True) or produces wrong output (attn_mask path).
# The fallback uses query_start_loc to derive actual query lengths, so it
# works correctly during profiling runs with chunked-prefill-style batches.
python3 ./patch_xformers_sdpa_seq.py