# BI-V100 patch script for Qwen3.6-27B (Qwen3_5 architecture)
#
# Triton situation on BI-V100:
#   - Standard Triton 2.3.1 is already present in the image.
#   - HAS_TRITON = False (hardcoded in vendor vllm), but Triton is still used
#     for TP-mode cache management (custom_cache_manager / libentry).
#   - The vendor's triton_utils/__init__.py, custom_cache_manager.py, libentry.py
#     are already correct for standard Triton 2.3.1 — do NOT overwrite them.
#   - DO NOT install BI-V150 corex Triton 2.1.0 (pkgs/triton): that causes
#     a GPU hang on BI-V100 because the Triton CUDA PTX kernels are incompatible.
#
# Important note: to deploy Qwen3.6-27B on 8 GPUs, the TP=4, PP=2 combination
# must be used.
#
# Recommended server start command for TP=4, 50K context length, without
# chunked prefill:
#   CUDA_VISIBLE_DEVICES="4,5,6,7" VLLM_ENGINE_ITERATION_TIMEOUT_S=3600 python3 -m vllm.entrypoints.openai.api_server \
#     --model /workspace/models/Qwen3.6-27B --port 1111 --served-model-name llm \
#     --max-model-len 50000 --enforce-eager --trust-remote-code -tp 4 --gpu-memory-utilization 0.90 \
#     --max-num-seqs 1 --disable-log-requests --disable-frontend-multiprocessing \
#     --max-num-batched-tokens 50000
#
# Recommended server start command for TP=4, 100K context length (requires
# chunked prefill):
#   CUDA_VISIBLE_DEVICES="4,5,6,7" VLLM_ENGINE_ITERATION_TIMEOUT_S=3600 python3 -m vllm.entrypoints.openai.api_server \
#     --model /workspace/models/Qwen3.6-27B --port 1111 --served-model-name llm \
#     --max-model-len 100000 --enforce-eager --trust-remote-code -tp 4 --gpu-memory-utilization 0.95 \
#     --max-num-seqs 1 --disable-log-requests --disable-frontend-multiprocessing \
#     --max-num-batched-tokens 4096 --enable-chunked-prefill

# --- paged_attn.py: replace forward_prefix with pure-PyTorch fallback -------
# The Triton context_attention_fwd kernel hangs BI-V100 GPUs permanently
# (standard Triton 2.3.1 PTX is not supported by the corex runtime either).
# Our paged_attn.py bypasses that kernel entirely via _forward_prefix_pytorch,
# which also implements query-chunking (_ATTN_Q_CHUNK=256) to keep peak
# attention memory at O(256 × kv_len) instead of O(q_len × kv_len).

# Abort on the first failing command (-e) and on any unset variable (-u), so a
# failed copy, install, or patch is never silently followed by the remaining
# steps. All source paths below are relative (./...), so run this script from
# the directory that contains the patch files.
set -eu

# NOTE(review): this destination is under lib64/ while the vllm model files
# further down are copied under lib/ — confirm both resolve to the same vllm
# installation.
cp ./paged_attn.py /usr/local/corex/lib64/python3/dist-packages/vllm/attention/ops/paged_attn.py

# --- transformers: Qwen3_5 tokenizer / model files --------------------------
pip install transformers==4.55.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
cp -r ./qwen3_5 /usr/local/lib/python3.10/site-packages/transformers/models/
python3 ./patch_transformers_qwen3_5.py

# --- vllm model: Qwen3.6-27B (Qwen3_5 arch) --------------------------------
cp ./mamba_cache.py /usr/local/corex/lib/python3/dist-packages/vllm/model_executor/models/
cp ./qwen3_5.py /usr/local/corex/lib/python3/dist-packages/vllm/model_executor/models/qwen3_5.py
python3 ./patch_vllm_qwen3_5.py

# --- xformers: bypass cudnnFlashAttnForward (head_dim=256 > 128 limit) ------
# Injects _run_sdpa_fallback (pure matmul+softmax) into xformers.py.
# Required because head_dim=256 > 128 and ixformer flash attention either
# crashes (is_causal=True) or produces wrong output (attn_mask path).
# The fallback uses query_start_loc to derive actual query lengths, so it
# works correctly during profiling runs with chunked-prefill-style batches.
python3 ./patch_xformers_sdpa_seq.py