diff --git a/Dockerfile b/Dockerfile
index 33d8f83..e2c5aa5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,5 @@
 FROM cr.metax-tech.com/public-ai-release/maca/vllm:maca.ai3.0.0.5-torch2.6-py310-ubuntu22.04-amd64
+RUN /opt/conda/bin/pip install --no-cache-dir --upgrade transformers
 
 COPY vllm/ /opt/conda/lib/python3.10/site-packages/vllm/
 COPY code_generator.py /opt/conda/lib/python3.10/site-packages/triton/compiler/code_generator.py
diff --git a/README.md b/README.md
index 82fecd6..2e116d0 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # metax-c500-vllm
 
-This project contains upgrades to the stock vllm that let it run gpt-oss on the MetaX C500 chip
+This project contains upgrades to the stock vllm that let it run gpt-oss, qwen3-omni on the MetaX C500 chip
 
 The main contents of the Dockerfile provided in this project are:
 1. Copy the `vllm` directory over `/opt/conda/lib/python3.10/site-packages/vllm` in the image. When running `gpt-oss`, `VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1` must be set
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 55a2d27..57c3582 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -54,6 +54,14 @@ def check_xformers_availability():
     return USE_XFORMERS_OPS
 
 
+def check_upstream_fa_availability(dtype: torch.dtype):
+    if dtype in (torch.float16, torch.bfloat16) and current_platform.is_cuda(
+    ) and current_platform.has_device_capability(80):
+        from transformers.utils import is_flash_attn_2_available
+        return is_flash_attn_2_available()
+    return False
+
+
 class Attention(nn.Module):
     """Attention layer.
 
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 95c806c..b5f9a0e 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -531,7 +531,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                 return ""
             if model_type in ("mllama", "llama4"):
                 return "<|image|>"
-            if model_type in ("qwen2_vl", "qwen2_5_vl"):
+            if model_type in ("qwen2_vl", "qwen2_5_vl", "qwen3_omni_moe"):
                 return "<|vision_start|><|image_pad|><|vision_end|>"
             if model_type == "qwen2_5_omni":
                 return "<|vision_start|><|IMAGE|><|vision_end|>"
@@ -553,13 +553,15 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
             if model_type in ("qwen2_audio", "qwen2_5_omni"):
                 return (f"Audio {current_count}: "
                         f"<|audio_bos|><|AUDIO|><|audio_eos|>")
+            if model_type == "qwen3_omni_moe":
+                return f"<|audio_start|><|audio_pad|><|audio_end|>"
             if model_type == "minicpmo":
                 return "(<audio>./</audio>)"
             raise TypeError(f"Unknown model type: {model_type}")
         elif modality == "video":
             if model_type == "internvl_chat":
                 return "<video>"
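
For context beyond the patch itself, a minimal sketch of how the new `check_upstream_fa_availability` helper could gate a backend choice. `pick_backend` and its return values are illustrative assumptions, not code from this repository; the import only resolves once the patched `vllm/attention/layer.py` is in place.

```python
# Illustrative sketch only: pick_backend is a hypothetical helper, not part
# of this patch. It shows what the new availability check gates: upstream
# flash-attn requires fp16/bf16 on a CUDA device with compute capability >= 8.0.
import torch

from vllm.attention.layer import check_upstream_fa_availability


def pick_backend(dtype: torch.dtype) -> str:
    if check_upstream_fa_availability(dtype):
        return "FLASH_ATTN"
    # Fall back to the Triton backend the README recommends for gpt-oss
    # on the C500 (VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1).
    return "TRITON_ATTN_VLLM_V1"


if __name__ == "__main__":
    print(pick_backend(torch.bfloat16))
```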
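
Similarly, a small illustration of the placeholder strings the chat_utils.py change emits for `qwen3_omni_moe`. The manual concatenation below is a simplification for demonstration; in vllm these placeholders are spliced into the prompt by the chat template machinery around `BaseMultiModalItemTracker`.

```python
# Demonstration only: these literals mirror the placeholders added for
# qwen3_omni_moe in chat_utils.py; real prompt assembly happens inside vllm.
image_ph = "<|vision_start|><|image_pad|><|vision_end|>"  # one image slot
audio_ph = "<|audio_start|><|audio_pad|><|audio_end|>"    # one audio slot

user_text = "Describe the image, then transcribe the audio."
prompt = f"{image_ph}{audio_ph}{user_text}"
print(prompt)
```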