diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml index a99d812..24c087d 100644 --- a/.github/workflows/format_pr_body.yaml +++ b/.github/workflows/format_pr_body.yaml @@ -36,7 +36,7 @@ jobs: - name: Get vLLM version run: | - VLLM_COMMIT=17b4c6685ce62d5652654784d6771a3d38e4273e + VLLM_COMMIT=releases/v0.11.0 echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV - name: Checkout repository diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index a45d676..7470694 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -42,7 +42,7 @@ jobs: lint: uses: ./.github/workflows/pre-commit.yml with: - vllm: 17b4c6685ce62d5652654784d6771a3d38e4273e + vllm: releases/v0.11.0 changes: runs-on: ubuntu-latest @@ -83,7 +83,7 @@ jobs: VLLM_USE_MODELSCOPE: True strategy: matrix: - vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2] + vllm_version: [releases/v0.11.0, v0.10.2] steps: - name: Install packages run: | @@ -138,7 +138,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2] + vllm_version: [releases/v0.11.0, v0.10.2] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/vllm_ascend_test_full.yaml b/.github/workflows/vllm_ascend_test_full.yaml index 6306032..5f5089d 100644 --- a/.github/workflows/vllm_ascend_test_full.yaml +++ b/.github/workflows/vllm_ascend_test_full.yaml @@ -68,7 +68,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [17b4c6685ce62d5652654784d6771a3d38e4273e, v0.10.2] + vllm_version: [releases/v0.11.0, v0.10.2] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py index 4497104..a3e53c0 100644 --- a/vllm_ascend/models/qwen2_5_vl.py +++ b/vllm_ascend/models/qwen2_5_vl.py @@ -42,6 +42,8 @@ from vllm.model_executor.models.qwen2_5_vl import ( from vllm.model_executor.models.utils import maybe_prefix from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm_ascend.utils import vllm_version_is + MIN_PAD_SIZE = 64 # min_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight @@ -496,12 +498,20 @@ class AscendQwen2_5_VLForConditionalGeneration( super().__init__(vllm_config=vllm_config, prefix=prefix) config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - self.visual = AscendQwen2_5_VisionTransformer( - vision_config=config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=self._maybe_ignore_quant_config(quant_config), - prefix=maybe_prefix(prefix, "visual"), - ) + if vllm_version_is("0.10.2"): + self.visual = AscendQwen2_5_VisionTransformer( + vision_config=config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config(quant_config), + prefix=maybe_prefix(prefix, "visual"), + ) + else: + self.visual = AscendQwen2_5_VisionTransformer( + vision_config=config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, "visual"), + ) def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]: diff --git a/vllm_ascend/models/qwen2_5_vl_without_padding.py b/vllm_ascend/models/qwen2_5_vl_without_padding.py index d3fac4c..54f5947 100644 --- a/vllm_ascend/models/qwen2_5_vl_without_padding.py +++ b/vllm_ascend/models/qwen2_5_vl_without_padding.py @@ -68,6 +68,7 @@ from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix from vllm.multimodal import MULTIMODAL_REGISTRY from vllm_ascend.models.qwen2_5_vl import AscendQwen2_5_VisionRotaryEmbedding +from vllm_ascend.utils import vllm_version_is class AscendQwen2_5_VisionAttention_Without_Padding(Qwen2_5_VisionAttention): @@ -483,12 +484,20 @@ class AscendQwen2_5_VLForConditionalGeneration_Without_Padding( super().__init__(vllm_config=vllm_config, prefix=prefix) config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - self.visual = AscendQwen2_5_VisionTransformer_Without_Padding( - vision_config=config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=self._maybe_ignore_quant_config(quant_config), - prefix=maybe_prefix(prefix, "visual"), - ) + if vllm_version_is("0.10.2"): + self.visual = AscendQwen2_5_VisionTransformer_Without_Padding( + vision_config=config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config(quant_config), + prefix=maybe_prefix(prefix, "visual"), + ) + else: + self.visual = AscendQwen2_5_VisionTransformer_Without_Padding( + vision_config=config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, "visual"), + ) def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]: @@ -554,12 +563,20 @@ class AscendQwen3VLForConditionalGeneration(Qwen3VLForConditionalGeneration): super().__init__(vllm_config=vllm_config, prefix=prefix) config: Qwen3VLConfig = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - self.visual = AscendQwen3_VisionTransformer( - config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=self._maybe_ignore_quant_config(quant_config), - prefix=maybe_prefix(prefix, "visual"), - use_data_parallel=self.use_data_parallel) + if vllm_version_is("0.10.2"): + self.visual = AscendQwen3_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config(quant_config), + prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel) + else: + self.visual = AscendQwen3_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel) @MULTIMODAL_REGISTRY.register_processor(Qwen3VLMultiModalProcessor, @@ -596,11 +613,19 @@ class AscendQwen3VLMoeForConditionalGeneration( multimodal_config = vllm_config.model_config.multimodal_config self.multimodal_config = multimodal_config self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" - - self.visual = AscendQwen3_VisionTransformer( - config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=self._maybe_ignore_quant_config(quant_config), - prefix=maybe_prefix(prefix, "visual"), - use_data_parallel=self.use_data_parallel, - ) \ No newline at end of file + if vllm_version_is("0.10.2"): + self.visual = AscendQwen3_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config(quant_config), + prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel, + ) + else: + self.visual = AscendQwen3_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel, + ) diff --git a/vllm_ascend/models/qwen2_vl.py b/vllm_ascend/models/qwen2_vl.py index a677b06..0eea6f8 100644 --- a/vllm_ascend/models/qwen2_vl.py +++ b/vllm_ascend/models/qwen2_vl.py @@ -40,6 +40,8 @@ from vllm.model_executor.models.qwen2_vl import ( from vllm.model_executor.models.utils import maybe_prefix from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm_ascend.utils import vllm_version_is + MIN_PAD_SIZE = 64 # min_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight @@ -343,10 +345,18 @@ class AscendQwen2VLForConditionalGeneration(Qwen2VLForConditionalGeneration): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) - self.visual = AscendQwen2VisionTransformer( - self.config.vision_config, - norm_eps=getattr(self.config, "rms_norm_eps", 1e-6), - quant_config=self._maybe_ignore_quant_config( - vllm_config.quant_config), - prefix=maybe_prefix(prefix, "visual"), - ) \ No newline at end of file + if vllm_version_is("0.10.2"): + self.visual = AscendQwen2VisionTransformer( + self.config.vision_config, + norm_eps=getattr(self.config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config( + vllm_config.quant_config), + prefix=maybe_prefix(prefix, "visual"), + ) + else: + self.visual = AscendQwen2VisionTransformer( + self.config.vision_config, + norm_eps=getattr(self.config, "rms_norm_eps", 1e-6), + quant_config=self.vllm_config.quant_config, + prefix=maybe_prefix(prefix, "visual"), + )