diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml
index 357a1e32..bbaba802 100644
--- a/.github/workflows/bot_pr_create.yaml
+++ b/.github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=4497431df654e46fb1fb5e64bf8611e762ae5d87
+          VLLM_COMMIT=8a680463fab3bc9e6760417cd5c0a6aa58283065
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

       - name: Checkout repository
diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint
index 9116b5a6..277c874e 100644
--- a/.github/workflows/dockerfiles/Dockerfile.lint
+++ b/.github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \

 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=4497431df654e46fb1fb5e64bf8611e762ae5d87
+ARG VLLM_COMMIT=8a680463fab3bc9e6760417cd5c0a6aa58283065
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
     cd /vllm-workspace/vllm && \
     git checkout $VLLM_COMMIT
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index b7f35825..76b5cbc8 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
+        vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 0ce86dfa..e607b0c2 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 4497431df654e46fb1fb5e64bf8611e762ae5d87
+      vllm: 8a680463fab3bc9e6760417cd5c0a6aa58283065
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
+        vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
+        vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml
index a50a9c16..74864db0 100644
--- a/.github/workflows/schedule_codecov_refresh.yaml
+++ b/.github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
       matrix:
-        vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87]
+        vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md
index 9bab96b9..386c7064 100644
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | 4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
+| main | 8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

 ## Release cadence
diff --git a/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py b/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py
index ec5ca3a4..6635f491 100644
--- a/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py
+++ b/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py
@@ -32,6 +32,12 @@ TENSOR_PARALLELS = [1]
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
 async def test_models(model: str, tp_size: int) -> None:
+    from vllm_ascend.utils import vllm_version_is
+
+    if not vllm_version_is("0.17.0"):
+        pytest.skip(
+            "EPLB output differs from the non-EPLB baseline, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
+        )
     encode_port = get_open_port()
     pd_port = get_open_port()
     vllm_server_args = [
diff --git a/tests/e2e/multicard/2-cards/test_qwen3_moe.py b/tests/e2e/multicard/2-cards/test_qwen3_moe.py
index 385b32e8..4ce5e33e 100644
--- a/tests/e2e/multicard/2-cards/test_qwen3_moe.py
+++ b/tests/e2e/multicard/2-cards/test_qwen3_moe.py
@@ -76,6 +76,12 @@ def test_qwen3_moe_distributed_aiv_tp2():

 @pytest.mark.asyncio
 async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
+    from vllm_ascend.utils import vllm_version_is
+
+    if not vllm_version_is("0.17.0"):
+        pytest.skip(
+            "EPLB output differs from the non-EPLB baseline, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408",
+        )
     model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
     port = get_open_port()
     compilation_config = json.dumps({"cudagraph_capture_sizes": [8]})
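Both skips above, and the version branches in the files that follow, are keyed on vllm_ascend.utils.vllm_version_is. As a rough mental model only (the real helper lives in vllm_ascend and may be implemented differently), the guard can be thought of as a string comparison against the installed vLLM release:

# Illustrative sketch only, assuming vllm_version_is() boils down to comparing
# the installed vLLM package version against an expected release string.
from importlib.metadata import version


def vllm_version_is(expected: str) -> bool:
    """Return True if the installed vLLM release matches the expected version."""
    return version("vllm") == expected


# Mirrors the gated tests above: only the pinned v0.17.0 release runs the
# EPLB comparison; the main-branch commit from the CI matrix skips it.
if not vllm_version_is("0.17.0"):
    print("would call pytest.skip(...) here")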
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
index 8dd63427..ebd6c5aa 100644
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -181,30 +181,47 @@ class AscendConfig:
                 stacklevel=2,
             )

+    @staticmethod
+    def _get_compile_ranges(compilation_config):
+        from vllm_ascend.utils import vllm_version_is
+
+        if vllm_version_is("0.17.0"):
+            return compilation_config.compile_ranges_split_points
+        else:
+            return compilation_config.compile_ranges_endpoints
+
+    @staticmethod
+    def _set_compile_ranges(compilation_config, value):
+        from vllm_ascend.utils import vllm_version_is
+
+        if vllm_version_is("0.17.0"):
+            compilation_config.compile_ranges_split_points = value
+        else:
+            compilation_config.compile_ranges_endpoints = value
+
     def update_compile_ranges_split_points(self):
         vllm_config = self.vllm_config
         if self.ascend_compilation_config.enable_npugraph_ex:
             if self.ascend_compilation_config.fuse_allreduce_rms:
                 from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THRESHOLD
-                new_compile_ranges_split_points = vllm_config.compilation_config.compile_ranges_split_points
+                new_compile_ranges_split_points = self._get_compile_ranges(vllm_config.compilation_config)
                 new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THRESHOLD)
                 new_compile_ranges_split_points = sorted(new_compile_ranges_split_points)
-                vllm_config.compilation_config.compile_ranges_split_points = new_compile_ranges_split_points
+                self._set_compile_ranges(vllm_config.compilation_config, new_compile_ranges_split_points)
                 logger.debug(
                     "set compile_ranges_split_points to "
                     "{new_compile_ranges_split_points} for matmul and allreduce fusion"
                 )
         else:
-            new_compile_ranges_split_points = vllm_config.compilation_config.compile_ranges_split_points
+            new_compile_ranges_split_points = self._get_compile_ranges(vllm_config.compilation_config)
             if vllm_config.additional_config.get("ascend_compilation_config", {}).get("fuse_allreduce_rms", True):
                 from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THRESHOLD
-                new_compile_ranges_split_points = vllm_config.compilation_config.compile_ranges_split_points
                 new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THRESHOLD)
                 new_compile_ranges_split_points = sorted(new_compile_ranges_split_points)
-                vllm_config.compilation_config.compile_ranges_split_points = new_compile_ranges_split_points
+                self._set_compile_ranges(vllm_config.compilation_config, new_compile_ranges_split_points)
                 logger.debug(
                     "set compile_ranges_split_points to "
                     "{new_compile_ranges_split_points} for matmul and allreduce fusion"
@@ -218,9 +235,9 @@ class AscendConfig:
             sp_threshold = get_sp_threshold(vllm_config)
             new_compile_ranges_split_points.append(sp_threshold)
             logger.debug(f"add {sp_threshold} to compile_ranges_split_points for sequence parallelism")
-        if len(new_compile_ranges_split_points) > len(vllm_config.compilation_config.compile_ranges_split_points):
+        if len(new_compile_ranges_split_points) > len(self._get_compile_ranges(vllm_config.compilation_config)):
             new_compile_ranges_split_points = sorted(new_compile_ranges_split_points)
-            vllm_config.compilation_config.compile_ranges_split_points = new_compile_ranges_split_points
+            self._set_compile_ranges(vllm_config.compilation_config, new_compile_ranges_split_points)


 class FinegrainedTPConfig:
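For orientation, update_compile_ranges_split_points only ever appends thresholds to the list it reads back from the compilation config, re-sorts it, and writes it back when something was actually added; the new _get/_set helpers just hide whether that list is called compile_ranges_split_points (v0.17.0) or compile_ranges_endpoints (main). A toy walk-through with hypothetical numbers (the real values come from ALLREDUCE_NORM_FUSE_THRESHOLD and get_sp_threshold(), not these literals):

# Toy walk-through with made-up thresholds; the real constants come from
# vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass and
# get_sp_threshold(vllm_config).
existing = [512, 4096]                # what the compilation config already holds
ALLREDUCE_NORM_FUSE_THRESHOLD = 1024  # hypothetical fusion threshold
sp_threshold = 2048                   # hypothetical sequence-parallel threshold

new_points = list(existing)
new_points.append(ALLREDUCE_NORM_FUSE_THRESHOLD)
new_points.append(sp_threshold)
new_points = sorted(new_points)

# Mirrors the final length check: write back only if something was added.
if len(new_points) > len(existing):
    existing = new_points

print(existing)  # [512, 1024, 2048, 4096]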
diff --git a/vllm_ascend/kv_offload/npu.py b/vllm_ascend/kv_offload/npu.py
index 211f3dce..828df509 100644
--- a/vllm_ascend/kv_offload/npu.py
+++ b/vllm_ascend/kv_offload/npu.py
@@ -12,6 +12,7 @@ from vllm.v1.kv_offload.spec import OffloadingSpec
 from vllm.v1.kv_offload.worker.worker import OffloadingHandler

 from vllm_ascend.kv_offload.cpu_npu import CpuNpuOffloadingHandler
+from vllm_ascend.utils import vllm_version_is


 class NPUOffloadingSpec(OffloadingSpec):
@@ -31,12 +32,23 @@ class NPUOffloadingSpec(OffloadingSpec):

     def get_manager(self) -> OffloadingManager:
         if not self._manager:
-            kv_events_config = self.vllm_config.kv_events_config
-            enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
-            self._manager = LRUOffloadingManager(
-                CPUBackend(block_size=self.offloaded_block_size, num_blocks=self.num_cpu_blocks),
-                enable_events=enable_events,
-            )
+            if vllm_version_is("0.17.0"):
+                kv_events_config = self.vllm_config.kv_events_config
+                enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
+                self._manager = LRUOffloadingManager(
+                    CPUBackend(block_size=self.offloaded_block_size, num_blocks=self.num_cpu_blocks),
+                    enable_events=enable_events,
+                )
+            else:
+                kv_events_config = self.vllm_config.kv_events_config
+                enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events
+                assert len(self.gpu_block_size) == 1
+                gpu_block_size = self.gpu_block_size[0]
+                offloaded_block_size = gpu_block_size * self.block_size_factor
+                self._manager = LRUOffloadingManager(
+                    CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks),
+                    enable_events=enable_events,
+                )
         return self._manager

     def get_handlers(
@@ -45,13 +57,24 @@ class NPUOffloadingSpec(OffloadingSpec):
         attn_backends: dict[str, type[AttentionBackend]],
     ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
         if not self._handler:
-            self._handler = CpuNpuOffloadingHandler(
-                attn_backends=attn_backends,
-                gpu_block_size=self.gpu_block_size,
-                cpu_block_size=self.offloaded_block_size,
-                num_cpu_blocks=self.num_cpu_blocks,
-                gpu_caches=kv_caches,
-            )
+            if vllm_version_is("0.17.0"):
+                self._handler = CpuNpuOffloadingHandler(
+                    attn_backends=attn_backends,
+                    gpu_block_size=self.gpu_block_size,
+                    cpu_block_size=self.offloaded_block_size,
+                    num_cpu_blocks=self.num_cpu_blocks,
+                    gpu_caches=kv_caches,
+                )
+            else:
+                assert len(self.gpu_block_size) == 1
+                gpu_block_size = self.gpu_block_size[0]
+                self._handler = CpuNpuOffloadingHandler(
+                    attn_backends=attn_backends,
+                    gpu_block_size=gpu_block_size,
+                    cpu_block_size=gpu_block_size * self.block_size_factor,
+                    num_cpu_blocks=self.num_cpu_blocks,
+                    gpu_caches=kv_caches,
+                )
         assert self._handler is not None

         yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler
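The else branches above recompute the CPU-side block size instead of reusing self.offloaded_block_size, since on newer vLLM the spec apparently exposes gpu_block_size as a per-group list plus a block_size_factor. A small sketch of that arithmetic with hypothetical sizes:

# Hypothetical sizes for illustration; real values come from the vLLM cache
# config and the OffloadingSpec, not from these literals.
gpu_block_sizes = [128]   # one NPU block size per KV-cache group
block_size_factor = 4     # NPU blocks folded into one offloaded CPU block

assert len(gpu_block_sizes) == 1, "the handler currently assumes a single NPU block size"
gpu_block_size = gpu_block_sizes[0]
offloaded_block_size = gpu_block_size * block_size_factor

print(offloaded_block_size)  # 512 tokens per CPU block in this toy example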
diff --git a/vllm_ascend/patch/platform/patch_torch_accelerator.py b/vllm_ascend/patch/platform/patch_torch_accelerator.py
index 431dce4e..43bf4c10 100644
--- a/vllm_ascend/patch/platform/patch_torch_accelerator.py
+++ b/vllm_ascend/patch/platform/patch_torch_accelerator.py
@@ -6,3 +6,11 @@ def patch_empty_cache() -> None:


 torch.accelerator.empty_cache = patch_empty_cache
+
+# Monkey-patch torch.accelerator memory APIs for NPU compatibility.
+# Upstream vLLM (commit 747b068) replaced current_platform.memory_stats()
+# with torch.accelerator.memory_stats(), but torch.accelerator does not
+# properly delegate to NPU. We redirect to torch.npu.* equivalents.
+torch.accelerator.memory_stats = torch.npu.memory_stats  # type: ignore[attr-defined]
+torch.accelerator.memory_reserved = torch.npu.memory_reserved  # type: ignore[attr-defined]
+torch.accelerator.reset_peak_memory_stats = torch.npu.reset_peak_memory_stats  # type: ignore[attr-defined]
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 96f0f78b..1bf7095e 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -292,16 +292,27 @@ class NPUModelRunner(GPUModelRunner):
         if self.use_sparse_c8_indexer:
             self.c8_k_cache_dtype = torch.int8
             self.c8_k_scale_cache_dtype = torch.float16
+        from vllm_ascend.utils import vllm_version_is

-        self.attn_backend = get_attn_backend(
-            0,
-            self.dtype,
-            None,
-            self.block_size,
-            use_mla=self.model_config.use_mla,
-            use_sparse=self.use_sparse,
-            use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
-        )
+        if vllm_version_is("0.17.0"):
+            self.attn_backend = get_attn_backend(
+                0,
+                self.dtype,
+                None,
+                self.block_size,
+                use_mla=self.model_config.use_mla,
+                use_sparse=self.use_sparse,
+                use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
+            )
+        else:
+            self.attn_backend = get_attn_backend(
+                0,
+                self.dtype,
+                None,
+                use_mla=self.model_config.use_mla,
+                use_sparse=self.use_sparse,
+                use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm,
+            )

         try:
             self.dcp_size = get_dcp_group().world_size
@@ -2553,7 +2564,17 @@ class NPUModelRunner(GPUModelRunner):
             with get_tp_context(self.drafter):
                 self.drafter.load_model(self.model)
             if self.use_aux_hidden_state_outputs:
-                self.model.set_aux_hidden_state_layers(self.model.get_eagle3_aux_hidden_state_layers())
+                if vllm_version_is("0.17.0"):
+                    self.model.set_aux_hidden_state_layers(self.model.get_eagle3_aux_hidden_state_layers())
+                else:
+                    from vllm.model_executor.models.interfaces import supports_eagle3
+                    if not supports_eagle3(self.model):
+                        raise RuntimeError(
+                            "Model does not support EAGLE3 interface but "
+                            "aux_hidden_state_outputs was requested"
+                        )
+                    aux_layers = self.model.get_eagle3_default_aux_hidden_state_layers()
+                    self.model.set_aux_hidden_state_layers(aux_layers)

         if self.lora_config:
             self.model = self.load_lora_model(self.model, self.vllm_config, self.device)
diff --git a/vllm_ascend/xlite/xlite.py b/vllm_ascend/xlite/xlite.py
index ac3b1f9c..64133235 100644
--- a/vllm_ascend/xlite/xlite.py
+++ b/vllm_ascend/xlite/xlite.py
@@ -92,9 +92,12 @@ class LlamaXliteModel(XliteModel):
         vision_config = getattr(vllm_config.model_config.hf_config, "vision_config", None)
         rope_parameters = getattr(hf_config, "rope_parameters", {})
-        config.deepstack_num_level = len(getattr(vision_config, "deepstack_visual_indexes", []))
-        config.mrope_section = rope_parameters.get("mrope_section", [])
-        config.mrope_interleaved = rope_parameters.get("mrope_interleaved", False)
+        if hasattr(config, "deepstack_num_level"):
+            config.deepstack_num_level = len(getattr(vision_config, "deepstack_visual_indexes", []))
+        if hasattr(config, "mrope_section"):
+            config.mrope_section = rope_parameters.get("mrope_section", [])
+        if hasattr(config, "mrope_interleaved"):
+            config.mrope_interleaved = rope_parameters.get("mrope_interleaved", False)

         return config

     def _build_model(self, runnable: nn.Module, vllm_config: VllmConfig, config: ModelConfig) -> Model:
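The xlite change replaces unconditional assignments with hasattr guards, so only config objects that actually declare these fields are populated. A generic sketch of the pattern with stand-in classes (not the real xlite ModelConfig):

# Stand-in config classes for illustration; the real guard runs on the xlite
# config assembled in LlamaXliteModel.
from dataclasses import dataclass, field


@dataclass
class TextOnlyConfig:
    hidden_size: int = 4096  # declares no deepstack / mrope fields


@dataclass
class MultimodalConfig:
    hidden_size: int = 4096
    deepstack_num_level: int = 0
    mrope_section: list = field(default_factory=list)


def populate(config, vision_indexes, rope_parameters):
    # Write each field only when the target config defines it, so text-only
    # configs stay untouched instead of silently growing new attributes.
    if hasattr(config, "deepstack_num_level"):
        config.deepstack_num_level = len(vision_indexes)
    if hasattr(config, "mrope_section"):
        config.mrope_section = rope_parameters.get("mrope_section", [])
    return config


populate(TextOnlyConfig(), [], {})  # no-op for text-only models
populate(MultimodalConfig(), [7, 15, 23], {"mrope_section": [16, 24, 24]})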