diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index 1638e7f1..c43fe7c0 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -94,7 +94,7 @@ jobs: pytest -sv tests/e2e/singlecard/test_camem.py pytest -sv tests/e2e/singlecard/test_chunked.py pytest -sv tests/e2e/singlecard/test_embedding.py - pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py + # pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py pytest -sv tests/e2e/singlecard/test_guided_decoding.py pytest -sv tests/e2e/singlecard/test_ilama_lora.py pytest -sv tests/e2e/singlecard/test_profile_execute_duration.py diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml index 2d8a729d..f790bb89 100644 --- a/.github/workflows/format_pr_body.yaml +++ b/.github/workflows/format_pr_body.yaml @@ -36,7 +36,7 @@ jobs: - name: Get vLLM version run: | - VLLM_COMMIT=c9461e05a4ed3557cfbf4b15ded1e26761cc39ca + VLLM_COMMIT=releases/v0.11.1 echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV - name: Checkout repository diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index cc504b8e..da761750 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -42,7 +42,7 @@ jobs: lint: uses: ./.github/workflows/pre-commit.yml with: - vllm: c9461e05a4ed3557cfbf4b15ded1e26761cc39ca + vllm: releases/v0.11.1 changes: runs-on: ubuntu-latest @@ -83,7 +83,7 @@ jobs: VLLM_USE_MODELSCOPE: True strategy: matrix: - vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0] + vllm_version: [releases/v0.11.1, v0.11.0] steps: - name: Install packages run: | @@ -140,7 +140,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0] + vllm_version: [releases/v0.11.1, v0.11.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/vllm_ascend_test_full.yaml b/.github/workflows/vllm_ascend_test_full.yaml index a821263f..d9c6f6c6 100644 --- a/.github/workflows/vllm_ascend_test_full.yaml +++ b/.github/workflows/vllm_ascend_test_full.yaml @@ -69,7 +69,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0] + vllm_version: [releases/v0.11.1, v0.11.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/vllm_ascend/kv_offload/cpu_npu.py b/vllm_ascend/kv_offload/cpu_npu.py index 8924ebcf..c19ec1b0 100644 --- a/vllm_ascend/kv_offload/cpu_npu.py +++ b/vllm_ascend/kv_offload/cpu_npu.py @@ -2,11 +2,17 @@ import numpy as np import torch from vllm.attention import AttentionBackend from vllm.logger import init_logger -from vllm.utils import is_pin_memory_available from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.worker.worker import (OffloadingHandler, TransferResult, TransferSpec) +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is("0.11.0"): + from vllm.utils import is_pin_memory_available +else: + from vllm.utils.platform_utils import is_pin_memory_available + logger = init_logger(__name__) diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py index b19ed874..9ccfa43f 100644 --- a/vllm_ascend/models/qwen2_5_vl.py +++ b/vllm_ascend/models/qwen2_5_vl.py @@ -42,7 +42,11 @@ from vllm.model_executor.models.qwen2_5_vl import ( from vllm.model_executor.models.utils import maybe_prefix from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz, + vllm_version_is) + +if not vllm_version_is("0.11.0"): + from vllm.model_executor.models.vision import conv3d_to_linear_weight MIN_PAD_SIZE = 64 # min_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight @@ -355,6 +359,9 @@ class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer): params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() for name, loaded_weight in weights: + if not vllm_version_is("0.11.0"): + if name.endswith("patch_embed.proj.weight"): + loaded_weight = conv3d_to_linear_weight(loaded_weight) for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm_ascend/models/qwen2_vl.py b/vllm_ascend/models/qwen2_vl.py index ccd46161..7b1ce44a 100644 --- a/vllm_ascend/models/qwen2_vl.py +++ b/vllm_ascend/models/qwen2_vl.py @@ -40,7 +40,11 @@ from vllm.model_executor.models.qwen2_vl import ( from vllm.model_executor.models.utils import maybe_prefix from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz, + vllm_version_is) + +if not vllm_version_is("0.11.0"): + from vllm.model_executor.models.vision import conv3d_to_linear_weight MIN_PAD_SIZE = 64 # min_size to pad weight MAX_PAD_SIZE = 128 # max_size to pad weight @@ -304,6 +308,10 @@ class AscendQwen2VisionTransformer(Qwen2VisionTransformer): loaded_params: Set[str] = set() for name, loaded_weight in weights: + if not vllm_version_is("0.11.0"): + if name.endswith("patch_embed.proj.weight"): + loaded_weight = conv3d_to_linear_weight(loaded_weight) + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index fa4e802c..d7550bf1 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -33,7 +33,8 @@ from vllm_ascend.torchair.utils import (check_torchair_cache_exist, delete_torchair_cache_file) from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p, prefill_context_parallel_enable, - update_aclgraph_sizes, vllm_version_is) + update_aclgraph_sizes, + update_cudagraph_capture_sizes, vllm_version_is) if TYPE_CHECKING: from vllm.config import ModelConfig, VllmConfig @@ -142,24 +143,47 @@ class NPUPlatform(Platform): "Non-MLA LLMs forcibly disable the chunked prefill feature," "as the performance of operators supporting this feature " "functionality is currently suboptimal.") - if not model_config.is_multimodal_model and \ - structured_outputs_config.backend == "auto" and \ - not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \ - scheduler_config.policy == "fcfs": - ascend_scheduler_config.enabled = True - chunked_prefill_enabled_in_ascend_scheduler = getattr( - ascend_scheduler_config, "enable_chunked_prefill", False) - if chunked_prefill_enabled_in_ascend_scheduler: - logger.warning( - "Chunked prefill feature is enabled in ascend_scheduler," - "but note that the operator supporting this feature " - "would lead to performance degradation.") - # In this situation, max_num_batched_tokens would have been rewritten. - # So we must make sure max_num_batched_tokens is not smaller than max_model_len. - if (scheduler_config.max_num_batched_tokens - < scheduler_config.max_model_len - and not chunked_prefill_enabled_in_ascend_scheduler): - scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len + if vllm_version_is("0.11.0"): + if not model_config.is_multimodal_model and \ + structured_outputs_config.backend == "auto" and \ + not scheduler_config.send_delta_data and \ + not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \ + scheduler_config.policy == "fcfs": + ascend_scheduler_config.enabled = True + chunked_prefill_enabled_in_ascend_scheduler = getattr( + ascend_scheduler_config, "enable_chunked_prefill", + False) + if chunked_prefill_enabled_in_ascend_scheduler: + logger.warning( + "Chunked prefill feature is enabled in ascend_scheduler," + "but note that the operator supporting this feature " + "would lead to performance degradation.") + # In this situation, max_num_batched_tokens would have been rewritten. + # So we must make sure max_num_batched_tokens is not smaller than max_model_len. + if (scheduler_config.max_num_batched_tokens + < scheduler_config.max_model_len and + not chunked_prefill_enabled_in_ascend_scheduler): + scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len + else: + if not model_config.is_multimodal_model and \ + structured_outputs_config.backend == "auto" and \ + not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \ + scheduler_config.policy == "fcfs": + ascend_scheduler_config.enabled = True + chunked_prefill_enabled_in_ascend_scheduler = getattr( + ascend_scheduler_config, "enable_chunked_prefill", + False) + if chunked_prefill_enabled_in_ascend_scheduler: + logger.warning( + "Chunked prefill feature is enabled in ascend_scheduler," + "but note that the operator supporting this feature " + "would lead to performance degradation.") + # In this situation, max_num_batched_tokens would have been rewritten. + # So we must make sure max_num_batched_tokens is not smaller than max_model_len. + if (scheduler_config.max_num_batched_tokens + < scheduler_config.max_model_len and + not chunked_prefill_enabled_in_ascend_scheduler): + scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len kv_cache_dtype = vllm_config.additional_config.get( "kv_cache_dtype", None) @@ -237,8 +261,12 @@ class NPUPlatform(Platform): f"{vllm_config.parallel_config.tensor_parallel_size}") if len(sp_aclgraph_sizes) != len(original_sizes): compilation_config.cudagraph_capture_sizes = sp_aclgraph_sizes - vllm_config.compilation_config.init_with_cudagraph_sizes( - sp_aclgraph_sizes) + if vllm_version_is("0.11.0"): + compilation_config.init_with_cudagraph_sizes( + sp_aclgraph_sizes) + else: + update_cudagraph_capture_sizes(vllm_config, + sp_aclgraph_sizes) # TODO: Full graph is fully supported later, and the default value will be set to full graph. if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE: diff --git a/vllm_ascend/sample/rejection_sampler.py b/vllm_ascend/sample/rejection_sampler.py index e0d770df..991e07fb 100644 --- a/vllm_ascend/sample/rejection_sampler.py +++ b/vllm_ascend/sample/rejection_sampler.py @@ -5,10 +5,17 @@ import torch import torch.nn as nn import vllm.v1.sample.rejection_sampler as rs from vllm.v1.sample.metadata import SamplingMetadata -from vllm.v1.sample.rejection_sampler import (RejectionSampler, compute_probs, +from vllm.v1.sample.rejection_sampler import (RejectionSampler, generate_uniform_probs) from vllm.v1.spec_decode.metadata import SpecDecodeMetadata +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is("0.11.0"): + from vllm.v1.sample.rejection_sampler import compute_probs +else: + from vllm.v1.sample.rejection_sampler import apply_sampling_constraints + PLACEHOLDER_TOKEN_ID = -1 GREEDY_TEMPERATURE = -1 # Maximum number of speculative draft tokens allowed per request in a single @@ -82,11 +89,19 @@ class AscendRejectionSampler(RejectionSampler, nn.Module): # [num_tokens, vocab_size] # NOTE(woosuk): `target_logits` can be updated in place inside the # `compute_probs` function. - target_probs = compute_probs( - target_logits, - metadata.cu_num_draft_tokens, - sampling_metadata, - ) + if vllm_version_is("0.11.0"): + target_probs = compute_probs( + target_logits, + metadata.cu_num_draft_tokens, + sampling_metadata, + ) + else: + target_logits = apply_sampling_constraints( + target_logits, + metadata.cu_num_draft_tokens, + sampling_metadata, + ) + target_probs = target_logits.softmax(dim=-1, dtype=torch.float32) output_token_ids = rejection_sample( metadata.draft_token_ids, diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 6a8bb8d6..74e29178 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -12,7 +12,6 @@ from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM -from vllm.utils import is_pin_memory_available from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.spec_decode.metadata import SpecDecodeMetadata @@ -27,8 +26,10 @@ from vllm_ascend.utils import vllm_version_is if vllm_version_is("0.11.0"): from vllm.config import CompilationLevel + from vllm.utils import is_pin_memory_available else: from vllm.config import CompilationMode + from vllm.utils.platform_utils import is_pin_memory_available PADDING_SLOT_ID = -1 diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index a3b908a0..3af82cf9 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -311,6 +311,41 @@ def get_max_hidden_layers(hf_config) -> int: return max(layer_counts) +# Update cudagraph capture sizes for vllm config +def update_cudagraph_capture_sizes(vllm_config: VllmConfig, + cudagraph_capture_sizes: List[int]): + + valid_max_size = (cudagraph_capture_sizes[-1] + if cudagraph_capture_sizes else 0) + if (vllm_config.compilation_config.max_cudagraph_capture_size is not None + and vllm_config.compilation_config.max_cudagraph_capture_size + != valid_max_size): + if vllm_config.compilation_config.cudagraph_capture_sizes is not None: + raise ValueError( + "customized max_cudagraph_capture_size" + f"(={vllm_config.compilation_config.max_cudagraph_capture_size}) " + "should be consistent with the max value of " + f"cudagraph_capture_sizes(={valid_max_size})") + logger.warning( + "Truncating max_cudagraph_capture_size to %d", + valid_max_size, + ) + + vllm_config.compilation_config.max_cudagraph_capture_size = valid_max_size + + if vllm_config.compilation_config.cudagraph_capture_sizes is not None and len( + cudagraph_capture_sizes) < len( + vllm_config.compilation_config.cudagraph_capture_sizes): + logger.warning( + ("cudagraph_capture_sizes specified in compilation_config" + " %s is overridden by config %s"), + vllm_config.compilation_config.cudagraph_capture_sizes, + cudagraph_capture_sizes, + ) + vllm_config.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes + vllm_config.compilation_config.post_init_cudagraph_sizes() + + def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: """Update ACL graph capture sizes based on hardware limitations""" # NOTE: Currently, we can only capture 1800 graphs at most, @@ -402,7 +437,10 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: indices[0], indices[-1] = 0, len(original_sizes) - 1 sampled_sizes = [original_sizes[i] for i in indices] - compilation_config.init_with_cudagraph_sizes(sampled_sizes) + if vllm_version_is("0.11.0"): + compilation_config.init_with_cudagraph_sizes(sampled_sizes) + else: + update_cudagraph_capture_sizes(vllm_config, sampled_sizes) logger.info( "Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes", @@ -433,7 +471,10 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs: enlarged_sizes = [(num_speculative_tokens + 1) * size for size in original_sizes] - compilation_config.init_with_cudagraph_sizes(enlarged_sizes) + if vllm_version_is("0.11.0"): + compilation_config.init_with_cudagraph_sizes(enlarged_sizes) + else: + update_cudagraph_capture_sizes(vllm_config, enlarged_sizes) logger.info( "Adjusted ACL graphs: %s → %s for speculative decoding", original_sizes, enlarged_sizes) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index bf0465c8..01bd128b 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -72,7 +72,7 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.tasks import GenerationTask, PoolingTask, SupportedTask -from vllm.utils import cdiv, is_pin_memory_available +from vllm.utils import cdiv from vllm.utils.jsontree import json_map_leaves from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder from vllm.v1.attention.backends.utils import ( @@ -159,13 +159,14 @@ else: if vllm_version_is("0.11.0"): from vllm.attention.layer import Attention from vllm.config import CompilationLevel - from vllm.utils import LazyLoader + from vllm.utils import LazyLoader, is_pin_memory_available from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention else: from vllm.attention.layer import MLAAttention from vllm.config import CompilationMode from vllm.utils.import_utils import LazyLoader + from vllm.utils.platform_utils import is_pin_memory_available if TYPE_CHECKING: import xgrammar as xgr # type: ignore[import-untyped] @@ -386,7 +387,11 @@ class NPUModelRunner(LoRAModelRunnerMixin): self.drafter = get_spec_decode_method( self.speculative_config.method, self.vllm_config, self.device, self) - self.rejection_sampler = AscendRejectionSampler() + if vllm_version_is("0.11.0"): + self.rejection_sampler = AscendRejectionSampler() + else: + self.rejection_sampler = AscendRejectionSampler( + self.sampler) self.actual_seq_lengths_q = list( range(self.decode_token_per_req, self.max_num_tokens + 1, self.decode_token_per_req)) @@ -1885,6 +1890,9 @@ class NPUModelRunner(LoRAModelRunnerMixin): # TODO: Optimize the CPU -> NPU copy. cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to( self.device, non_blocking=True) + if not vllm_version_is("0.11.0"): + cu_num_sampled_tokens = torch.from_numpy(cu_num_sampled_tokens).to( + self.device, non_blocking=True) logits_indices = torch.from_numpy(logits_indices).to(self.device, non_blocking=True) target_logits_indices = torch.from_numpy(target_logits_indices).to( @@ -1896,15 +1904,25 @@ class NPUModelRunner(LoRAModelRunnerMixin): # draft_token_indices: [ 1, 2, 3, 105, 106, 208] draft_token_ids = self.input_ids[logits_indices] draft_token_ids = draft_token_ids[target_logits_indices + 1] - - metadata = SpecDecodeMetadata( - draft_token_ids=draft_token_ids, - num_draft_tokens=num_draft_tokens.tolist(), - cu_num_draft_tokens=cu_num_draft_tokens, - target_logits_indices=target_logits_indices, - bonus_logits_indices=bonus_logits_indices, - logits_indices=logits_indices, - ) + if vllm_version_is("0.11.0"): + metadata = SpecDecodeMetadata( + draft_token_ids=draft_token_ids, + num_draft_tokens=num_draft_tokens.tolist(), + cu_num_draft_tokens=cu_num_draft_tokens, + target_logits_indices=target_logits_indices, + bonus_logits_indices=bonus_logits_indices, + logits_indices=logits_indices, + ) + else: + metadata = SpecDecodeMetadata( + draft_token_ids=draft_token_ids, + num_draft_tokens=num_draft_tokens.tolist(), + cu_num_draft_tokens=cu_num_draft_tokens, + cu_num_sampled_tokens=cu_num_sampled_tokens, + target_logits_indices=target_logits_indices, + bonus_logits_indices=bonus_logits_indices, + logits_indices=logits_indices, + ) return metadata def apply_grammar_bitmask(