diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index f91e7e8..d3622dc 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -43,7 +43,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_version: [main, v0.8.5.post1]
+        vllm_version: [main, v0.9.0]
     concurrency:
       group: >
         ${{
diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml
index 42f2abc..d46318d 100644
--- a/.github/workflows/vllm_ascend_test_long_term.yaml
+++ b/.github/workflows/vllm_ascend_test_long_term.yaml
@@ -41,7 +41,7 @@ jobs:
     strategy:
       max-parallel: 2
       matrix:
-        vllm_version: [main, v0.8.5.post1]
+        vllm_version: [main, v0.9.0]
     name: vLLM Ascend long term test
     runs-on: linux-arm64-npu-1
     container:
diff --git a/.github/workflows/vllm_ascend_test_pd.yaml b/.github/workflows/vllm_ascend_test_pd.yaml
index 003b400..41457c2 100644
--- a/.github/workflows/vllm_ascend_test_pd.yaml
+++ b/.github/workflows/vllm_ascend_test_pd.yaml
@@ -40,7 +40,7 @@ jobs:
     if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
    strategy:
       matrix:
-        vllm_verison: [main, v0.8.5.post1]
+        vllm_verison: [main, v0.9.0]
     name: vLLM Ascend prefilling decoding disaggregation test
     runs-on: linux-arm64-npu-static-8
diff --git a/Dockerfile b/Dockerfile
index 3ca5431..1dfd10c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.8.5.post1
+ARG VLLM_TAG=v0.9.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler
index 272a399..ffd1174 100644
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.8.5.post1
+ARG VLLM_TAG=v0.9.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
diff --git a/tests/long_term/spec_decode/e2e/conftest.py b/tests/long_term/spec_decode/e2e/conftest.py
index f39844b..67a46e1 100644
--- a/tests/long_term/spec_decode/e2e/conftest.py
+++ b/tests/long_term/spec_decode/e2e/conftest.py
@@ -26,9 +26,9 @@ import torch
 from vllm import SamplingParams
 from vllm.sequence import PromptLogprobs, SampleLogprobs
 
-from ....model_utils import (TokensTextLogprobs,
-                             TokensTextLogprobsPromptLogprobs,
-                             check_logprobs_close, check_outputs_equal)
+from tests.model_utils import (TokensTextLogprobs,
+                               TokensTextLogprobsPromptLogprobs,
+                               check_logprobs_close, check_outputs_equal)
 
 PROMPTS = [
     "Hello, my name is",
diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
index 61e26e1..675318e 100644
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -30,7 +30,6 @@ from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.worker.gpu_input_batch import InputBatch
 
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
-from vllm_ascend.utils import vllm_version_is
 
 
 class AscendAttentionBackend(AttentionBackend):
@@ -142,14 +141,11 @@ class AscendAttentionMetadataBuilder:
 
     def build(self, num_reqs, num_actual_tokens, max_query_len,
               common_prefix_len):
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table = (self.runner.input_batch.block_table.
-                           get_device_tensor()[:num_reqs])
-        else:
-            block_table = self.runner.input_batch.block_table[
-                0].get_device_tensor()
-            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-                block_table[:num_reqs])
+
+        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
+        )
+        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+            block_table[:num_reqs])
 
         query_lens = self.runner.query_lens
         seq_lens = self.runner.seq_lens_cpu[:num_reqs]
diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index eb40f41..d39a149 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -16,7 +16,6 @@ from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 if TYPE_CHECKING:
@@ -239,14 +238,11 @@ class AscendMLAMetadataBuilder:
         # function. We should avoid GPU -> CPU sync as much as possible because
         # it blocks on all previous kernels.
         device = self.runner.device
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table = (self.runner.input_batch.block_table.
-                           get_device_tensor()[:num_reqs])
-        else:
-            block_table = self.runner.input_batch.block_table[
-                0].get_device_tensor()
-            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-                block_table[:num_reqs])
+
+        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
+        )
+        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+            block_table[:num_reqs])
         slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
             device, non_blocking=True)
         input_positions = self.runner.positions_cpu[:num_actual_tokens].to(
diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
index bc3b86b..74a292d 100644
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -26,18 +26,10 @@ from vllm.distributed import (GroupCoordinator,
                               tensor_model_parallel_all_reduce)
 from vllm.distributed.parallel_state import get_dp_group
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
-
-from vllm_ascend.utils import vllm_version_is
-
-if not (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")):
-    from vllm.model_executor.layers.fused_moe.layer import (
-        FusedMoEParallelConfig, MoEConfig)
-else:
-    MoEConfig = None
-
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig, QuantizeMethodBase)
+    FusedMoE, FusedMoEParallelConfig, MoEConfig, UnquantizedFusedMoEMethod,
+    determine_expert_map)
+from vllm.model_executor.layers.quantization.base_config import \
+    QuantizationConfig
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group
@@ -587,10 +579,8 @@ def select_experts(
 class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
 
     def __init__(self, moe: MoEConfig = None):
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            super().__init__()
-        else:
-            super().__init__(moe=moe)
+
+        super().__init__(moe=moe)
         vllm_config = get_current_vllm_config()
 
         ep_group = get_ep_group()
@@ -731,24 +721,17 @@ class AscendFusedMoE(FusedMoE):
             params_dtype = torch.get_default_dtype()
 
         vllm_config = get_current_vllm_config()
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            self.ep_size = get_ep_group().world_size
-            self.tp_size = get_etp_group().world_size
-            self.dp_size = (dp_size if dp_size is not None else
-                            get_dp_group().world_size)
-            self.dp_rank = (0 if self.dp_size == 1 else
-                            get_dp_group().rank_in_group)
-        else:
-            self.moe_parallel_config: FusedMoEParallelConfig = (
-                FusedMoEParallelConfig.make(
-                    tp_size_=(tp_size if tp_size is not None else
-                              get_tensor_model_parallel_world_size()),
-                    dp_size_=(dp_size if dp_size is not None else
-                              get_dp_group().world_size),
-                    vllm_parallel_config=vllm_config.parallel_config))
-            self.moe_parallel_config.ep_size = get_ep_group().world_size
-            self.moe_parallel_config.tp_size = get_etp_group().world_size
+        self.moe_parallel_config: FusedMoEParallelConfig = (
+            FusedMoEParallelConfig.make(
+                tp_size_=(tp_size if tp_size is not None else
+                          get_tensor_model_parallel_world_size()),
+                dp_size_=(dp_size if dp_size is not None else
+                          get_dp_group().world_size),
+                vllm_parallel_config=vllm_config.parallel_config))
+
+        self.moe_parallel_config.ep_size = get_ep_group().world_size
+        self.moe_parallel_config.tp_size = get_etp_group().world_size
 
         self.top_k = top_k
         self.num_experts = num_experts
@@ -773,54 +756,39 @@ class AscendFusedMoE(FusedMoE):
             self.local_num_experts, self.expert_map = determine_expert_map(
                 self.ep_size,
                 get_ep_group().rank_in_group, self.global_num_experts)
-            if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-                self.tp_rank = get_etp_group().rank_in_group
-                self.ep_rank = get_ep_group().rank_in_group
-            else:
-                self.moe_parallel_config.tp_rank = get_etp_group(
-                ).rank_in_group
-                self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group
+
+            self.moe_parallel_config.tp_rank = get_etp_group().rank_in_group
+            self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group
         else:
             # Adjust TP size for DP attention
             # haven't test its functionality yet, may remove in the future
-            if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-                self.tp_rank = self.tp_size * self.dp_rank
-                self.ep_rank = 0
-                self.tp_size = self.tp_size * self.dp_size
-                self.ep_size = 1
-            else:
-                self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
-                self.moe_parallel_config.ep_rank = 0
-                self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
-                self.moe_parallel_config.ep_size = 1
+
+            self.moe_parallel_config.tp_rank = self.tp_size * self.dp_rank
+            self.moe_parallel_config.ep_rank = 0
+            self.moe_parallel_config.tp_size = self.tp_size * self.dp_size
+            self.moe_parallel_config.ep_size = 1
             self.local_num_experts, self.expert_map = (self.global_num_experts,
                                                        None)
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            if quant_config is None:
-                self.quant_method: Optional[QuantizeMethodBase] = (
-                    AscendUnquantizedFusedMoEMethod())
-            else:
-                self.quant_method = quant_config.get_quant_method(self, prefix)
-        else:
-            moe = MoEConfig(
-                num_experts=self.global_num_experts,
-                experts_per_token=top_k,
-                hidden_dim=hidden_size,
-                num_local_experts=self.local_num_experts,
-                moe_parallel_config=self.moe_parallel_config,
-                # TODO (bnell): this needs to be fixed for quantized types.
-                in_dtype=params_dtype,
-            )
-            if quant_config is None:
-                self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
-            else:
-                self.quant_method = quant_config.get_quant_method(self, prefix)
+        moe = MoEConfig(
+            num_experts=self.global_num_experts,
+            experts_per_token=top_k,
+            hidden_dim=hidden_size,
+            num_local_experts=self.local_num_experts,
+            moe_parallel_config=self.moe_parallel_config,
+            # TODO (bnell): this needs to be fixed for quantized types.
+            in_dtype=params_dtype,
+        )
+
+        if quant_config is None:
+            self.quant_method = AscendUnquantizedFusedMoEMethod(moe)
+        else:
+            self.quant_method = quant_config.get_quant_method(self, prefix)
 
         assert self.quant_method is not None
diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py
index 4604c88..5660d62 100644
--- a/vllm_ascend/patch/__init__.py
+++ b/vllm_ascend/patch/__init__.py
@@ -24,16 +24,9 @@
 #   each worker's `__init__` function.
 #
 # Then in each kind of patch, there are three folders:
-#    - patch_0_8_5: contains the patches applied when vllm version is 0.8.5.
+#    - patch_0_9_0: contains the patches applied when vllm version is 0.9.0.
 #    - patch_main: contains the patches applied when vllm version is main branch.
-#    - patch_common: contains the patches applied in both 0.8.5 and main branch.
-#
-# In the future, with the vllm version upgrade, the new patch folder such as
-# patch_0_8_5, patch_0_8_6, etc. will be added to manage the patch for different
-# vllm version. And the patch_common will contain the patches applied in all the
-# vllm version.
-# Once the vllm version is too old that vllm-ascend will not support, the related
-# patch folder will be removed as well.
+#    - patch_common: contains the patches applied in both 0.9.0 and main branch.
 #
 # Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
 # ----------------------------------------------------------------------------------
diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py
index 6f13e40..e724fe5 100644
--- a/vllm_ascend/patch/platform/__init__.py
+++ b/vllm_ascend/patch/platform/__init__.py
@@ -17,8 +17,8 @@
 from vllm_ascend.utils import vllm_version_is
 
 # Import specific patches for different versions
-if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-    from vllm_ascend.patch.platform import patch_0_8_5  # noqa: F401
+if vllm_version_is("0.9.0"):
+    from vllm_ascend.patch.platform import patch_0_9_0  # noqa: F401
     from vllm_ascend.patch.platform import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.platform import patch_common  # noqa: F401
diff --git a/vllm_ascend/patch/platform/patch_0_8_5/__init__.py b/vllm_ascend/patch/platform/patch_0_9_0/__init__.py
similarity index 100%
rename from vllm_ascend/patch/platform/patch_0_8_5/__init__.py
rename to vllm_ascend/patch/platform/patch_0_9_0/__init__.py
diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index 931e355..d1d3d42 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -18,8 +18,8 @@
 from vllm_ascend.utils import vllm_version_is
 
 # Import specific patches for different versions
-if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-    from vllm_ascend.patch.worker import patch_0_8_5  # noqa: F401
+if vllm_version_is("0.9.0"):
+    from vllm_ascend.patch.worker import patch_0_9_0  # noqa: F401
     from vllm_ascend.patch.worker import patch_common  # noqa: F401
 else:
     from vllm_ascend.patch.worker import patch_common  # noqa: F401
diff --git a/vllm_ascend/patch/worker/patch_0_8_5/__init__.py b/vllm_ascend/patch/worker/patch_0_9_0/__init__.py
similarity index 100%
rename from vllm_ascend/patch/worker/patch_0_8_5/__init__.py
rename to vllm_ascend/patch/worker/patch_0_9_0/__init__.py
diff --git a/vllm_ascend/worker/model_runner.py b/vllm_ascend/worker/model_runner.py
index 49c221e..11a4642 100644
--- a/vllm_ascend/worker/model_runner.py
+++ b/vllm_ascend/worker/model_runner.py
@@ -64,8 +64,6 @@ from vllm.worker.model_runner_base import (
     _init_attn_metadata_from_tensor_dict,
     _init_sampling_metadata_from_tensor_dict)
 
-from vllm_ascend.utils import vllm_version_is
-
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
 
@@ -1017,10 +1015,8 @@ class NPUModelRunnerBase(ModelRunnerBase[TModelInputForNPU]):
         pattern: Optional[str] = None,
         max_size: Optional[int] = None,
     ) -> None:
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            from vllm.model_executor.model_loader.loader import ShardedStateLoader  # type: ignore[import]  # isort: skip  # noqa
-        else:
-            from vllm.model_executor.model_loader import ShardedStateLoader
+
+        from vllm.model_executor.model_loader import ShardedStateLoader
         ShardedStateLoader.save_model(
             self.model,
             path,
@@ -1032,12 +1028,9 @@ class NPUModelRunnerBase(ModelRunnerBase[TModelInputForNPU]):
         self,
         tensorizer_config: TensorizerConfig,
     ) -> None:
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            from vllm.model_executor.model_loader.loader import \
-                TensorizerLoader  # type: ignore # noqa
-        else:
-            from vllm.model_executor.model_loader import \
-                TensorizerLoader  # type: ignore # noqa
+
+        from vllm.model_executor.model_loader import \
+            TensorizerLoader  # type: ignore # noqa
         TensorizerLoader.save_model(
             self.model,
             tensorizer_config=tensorizer_config,
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 91f8195..24bd2b4 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -61,7 +61,6 @@ from vllm_ascend.attention.attention import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
-from vllm_ascend.utils import vllm_version_is
 
 if TYPE_CHECKING:
     import xgrammar as xgr  # type: ignore[import-untyped]
@@ -210,16 +209,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         # Request states.
         self.requests: Dict[str, CachedRequestState] = {}
         # Persistent batch.
-        # Remove this after we drop 0.8.5 support
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            self.input_batch = InputBatch(
-                max_num_reqs=self.max_num_reqs,
-                max_model_len=self.model_config.max_model_len,
-                max_num_blocks_per_req=self.max_num_blocks_per_req,
-                device=self.device,
-                pin_memory=True,
-                vocab_size=self.model_config.get_vocab_size(),
-            )
 
         self.input_ids = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int32,
@@ -573,10 +562,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         block_table_indices = (req_indices * self.max_num_blocks_per_req +
                                positions_np // self.block_size)
 
-        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
-            block_table_cpu = self.input_batch.block_table.get_cpu_tensor()
-        else:
-            block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor()
+
+        block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor()
         block_numbers = block_table_cpu.flatten()[block_table_indices].numpy()
         block_offsets = positions_np % self.block_size
         np.add(block_numbers * self.block_size,
@@ -1182,16 +1169,16 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         """
         import torch_npu
         kv_caches: Dict[str, torch.Tensor] = {}
-        if not (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")):
-            self.input_batch = InputBatch(
-                max_num_reqs=self.max_num_reqs,
-                max_model_len=self.model_config.max_model_len,
-                max_num_batched_tokens=self.max_num_tokens,
-                device=self.device,
-                pin_memory=True,
-                vocab_size=self.model_config.get_vocab_size(),
-                block_size=self.cache_config.block_size,
-            )
+
+        self.input_batch = InputBatch(
+            max_num_reqs=self.max_num_reqs,
+            max_model_len=self.model_config.max_model_len,
+            max_num_batched_tokens=self.max_num_tokens,
+            device=self.device,
+            pin_memory=True,
+            vocab_size=self.model_config.get_vocab_size(),
+            block_size=self.cache_config.block_size,
+        )
 
         for kv_cache_group in kv_cache_config.kv_cache_groups:
             kv_cache_spec = kv_cache_group.kv_cache_spec