#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
# isort: skip_file

import math
import types
from typing import Any, Optional

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch_npu
from vllm.config import CUDAGraphMode, VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.distributed.parallel_state import get_dp_group
from vllm.forward_context import get_forward_context
from vllm.logger import logger

import vllm_ascend.envs as envs_ascend
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.platform import NPUPlatform
from vllm_ascend.spec_decode import get_spec_decode_method
from vllm_ascend.torchair.utils import (
    TORCHAIR_CACHE_DIR, TorchairCommonAttentionMetadata,
    check_torchair_cache_exist, converting_weight_acl_format,
    register_torchair_model, torchair_ops_patch,
    torchair_quant_method_register, write_kv_cache_bytes_to_file)
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                               AscendDeviceType, get_ascend_device_type)
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner


class NPUTorchairModelRunner(NPUModelRunner):

    def __init__(self, vllm_config: VllmConfig, device: torch.device):
        self.ascend_config = get_ascend_config()
        self.enable_shared_expert_dp = self.ascend_config.enable_shared_expert_dp
        super().__init__(vllm_config, device)
        if self.speculative_config:
            self.actual_seq_lengths_q = list(
                range(self.decode_token_per_req, self.max_num_tokens + 1,
                      self.decode_token_per_req))
        self.attn_metadata_builder = self.attn_backend.get_builder_cls()(
            None, None, vllm_config, device)
        self.use_sparse = hasattr(self.model_config.hf_config, "index_topk")

        register_torchair_model()
        torchair_ops_patch()
        torchair_quant_method_register()
        if self.enable_shared_expert_dp:
            return
        self.new_kv_cache_bytes = -1
        self.torchair_compiled_model = None  # type: ignore
        self.torchair_compiled_models = {}  # type: ignore
        self.use_cached_npu_graph = self.ascend_config.torchair_graph_config.use_cached_graph
        self.use_cached_kv_cache_bytes = self.ascend_config.torchair_graph_config.use_cached_kv_cache_bytes
        self.torchair_graph_batch_sizes = self.ascend_config.torchair_graph_config.graph_batch_sizes
        if self.ascend_config.torchair_graph_config.graph_batch_sizes_init:
            self.init_torchair_graph_batch_sizes()

        self.update_torchair_graph_batch_sizes()

        torch._dynamo.cache_size.config.cache_size_limit += len(
            self.torchair_graph_batch_sizes)
        torch._dynamo.config.capture_dynamic_output_shape_ops = True
        torch._logging.set_logs(
            recompiles=envs_ascend.VLLM_ASCEND_TRACE_RECOMPILES)

        self._check_batch_sizes_consistency()

    def _set_up_drafter(self):
        super()._set_up_drafter()
        if self.speculative_config:
            # Torchair does not support a padded drafter batch,
            # so padding is forcibly disabled here.
            self.speculative_config.disable_padded_drafter_batch = True

    def _get_drafter(self):
        return get_spec_decode_method(self.speculative_config.method,
                                      self.vllm_config,
                                      self.device,
                                      self,
                                      is_torchair_graph=True)

    def _may_pad_kv_consumer_num_seq(self):
        # In the PD-disaggregation scenario we need redundant batch sizes so
        # that no batch's seq_len exceeds 16 tokens.
        # self.max_num_reqs here is greater than the actual maximum number of requests.
        if self.decode_token_per_req > 1 and self.is_kv_consumer:
            # Applied only when speculative decoding is active.
            FIA_SEQ_LEN_LIMIT = 16
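            # Worked example (illustrative only, assuming max_num_reqs=16 and
            # decode_token_per_req=2):
            #   new_max_num_reqs = 16 + ceil(16 / 16) + ceil((16 * 2) / 16**2)
            #                    = 16 + 1 + 1 = 18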
            new_max_num_reqs = self.max_num_reqs + math.ceil(
                self.max_num_reqs / FIA_SEQ_LEN_LIMIT) + math.ceil(
                    (self.max_num_reqs * self.decode_token_per_req) /
                    (FIA_SEQ_LEN_LIMIT**2))
            if self.max_num_reqs < new_max_num_reqs:
                logger.warning(
                    f"max_num_reqs is updated to {new_max_num_reqs}")
                self.max_num_reqs = new_max_num_reqs

    def _init_mc2_tokens_capacity(self):
        # NOTE: To be clear, we need to make sure that during graph capture, the number of
        # tokens is less than or equal to mc2_tokens_capacity. According to _set_cudagraph_sizes,
        # the max number of tokens in graph is min(max_num_seqs * uniform_decode_query_len, 512).
        max_num_tokens = self.max_num_reqs * self.uniform_decode_query_len
        tp_size = self.parallel_config.tensor_parallel_size
        # Use integer arithmetic for ceiling division.
        max_graph_batch_size = self.calculate_new_torchair_graph_batch_size(
            max_num_tokens, tp_size)
        self.mc2_tokens_capacity = max_graph_batch_size
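        # Example (illustrative only, assuming no MTP): with max_num_reqs=20,
        # uniform_decode_query_len=1 and tp_size=8, max_num_tokens=20 and the
        # capacity is rounded up to 24, the next multiple of tp_size; the
        # checks below then log an error if the capacity exceeds the
        # per-device limit (512 on A3, 256 on A2).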
        if get_ascend_device_type(
        ) == AscendDeviceType._910_93 and self.mc2_tokens_capacity > 512:
            logger.error(
                f"A3: the max number of tokens must be smaller than 512, but it is now {self.mc2_tokens_capacity}"
            )
        if get_ascend_device_type(
        ) == AscendDeviceType._910B and self.mc2_tokens_capacity > 256:
            logger.error(
                f"A2: the max number of tokens must be smaller than 256, but it is now {self.mc2_tokens_capacity}"
            )

    def _sync_metadata_across_dp(
            self, num_tokens: int,
            with_prefill: bool) -> tuple[int, Optional[torch.Tensor], bool]:
        """Override from NPUModelRunner to pad num_tokens"""
        if self.enable_shared_expert_dp:
            # Padding is not required for shared_expert_dp cases in eager mode.
            return num_tokens, None, with_prefill
        if self.dp_size == 1:
            if not with_prefill:
                maybe_padded_num_tokens = self.select_torchair_padded_batch_size(
                    num_tokens)
                return maybe_padded_num_tokens, None, with_prefill
            return num_tokens, None, with_prefill

        # The extra last element carries the with_prefill flag so that a single
        # all-reduce synchronizes both the token counts and the prefill status.
        num_tokens_across_dp = torch.zeros(self.dp_size + 1,
                                           dtype=torch.int32,
                                           device="npu")
        num_tokens_across_dp[self.dp_rank] = num_tokens
        num_tokens_across_dp[-1] = int(with_prefill)
        dist.all_reduce(num_tokens_across_dp,
                        group=get_dp_group().device_group)
        with_prefill = bool(num_tokens_across_dp[-1])
        num_tokens_across_dp = num_tokens_across_dp[:-1]

        if not with_prefill:
            max_num_token = num_tokens_across_dp.max().item()
            maybe_padded_num_tokens = self.select_torchair_padded_batch_size(
                max_num_token)
            num_tokens_across_dp = torch.full((self.dp_size, ),
                                              maybe_padded_num_tokens,
                                              dtype=torch.int32,
                                              device="npu")
        else:
            maybe_padded_num_tokens = num_tokens

        return maybe_padded_num_tokens, num_tokens_across_dp, with_prefill

    def _build_dummy_attn_metadata(
        self,
        with_prefill: bool,
        num_reqs: int,
        num_tokens: int,
        max_query_len: int,
        num_scheduled_tokens: np.ndarray,
        aclgraph_runtime_mode: Optional[CUDAGraphMode] = None,
        force_attention: bool = False,
    ) -> Optional[dict[str, Any]]:
        # NOTE: In torchair graph mode without prefill we cannot skip building
        # attention metadata; skipping it would trigger a graph recompile.
        if with_prefill or self.enable_shared_expert_dp:
            attn_metadata = super()._build_dummy_attn_metadata(
                with_prefill, num_reqs, num_tokens, max_query_len,
                num_scheduled_tokens, aclgraph_runtime_mode, force_attention)
        else:
            common_attn_metadata = TorchairCommonAttentionMetadata(
                num_reqs=num_reqs,
                num_actual_tokens=1,
                actual_seq_lengths_q=self.actual_seq_lengths_q,
                attn_mask=self.attn_mask,
                spec_attn_mask=self.spec_attn_mask,
                decode_token_per_req=self.decode_token_per_req,
            )
            attn_metadata = self.attn_metadata_builder.build_torchair_graph_dummy(
                common_attn_metadata)
        return attn_metadata

    def _generate_dummy_run_hidden_states(self, with_prefill,
                                          is_torchair_compile, input_ids,
                                          positions, attn_metadata, num_tokens,
                                          intermediate_tensors, inputs_embeds):
        if with_prefill or self.enable_shared_expert_dp:
            if get_ascend_device_type() == AscendDeviceType._310P:
                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)
            hidden_states = super()._generate_dummy_run_hidden_states(
                with_prefill, is_torchair_compile, input_ids, positions,
                attn_metadata, num_tokens, intermediate_tensors, inputs_embeds)
        else:
            # Only mark static while compiling
            if is_torchair_compile:
                torch._dynamo.mark_static(input_ids)
                torch._dynamo.mark_static(positions)
                torch._dynamo.mark_static(attn_metadata.decode.block_table)
                torch._dynamo.mark_static(attn_metadata.decode.input_positions)
                torch._dynamo.mark_static(get_forward_context().mc2_mask)
                if hasattr(attn_metadata.decode, "sin"):
                    torch._dynamo.mark_static(attn_metadata.decode.sin)
                    torch._dynamo.mark_static(attn_metadata.decode.cos)
                torch._dynamo.mark_static(attn_metadata.slot_mapping)
                if self.speculative_config:
                    torch._dynamo.mark_static(attn_metadata.decode.attn_mask)
                for kv in self.kv_caches:
                    assert isinstance(kv, tuple), "kv_cache must be a tuple"
                    torch._dynamo.mark_static(kv[0])
                    torch._dynamo.mark_static(kv[1])
            if get_ascend_device_type() == AscendDeviceType._310P:
                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ)

            compiled_model = self._get_torchair_lazy_compiled_model(num_tokens)
            model_kwargs = {}
            model_kwargs["kv_caches"] = self.kv_caches
            model_kwargs["attn_metadata"] = attn_metadata
            hidden_states = compiled_model(
                input_ids=input_ids,
                positions=positions,
                intermediate_tensors=intermediate_tensors,
                inputs_embeds=None,
                **model_kwargs,
            )
        return hidden_states

    def _convert_torch_format(self, kv_cache):
        if self.enable_shared_expert_dp:
            return super()._convert_torch_format(kv_cache)
        kv_cache = torch_npu.npu_format_cast(kv_cache, ACL_FORMAT_FRACTAL_ND)
        return kv_cache

    def _compile_torchair_graph(self, torchair_graph_batch_sizes) -> None:
        # Trigger torchair graph capture for specific shapes.
        # Capture the large shapes first so that the smaller shapes
        # can reuse the memory pool allocated for the large shapes.
        for idx, num_tokens in enumerate(reversed(torchair_graph_batch_sizes)):
            for _ in range(self.vllm_config.compilation_config.
                           cudagraph_num_of_warmups):
                self._dummy_run(num_tokens, is_torchair_compile=True)
            self._dummy_run(num_tokens, is_torchair_compile=True)
            logger.info("Batchsize %d is compiled successfully: %d/%d.",
                        num_tokens, idx + 1, len(torchair_graph_batch_sizes))

    def _capture_model(self):
        """Override from NPUModelRunner to use torchair graph capture."""
        if self.enable_shared_expert_dp:
            return super()._capture_model()
        # TODO(NeverRaR): Calling graph_capture(device=self.device) in
        # torchair graph capture can cause some issues, so now we just
        # temporarily split the codepath for the two different graph patterns.
        torchair_graph_batch_sizes = self.torchair_graph_batch_sizes
        graph_num = len(torchair_graph_batch_sizes)

        if self.use_cached_npu_graph and not check_torchair_cache_exist():
            # If caching is enabled but the cache does not exist (either
            # use_cached_kv_cache_bytes is disabled or kv_cache_bytes are
            # different), we compile the model twice. The first pass
            # generates the cache, and the second pass loads the cache to
            # skip the overhead caused by the Dynamo guard mechanism.
            logger.info(
                "Cache compilation for torchair graph is enabled. Now we compile the graph to generate"
                " the torchair cache, this usually takes %.1f~%.1f mins.",
                0.5 * graph_num, 1.5 * graph_num)
            self._compile_torchair_graph(torchair_graph_batch_sizes)
            NPUPlatform.synchronize()
            # Note: We reset dynamo and reload the compiled torchair cached computation graph below
            # that was compiled above. This operation reduces graph launch time by 2-4ms and avoids
            # runtime errors caused by configuration mismatches in graph mode.
            torch._dynamo.reset()
            self.torchair_compiled_models.clear()
        if self.use_cached_npu_graph:
            logger.info(
                "Loading torchair graph cache, this usually takes %.1f~%.1f mins.",
                0.3 * graph_num, 0.5 * graph_num)
            self._compile_torchair_graph(torchair_graph_batch_sizes)
        else:
            logger.info(
                "Capturing torchair graph, this usually takes %.1f~%.1f mins.",
                0.5 * graph_num, 1.5 * graph_num)
            self._compile_torchair_graph(torchair_graph_batch_sizes)

        if self.use_cached_kv_cache_bytes and self.new_kv_cache_bytes > 0:
            write_kv_cache_bytes_to_file(torch.distributed.get_rank(),
                                         self.new_kv_cache_bytes)

    def _use_aclgraph(self) -> bool:
        if self.enable_shared_expert_dp:
            return super()._use_aclgraph()
        return False

    def _check_batch_sizes_consistency(self) -> None:
        if not dist.is_initialized():
            return

        local = torch.tensor(self.torchair_graph_batch_sizes,
                             device="cpu",
                             dtype=torch.int32)
        gathered_graph_batch_size = local.clone()
        # Sum the graph batch sizes across DP ranks; if every rank uses the
        # same list, the element-wise sum equals local * dp_size.
        dist.all_reduce(gathered_graph_batch_size,
                        group=get_dp_group().cpu_group)
        expected = local * self.dp_size

        if not torch.equal(gathered_graph_batch_size, expected):
            diff_idxs = (gathered_graph_batch_size != expected).nonzero(
                as_tuple=False).flatten().tolist()
            raise AssertionError(
                f"[Graph BatchSize Mismatch] Found mismatches at indices {diff_idxs}.\n"
                f"Local (rank {self.dp_rank}): {local.tolist()}\n"
                f"Sum over ranks: {gathered_graph_batch_size.tolist()}\n"
                f"Expected if all equal: {[v * self.dp_size for v in local.tolist()]}"
            )

    def _update_graph_pad_size(self, with_prefill, graph_pad_size):
        if with_prefill or self.enable_shared_expert_dp:
            super()._update_graph_pad_size(with_prefill, graph_pad_size)
        else:
            self.graph_pad_size = graph_pad_size

    def _update_input_ids_and_positions(self, input_ids, positions,
                                        num_input_tokens, with_prefill,
                                        padded_num_tokens_across_dp):
        """Override from NPUModelRunner to update input_ids and positions"""
        input_ids, positions = super()._update_input_ids_and_positions(
            input_ids, positions, num_input_tokens, with_prefill,
            padded_num_tokens_across_dp)

        if with_prefill or self.enable_shared_expert_dp:
            return input_ids, positions
        else:
            input_ids = self.input_ids[:padded_num_tokens_across_dp]
            positions = self.positions[:padded_num_tokens_across_dp]
            return input_ids, positions

    def _generate_process_reqs_hidden_states(self, attn_metadata, with_prefill,
                                             padded_num_tokens_across_dp,
                                             input_ids, positions,
                                             intermediate_tensors,
                                             inputs_embeds):
        if attn_metadata is not None and isinstance(attn_metadata, dict):
            attn_metadata = attn_metadata['model.layers.0.self_attn.attn']

        if self.enable_shared_expert_dp:
            return super()._generate_process_reqs_hidden_states(
                attn_metadata, with_prefill, padded_num_tokens_across_dp,
                input_ids, positions, intermediate_tensors, inputs_embeds)
        model_kwargs = {
            "kv_caches": self.kv_caches,
            "attn_metadata": attn_metadata
        }
        if not with_prefill:
            if get_ascend_device_type() == AscendDeviceType._310P:
                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ)
            compiled_model = self._get_torchair_lazy_compiled_model(
                padded_num_tokens_across_dp)
            hidden_states = compiled_model(
                input_ids=input_ids,
                positions=positions,
                intermediate_tensors=intermediate_tensors,
                inputs_embeds=inputs_embeds,
                **model_kwargs,
            )
        else:
            assert self.model is not None
            if get_ascend_device_type() == AscendDeviceType._310P:
                converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND)

            hidden_states = self.model(
                input_ids=input_ids,
                positions=positions,
                intermediate_tensors=intermediate_tensors,
                inputs_embeds=inputs_embeds,
                **model_kwargs,
            )
        return hidden_states

    def _get_torchair_lazy_compiled_model(self, batch_size: int):
        if batch_size < 0 or batch_size > self.torchair_graph_batch_sizes[-1]:
            raise ValueError(
                f"Bad graph batch size:{batch_size}! max_graph_batch_sizes:{self.torchair_graph_batch_sizes[-1]}"
            )

        compiled_model = self.torchair_compiled_models.get(
            batch_size
        ) if self.use_cached_npu_graph else self.torchair_compiled_model

        if compiled_model:
            return compiled_model

        import torchair  # type: ignore
        from torchair import patch_for_hcom  # type: ignore

        patch_for_hcom()

        if get_ascend_device_type() == AscendDeviceType._310P:
            # On the 300I Duo platform we need to patch broadcast. However, this patch is
            # overwritten by patch_for_hcom in torchair, so we have to re-apply it here.
            from vllm_ascend.patch.platform.patch_distributed import \
                communication_adaptation_310p
            communication_adaptation_310p()

        config = torchair.CompilerConfig()
        if self.ascend_config.torchair_graph_config.mode:
            config.mode = self.ascend_config.torchair_graph_config.mode
        config.experimental_config.frozen_parameter = \
            self.ascend_config.torchair_graph_config.enable_frozen_parameter
        # Enabling tiling_schedule_optimize on 300I Duo has some bugs, so it is
        # disabled on that platform for now.
        config.experimental_config.tiling_schedule_optimize = get_ascend_device_type(
        ) != AscendDeviceType._310P
        config.experimental_config.enable_view_optimize = \
            self.ascend_config.torchair_graph_config.enable_view_optimize
        torch.npu.set_compile_mode(jit_compile=False)
        if not self.use_cached_npu_graph:
            npu_backend = torchair.get_npu_backend(compiler_config=config)
            self.torchair_compiled_model = torch.compile(
                self.model,
                dynamic=not self.use_sparse,
                fullgraph=True,
                backend=npu_backend)
            return self.torchair_compiled_model
        else:
            # Generate a new forward proxy code object to prevent the invalidation of
            # the compilation cache caused by dynamo retracing.
            forward_proxy_name = f"{self.model.__class__.__name__}_forward_with_batch_size_{batch_size}"
            forward_fn = self.model.forward
            code = forward_fn.__code__
            # Mark the code object with a new proxy name.
            modified_code = code.replace(co_name=forward_proxy_name, )

            modified_func = types.FunctionType(modified_code,
                                               forward_fn.__globals__,
                                               name=forward_proxy_name,
                                               argdefs=forward_fn.__defaults__)

            self.model.__dict__[forward_proxy_name] = modified_func.__get__(
                self.model, nn.Module)
            self.torchair_compiled_models[
                batch_size] = torchair.inference.cache_compile(
                    self.model.__dict__[forward_proxy_name],
                    dynamic=not self.use_sparse,
                    fullgraph=True,
                    cache_dir=TORCHAIR_CACHE_DIR,
                    config=config,
                    ge_cache=False)
            return self.torchair_compiled_models[batch_size]

    def init_torchair_graph_batch_sizes(self):
        start_graph_batch_size = 4
        tp_size = get_tensor_model_parallel_world_size()

        # NOTE: When all2all or mc2 is used, we need to slice the `num_tokens`
        # dimension into `tp_size` blocks, so the smallest graph batch size
        # must be at least `tp_size`.
        start_graph_batch_size = max(start_graph_batch_size, tp_size)
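        # Example (illustrative only): with tp_size=4 and max_num_reqs=32 the
        # loop below yields graph batch sizes [4, 8, 16, 32].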
        while (start_graph_batch_size <= self.max_num_reqs):
            self.torchair_graph_batch_sizes.append(start_graph_batch_size)
            start_graph_batch_size *= 2

    def calculate_new_torchair_graph_batch_size(self, old_graph_batch_size,
                                                tp_size):
        cur_graph_batch_size = (old_graph_batch_size + tp_size -
                                1) // tp_size * tp_size
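        # Worked example (illustrative only): with tp_size=4 and
        # old_graph_batch_size=10 this rounds up to 12, the next multiple of 4;
        # with MTP (num_speculative_tokens > 1) the LCM branch below is used
        # instead, giving lcm(4, 10) = 20.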
        # MTP > 1: use the least common multiple of graph_batch_size and
        # tp_size, which satisfies both the multi-DP and the FIA operator
        # requirements.
        if self.speculative_config is not None and self.speculative_config.num_speculative_tokens > 1:
            cur_graph_batch_size = (tp_size * old_graph_batch_size) \
                // math.gcd(tp_size, old_graph_batch_size)
        return cur_graph_batch_size

    def select_torchair_padded_batch_size(self, batch_size: int):
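        # Example (illustrative only): with torchair_graph_batch_sizes = [4, 8, 16],
        # a batch_size of 5 is padded up to 8, while a batch_size of 20 falls
        # through the loop and raises ValueError.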
        for padded_batch_size in self.torchair_graph_batch_sizes:
            if batch_size <= padded_batch_size:
                # We treat batch_size as the number of requests.
                return padded_batch_size
        raise ValueError(
            f"cur batch_size is invalid, torchair_graph_batch_sizes is "
            f"{self.torchair_graph_batch_sizes}, but cur batch_size is {batch_size}."
        )

    def update_torchair_graph_batch_sizes(self):
        # Derive graph_batch_sizes from the max number of tokens:
        # first pad according to the number of requests.
        if self.is_kv_consumer and self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
            # The PD-disaggregation scenario may miscalculate the batch when MTP
            # is enabled, so we force it to max_num_reqs.
            self.torchair_graph_batch_sizes = [self.max_num_reqs]
            logger.warning(
                f"is kv_consumer, torchair_graph_batch_sizes is set to [max_num_reqs] {[self.max_num_reqs]}"
            )
        elif len(self.torchair_graph_batch_sizes) == 0:
            self.torchair_graph_batch_sizes = [1, self.max_num_reqs]
        else:
            self.torchair_graph_batch_sizes = sorted(
                self.torchair_graph_batch_sizes)
            while self.torchair_graph_batch_sizes[-1] > self.max_num_reqs:
                self.torchair_graph_batch_sizes.pop()
                if len(self.torchair_graph_batch_sizes) == 0:
                    logger.warning(
                        "torchair_graph_batch_sizes is invalid, resetting it to [1, max_num_reqs]"
                    )
                    self.torchair_graph_batch_sizes = [1, self.max_num_reqs]
            if self.torchair_graph_batch_sizes[-1] < self.max_num_reqs:
                self.torchair_graph_batch_sizes.append(self.max_num_reqs)

        # padded max number of tokens = max_num_reqs * decode_token_per_req
        self.torchair_graph_batch_sizes = [
            graph_batch_size * self.decode_token_per_req
            for graph_batch_size in self.torchair_graph_batch_sizes
        ]

        # NOTE: when enable_expert_parallel is set on A3, we need to make sure that
        # `graph_batch_size` is divisible by `tp_size`, because the x_active_mask
        # used by the dispatch/combine ops on A3 requires the input shape to be
        # the same on all EP ranks.
        if get_ascend_device_type(
        ) == AscendDeviceType._910_93 and self.parallel_config.enable_expert_parallel:
            self._align_graph_size_divisible_by_tp_size()

    def _align_graph_size_divisible_by_tp_size(self):
        tp_size = self.parallel_config.tensor_parallel_size
        new_graph_batch_sizes = []
        for graph_batch_size in self.torchair_graph_batch_sizes:
            cur_graph_batch_size = self.calculate_new_torchair_graph_batch_size(
                graph_batch_size, tp_size)
            if cur_graph_batch_size not in new_graph_batch_sizes and \
                cur_graph_batch_size <= self.scheduler_config.max_num_batched_tokens:
                new_graph_batch_sizes.append(cur_graph_batch_size)
            elif cur_graph_batch_size > self.scheduler_config.max_num_batched_tokens \
                    and self.decode_token_per_req > 1:
                logger.warning(
                    f"torchair_graph_batch_sizes {cur_graph_batch_size} is bigger than max_num_batched_tokens "
                    f"{self.scheduler_config.max_num_batched_tokens}, will skip this batch size."
                )
        new_max_num_reqs = math.ceil(
            max(new_graph_batch_sizes) / self.decode_token_per_req)
        if self.max_num_reqs != new_max_num_reqs:
            logger.warning(f"max_num_reqs is updated to {new_max_num_reqs}")
            self.max_num_reqs = new_max_num_reqs
        if not (self.decode_token_per_req > 1 and self.is_kv_consumer):
            # Do not update scheduler_config.max_num_seqs in the KV consumer + MTP
            # case, since FIA needs extra space for padding; there self.max_num_reqs
            # is allowed to exceed scheduler_config.max_num_seqs.
            self.scheduler_config.max_num_seqs = new_max_num_reqs

        if new_graph_batch_sizes != self.torchair_graph_batch_sizes:
            logger.warning(
                f"torchair_graph_batch_sizes are updated to {new_graph_batch_sizes}."
            )
            self.torchair_graph_batch_sizes = new_graph_batch_sizes

    def _build_drafter_prepare_inputs_torchair_param(self):
        if self.enable_shared_expert_dp:
            return super()._build_drafter_prepare_inputs_torchair_param()
        else:
            return True