import fcntl
import os
import shutil
from contextlib import contextmanager, nullcontext
from dataclasses import dataclass
from typing import Optional

import torch
import torch_npu
from torchair.scope import super_kernel as _super_kernel

try:
    # Recent release of torchair has moved these ops to `.scope`.
    from torchair.scope import npu_stream_switch as _npu_stream_switch
    from torchair.scope import npu_wait_tensor as _npu_wait_tensor
except ImportError:
    from torchair.ops import NpuStreamSwitch as _npu_stream_switch
    from torchair.ops import npu_wait_tensor as _npu_wait_tensor

import vllm_ascend.envs as envs_ascend
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz

# Names of the on-disk cache locations used to persist torchair artifacts
# and the measured kv-cache byte counts between runs.
KV_CACHE_BYTES_CACHE_PATH_NAME = ".kv_cache_bytes"
KV_CACHE_BYTES_CACHE_FILE_NAME = "kv_cache_bytes"
TORCHAIR_CACHE_PATH_NAME = ".torchair_cache"
# Cache root: $TORCHAIR_CACHE_HOME when set, otherwise the current working dir.
TORCHAIR_CACHE_DIR = os.path.join(
    os.getenv('TORCHAIR_CACHE_HOME', os.getcwd()), TORCHAIR_CACHE_PATH_NAME)
@dataclass
class TorchairCommonAttentionMetadata:
    """
    Per-batch attention metadata, shared across layers and backends.
    AttentionMetadataBuilder instances use it to construct per-layer metadata.
    For many of the tensors we keep both GPU and CPU versions.
    """
    # Number of requests in the batch.
    num_reqs: int
    # Total number of tokens in the batch.
    num_actual_tokens: int
    # Number of decode tokens scheduled per request.
    decode_token_per_req: int
    # Per-request query sequence lengths used to build attention metadata.
    actual_seq_lengths_q: list[int]
    # Attention mask; None when no masking is needed.
    attn_mask: Optional[torch.Tensor] = None
    # Mask for speculative-decoding attention; None when spec decode is off.
    spec_attn_mask: Optional[torch.Tensor] = None
    # Padding size applied in graph mode; -1 means no padding.
    graph_pad_size: int = -1
@contextmanager
def _file_lock ( file_descriptor , lock_type ) :
fcntl . flock ( file_descriptor , lock_type )
try :
yield
finally :
fcntl . flock ( file_descriptor , fcntl . LOCK_UN )
def _get_torchair_current_work_dir(file_name=None):
    """Return the torchair cache dir, or the path of *file_name* inside it."""
    if file_name is None:
        return TORCHAIR_CACHE_DIR
    return os.path.join(TORCHAIR_CACHE_DIR, file_name)
def check_torchair_cache_exist():
    """Return True when the torchair cache dir exists and is non-empty."""
    cache_dir = _get_torchair_current_work_dir()
    if not os.path.exists(cache_dir):
        return False
    return len(os.listdir(cache_dir)) != 0
def check_kv_cache_bytes_cache_exist():
    """Return True when the kv-cache-bytes cache dir exists and is non-empty."""
    cache_dir = _get_torchair_current_work_dir(KV_CACHE_BYTES_CACHE_PATH_NAME)
    if not os.path.exists(cache_dir):
        return False
    return len(os.listdir(cache_dir)) != 0
def read_kv_cache_bytes_from_file(rank) -> int:
    """Read the cached kv-cache byte count for *rank* under a shared lock."""
    cache_dir = _get_torchair_current_work_dir(KV_CACHE_BYTES_CACHE_PATH_NAME)
    cache_file = os.path.join(cache_dir,
                              f"{rank}_{KV_CACHE_BYTES_CACHE_FILE_NAME}")
    with open(cache_file, "r", encoding="utf-8") as f:
        # Shared lock: multiple ranks may read concurrently.
        with _file_lock(f, fcntl.LOCK_SH):
            return int(f.readline())
def write_kv_cache_bytes_to_file(rank, kv_cache_bytes):
    """Persist *kv_cache_bytes* for *rank* under an exclusive lock."""
    cache_dir = _get_torchair_current_work_dir(KV_CACHE_BYTES_CACHE_PATH_NAME)
    os.makedirs(cache_dir, exist_ok=True)
    cache_file = os.path.join(cache_dir,
                              f"{rank}_{KV_CACHE_BYTES_CACHE_FILE_NAME}")
    with open(cache_file, "w", encoding="utf-8") as f:
        # Exclusive lock: only one writer per rank file at a time.
        with _file_lock(f, fcntl.LOCK_EX):
            f.write(f"{kv_cache_bytes}")
def delete_torchair_cache_file():
    """Remove the entire torchair cache directory tree, if present."""
    cache_dir = _get_torchair_current_work_dir()
    try:
        shutil.rmtree(cache_dir)
    except FileNotFoundError:
        # Nothing cached yet -- deliberately ignore.
        pass
def npu_stream_switch(tag: str, priority: int, *, enabled: bool = True):
    """Enter a torchair NPU stream-switch scope; no-op context when disabled."""
    if not enabled:
        return nullcontext()
    return _npu_stream_switch(tag, priority)
def npu_wait_tensor(self: torch.Tensor,
                    dependency: torch.Tensor,
                    *,
                    enabled: bool = True):
    """Make *self* wait on *dependency* in graph mode; identity when disabled."""
    if not enabled:
        return self
    return _npu_wait_tensor(self, dependency)
def converting_weight_acl_format(model, format):
    """Cast every FusedMoE weight in *model* to the given ACL *format* in place.

    Currently, there are some operations which do not support
    ACL_FORMAT_FRACTAL_NZ in eager mode but support it in torchair graph mode.
    Since ACL_FORMAT_FRACTAL_NZ is much more preferred than
    ACL_FORMAT_FRACTAL_ND on 300I Duo, we add this conversion when using
    torchair graph mode on 300I Duo platform.
    TODO: we will remove this conversion if npu_quant_grouped_matmul_dequant
    accepts weight format of ACL_FORMAT_FRACTAL_NZ in eager mode.
    """
    from vllm.model_executor.layers.fused_moe.layer import FusedMoE

    for module in model.modules():
        if not isinstance(module, FusedMoE):
            continue
        # Weights already carry the requested format: nothing left to do.
        if torch_npu.get_npu_format(module.w13_weight.data) == format:
            return
        # NZ casting is gated on dtype support -- skip entirely when disabled.
        if format == ACL_FORMAT_FRACTAL_NZ and not is_enable_nz(
                module.w13_weight.data.dtype):
            return
        module.w13_weight.data = torch_npu.npu_format_cast(
            module.w13_weight.data, format)
        module.w2_weight.data = torch_npu.npu_format_cast(
            module.w2_weight.data, format)
def register_torchair_model():
    """Register torchair graph-mode model implementations with vLLM.

    Each entry maps an upstream architecture name to the torchair-specific
    implementation shipped by vllm_ascend.
    """
    from vllm import ModelRegistry

    ModelRegistry.register_model(
        "DeepSeekMTPModel",
        "vllm_ascend.torchair.models.torchair_deepseek_mtp:TorchairDeepSeekMTP")

    ModelRegistry.register_model(
        "DeepseekV2ForCausalLM",
        "vllm_ascend.torchair.models.torchair_deepseek_v2:TorchairDeepseekV2ForCausalLM"
    )

    ModelRegistry.register_model(
        "DeepseekV3ForCausalLM",
        "vllm_ascend.torchair.models.torchair_deepseek_v3:TorchairDeepseekV3ForCausalLM"
    )

    # DeepSeek V3.2 reuses the V3 torchair implementation.
    ModelRegistry.register_model(
        "DeepseekV32ForCausalLM",
        "vllm_ascend.torchair.models.torchair_deepseek_v3:TorchairDeepseekV3ForCausalLM"
    )

    ModelRegistry.register_model(
        "Qwen2ForCausalLM",
        "vllm_ascend.torchair.models.qwen2:CustomQwen2ForCausalLM")

    ModelRegistry.register_model(
        "Qwen3MoeForCausalLM",
        "vllm_ascend.torchair.models.qwen3_moe:CustomQwen3MoeForCausalLM")

    ModelRegistry.register_model(
        "PanguProMoEForCausalLM",
        "vllm_ascend.torchair.models.torchair_pangu_moe:PanguProMoEForCausalLM"
    )
def torchair_quant_method_register():
    """Replace Ascend quantization methods with their torchair variants."""
    from vllm_ascend.quantization.utils import ASCEND_QUANTIZATION_METHOD_MAP
    from vllm_ascend.torchair.quantization.torchair_w4a8_dynamic import (
        TorchairAscendW4A8DynamicFusedMoEMethod,
        TorchairAscendW4A8DynamicLinearMethod)
    from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import (
        TorchairAscendW8A8DynamicFusedMoEMethod,
        TorchairAscendW8A8DynamicLinearMethod)

    # Overwrite the eager-mode entries with the torchair graph-mode ones.
    overrides = {
        "W8A8_DYNAMIC": {
            "linear": TorchairAscendW8A8DynamicLinearMethod,
            "moe": TorchairAscendW8A8DynamicFusedMoEMethod,
        },
        "W4A8_DYNAMIC": {
            "linear": TorchairAscendW4A8DynamicLinearMethod,
            "moe": TorchairAscendW4A8DynamicFusedMoEMethod,
        },
    }
    for quant_type, methods in overrides.items():
        for kind, impl in methods.items():
            ASCEND_QUANTIZATION_METHOD_MAP[quant_type][kind] = impl
def torchair_ops_patch():
    """Monkey-patch Ascend ops with their torchair graph-mode implementations."""
    from vllm_ascend.ops.activation import AscendSiluAndMul
    from vllm_ascend.ops.layernorm import AscendQuantRMSNorm, AscendRMSNorm
    from vllm_ascend.ops.rotary_embedding import (
        AscendDeepseekScalingRotaryEmbedding, AscendRotaryEmbedding)
    from vllm_ascend.ops.vocab_parallel_embedding import \
        AscendVocabParallelEmbedding
    from vllm_ascend.torchair.ops import (torchair_activation,
                                          torchair_layernorm)
    from vllm_ascend.torchair.ops.torchair_rotary_embedding import (
        deepseek_rope_init_func, native_rope_deepseek_forward,
        qwen_rope_init_func, rope_forward)
    from vllm_ascend.torchair.ops.torchair_vocab_parallel_embedding import \
        vocab_embedding_forward

    # Rotary embeddings.
    AscendRotaryEmbedding.__init__ = qwen_rope_init_func  # type: ignore[method-assign]
    AscendRotaryEmbedding.forward_oot = rope_forward  # type: ignore[method-assign]
    AscendDeepseekScalingRotaryEmbedding.__init__ = deepseek_rope_init_func  # type: ignore[method-assign]
    AscendDeepseekScalingRotaryEmbedding.forward = native_rope_deepseek_forward  # type: ignore[method-assign]

    # RMSNorm -- plain and quantized share the torchair implementation.
    AscendRMSNorm.__init__ = torchair_layernorm.torchair_rmsnorm_init_  # type: ignore[method-assign]
    AscendRMSNorm.forward_oot = torchair_layernorm.torchair_rmsnorm_forward_oot  # type: ignore[method-assign]
    AscendQuantRMSNorm.__init__ = torchair_layernorm.torchair_rmsnorm_init_  # type: ignore[method-assign]
    AscendQuantRMSNorm.forward_oot = torchair_layernorm.torchair_rmsnorm_forward_oot  # type: ignore[method-assign]

    # Activation and vocab-parallel embedding.
    AscendSiluAndMul.forward_oot = torchair_activation.torchair_silu_and_mul_forward_oot  # type: ignore[method-assign]
    AscendVocabParallelEmbedding.forward = vocab_embedding_forward  # type: ignore[method-assign]
def super_kernel(prefix: str, option: str, enabled: bool = True):
    """Enter a torchair super-kernel scope; no-op context when disabled."""
    if not enabled:
        return nullcontext()
    return _super_kernel(prefix, option)
# TODO(ttanzhiqiang): rm_router_logits
# dp>1 will trigger
# In theory, this solution is only applicable to AllGather and AllGatherEP,
# because in the dp scenario, the previous operation was gate + two
# communications, and now it is changed to one communication + gate operation,
# which can save some communication time. In theory, all moe AllGather and
# AllGatherEP solutions can follow this logic, but now other moe models
# (qwen3-235b) dp solutions are not adjusted, so use the switch to control it
# to prevent code errors.
def get_rm_router_logits_state(ep_size: int, dp_size: int,
                               is_deepseek_v3_r1: bool):
    """Return True when router logits should be removed before communication."""
    # The fusion operator torch_npu.npu_grouped_matmul_finalize_routing called
    # by allgather-ep only supports deepseek v3/r1.
    if dp_size <= 1:
        return False
    if (envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1
            and is_deepseek_v3_r1):
        return True
    if ep_size == 1 and is_deepseek_v3_r1:
        return True
    return False
# TODO(ttanzhiqiang): all_reduce merge
# When all_reduce_merge is in progress, shared_experts does not do all_reduce
# in mlp, but waits until shared_experts+router_experts are completed before
# doing all_reduce.
# Currently, all_reduce_merge is enabled by default in the AllGather,
# AllGatherEP and NaiveMulticast scenarios of the deepseek model.
def get_all_reduce_merge_state(ep_size: int, is_deepseek_v3_r1: bool):
    """Return True when shared/router expert all-reduce should be merged."""
    # The fusion operator torch_npu.npu_grouped_matmul_finalize_routing called
    # by allgather-ep only supports deepseek v3/r1.
    if (envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1
            and is_deepseek_v3_r1):
        return True
    if ep_size == 1 and is_deepseek_v3_r1:
        return True
    return False