From a39d92878209cdb891ba2a726f17f5649ee0842c Mon Sep 17 00:00:00 2001
From: Yijie Zhu <762412795@qq.com>
Date: Wed, 18 Jun 2025 02:24:10 +0800
Subject: [PATCH] support qwen2 running on ascend npu device (#7022)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: 刁莹煜
---
 docs/backend/native_api.ipynb                 |  4 ++
 python/sglang/srt/_custom_ops.py              |  4 +-
 python/sglang/srt/layers/activation.py        |  5 ++-
 python/sglang/srt/layers/layernorm.py         |  5 ++-
 .../compressed_tensors_moe.py                 |  5 ++-
 python/sglang/srt/layers/quantization/fp8.py  |  4 +-
 .../sglang/srt/layers/quantization/utils.py   |  5 ++-
 python/sglang/srt/layers/rotary_embedding.py  |  5 ++-
 python/sglang/srt/utils.py                    | 42 +++++++++++++++++--
 9 files changed, 63 insertions(+), 16 deletions(-)

diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb
index 189c678c0..a54ae6996 100644
--- a/docs/backend/native_api.ipynb
+++ b/docs/backend/native_api.ipynb
@@ -51,6 +51,10 @@
     "server_process, port = launch_server_cmd(\n",
     "    \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\"\n",
     ")\n",
+    "## To run the qwen2.5-0.5b-instruct model on an Ascend NPU, you can execute the following command:\n",
+    "# server_process, port = launch_server_cmd(\n",
+    "#     \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --device npu --tp 2 --attention-backend torch_native\"\n",
+    "# )\n",
     "\n",
     "wait_for_server(f\"http://localhost:{port}\")"
    ]
diff --git a/python/sglang/srt/_custom_ops.py b/python/sglang/srt/_custom_ops.py
index 5d5b999a2..1c232d19f 100644
--- a/python/sglang/srt/_custom_ops.py
+++ b/python/sglang/srt/_custom_ops.py
@@ -4,7 +4,7 @@ from typing import List, Tuple
 
 import torch
 
-from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
+from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu, is_npu
 
 logger = logging.getLogger(__name__)
 use_vllm_custom_allreduce = get_bool_env_var(
@@ -25,7 +25,7 @@ if not is_hpu():
         logger.warning("Failed to import from custom_ar with %r", e)
 
 
-if not is_hip():
+if not is_hip() and not is_npu():
     if use_vllm_custom_allreduce:
         custom_op = torch.ops._C_custom_ar
     else:
diff --git a/python/sglang/srt/layers/activation.py b/python/sglang/srt/layers/activation.py
index 2e200be36..b018743bc 100644
--- a/python/sglang/srt/layers/activation.py
+++ b/python/sglang/srt/layers/activation.py
@@ -29,10 +29,11 @@ from sglang.srt.distributed import (
     get_tensor_model_parallel_world_size,
 )
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
-from sglang.srt.utils import is_cuda, set_weight_attrs
+from sglang.srt.utils import is_cuda, is_npu, set_weight_attrs
 from sglang.utils import resolve_obj_by_qualname
 
 _is_cuda = is_cuda()
+_is_npu = is_npu()
 
 if _is_cuda:
     from sgl_kernel import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
@@ -184,7 +185,7 @@ def get_cross_encoder_activation_function(config: PretrainedConfig):
     return nn.Identity()
 
 
-if not _is_cuda:
+if not _is_cuda and not _is_npu:
     logger.info(
         "sgl-kernel is not available on Non-NV platforms. Fallback to other kernel libraries."
     )
diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py
index 0994a511e..3ccff5a72 100644
--- a/python/sglang/srt/layers/layernorm.py
+++ b/python/sglang/srt/layers/layernorm.py
@@ -20,10 +20,11 @@ import torch
 import torch.nn as nn
 
 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import get_bool_env_var, is_cuda, is_hip
+from sglang.srt.utils import get_bool_env_var, is_cuda, is_hip, is_npu
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_is_npu = is_npu()
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
 if _is_cuda:
@@ -187,7 +188,7 @@ class Gemma3RMSNorm(nn.Module):
         return f"{tuple(self.weight.shape)}, eps={self.eps}"
 
 
-if not (_is_cuda or _is_hip):
+if not (_is_cuda or _is_hip or _is_npu):
     logger.info(
         "sgl-kernel layernorm implementation is not available on current platform. Fallback to other kernel libraries."
     )
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 0aaa3a508..ee08c1f55 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -17,11 +17,12 @@ from sglang.srt.layers.quantization.utils import (
     per_tensor_dequantize,
     replace_parameter,
 )
-from sglang.srt.utils import is_cuda, set_weight_attrs
+from sglang.srt.utils import is_cuda, is_npu, set_weight_attrs
 
 _is_cuda = is_cuda()
+_is_npu = is_npu()
 
-if not _is_cuda:
+if not _is_cuda and not _is_npu:
     from vllm import _custom_ops as vllm_ops
     from vllm._custom_ops import scaled_fp8_quant
 
diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py
index 80a5971a0..bbdd64ba8 100644
--- a/python/sglang/srt/layers/quantization/fp8.py
+++ b/python/sglang/srt/layers/quantization/fp8.py
@@ -67,6 +67,7 @@ from sglang.srt.utils import (
     get_bool_env_var,
     is_cuda,
     is_hip,
+    is_npu,
     log_info_on_rank0,
     print_warning_once,
     set_weight_attrs,
@@ -74,6 +75,7 @@ from sglang.srt.utils import (
 
 _is_hip = is_hip()
 _is_cuda = is_cuda()
+_is_npu = is_npu()
 
 _is_fp8_fnuz = is_fp8_fnuz()
 
@@ -86,7 +88,7 @@ if _is_hip:
     from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight
 
-if not _is_cuda:
+if not _is_cuda and not _is_npu:
     from vllm._custom_ops import scaled_fp8_quant
 
 
diff --git a/python/sglang/srt/layers/quantization/utils.py b/python/sglang/srt/layers/quantization/utils.py
index d2bbce494..0b9a15560 100644
--- a/python/sglang/srt/layers/quantization/utils.py
+++ b/python/sglang/srt/layers/quantization/utils.py
@@ -6,11 +6,12 @@ from typing import List, Mapping, Tuple, Union
 import torch
 
 from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
-from sglang.srt.utils import is_cuda
+from sglang.srt.utils import is_cuda, is_npu
 
 _is_cuda = is_cuda()
+_is_npu = is_npu()
 
-if not _is_cuda:
+if not _is_cuda and not _is_npu:
     from vllm._custom_ops import scaled_fp8_quant
 
 
diff --git a/python/sglang/srt/layers/rotary_embedding.py b/python/sglang/srt/layers/rotary_embedding.py
index 8ae191b51..7db99d375 100644
--- a/python/sglang/srt/layers/rotary_embedding.py
+++ b/python/sglang/srt/layers/rotary_embedding.py
@@ -8,10 +8,11 @@ import torch
 import torch.nn as nn
 
 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import is_cuda, is_hip
+from sglang.srt.utils import is_cuda, is_hip, is_npu
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_is_npu = is_npu()
 
 if _is_cuda:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
@@ -84,7 +85,7 @@ class RotaryEmbedding(CustomOp):
         if not _is_cuda:
             cache = cache.to(dtype)
 
-        if not _is_cuda or self.head_size not in [64, 128, 256, 512]:
+        if not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512]:
             from vllm._custom_ops import rotary_embedding
 
             self.vllm_rotary_embedding = rotary_embedding
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index 2184a4a94..69d99ddd2 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -1291,6 +1291,15 @@ def get_hpu_memory_capacity():
     )
 
 
+def get_npu_memory_capacity():
+    try:
+        import torch_npu
+
+        return torch.npu.mem_get_info()[1] // 1024 // 1024  # unit: MB
+    except ImportError as e:
+        raise ImportError("torch_npu is required when running on an NPU device.")
+
+
 def get_device_memory_capacity(device: str = None):
     if is_cuda():
         gpu_mem = get_nvgpu_memory_capacity()
@@ -1298,6 +1307,8 @@ def get_device_memory_capacity(device: str = None):
         gpu_mem = get_amdgpu_memory_capacity()
     elif device == "hpu":
         gpu_mem = get_hpu_memory_capacity()
+    elif device == "npu":
+        gpu_mem = get_npu_memory_capacity()
     else:
         # GPU memory is not known yet or no GPU is available.
         gpu_mem = None
@@ -1423,6 +1434,11 @@ def get_device(device_id: Optional[int] = None) -> str:
             return "xpu"
         return "xpu:{}".format(device_id)
 
+    if hasattr(torch, "npu") and torch.npu.is_available():
+        if device_id == None:
+            return "npu"
+        return "npu:{}".format(device_id)
+
    if is_habana_available():
        try:
            import habana_frameworks.torch.hpu
@@ -1497,15 +1513,35 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
     return major, minor
 
 
+def get_npu_compiler_config():
+    config = {
+        "frozen_parameter": True,
+        "tiling_schedule_optimize": True,
+        "topology_sorting_strategy": "StableRDFS",
+    }
+    return config
+
+
 def get_compiler_backend() -> str:
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         return "hpu_backend"
 
     if hasattr(torch, "npu") and torch.npu.is_available():
-        import torchair
+        try:
+            import torchair
+            import torchair.ge_concrete_graph.ge_converter.experimental.patch_for_hcom_allreduce
+            from torchair.configs.compiler_config import CompilerConfig
+        except ImportError as e:
+            raise ImportError(
+                "NPU detected, but the torchair package is not installed. "
+                "Please install torchair for torch.compile support on NPU."
+            )
+        compiler_config = CompilerConfig()
+        predefined_config = get_npu_compiler_config()
+        for k, v in predefined_config.items():
+            setattr(compiler_config.experimental_config, k, v)
 
-        config = torchair.CompilerConfig()
-        npu_backend = torchair.get_npu_backend(compiler_config=config)
+        npu_backend = torchair.get_npu_backend(compiler_config=compiler_config)
         return npu_backend
 
     return "inductor"
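
Note: the hunks above import an is_npu() helper from sglang.srt.utils without showing its definition, because that helper already exists upstream. As a rough, illustrative sketch only (not necessarily the actual upstream implementation), such a check could mirror the hasattr(torch, "npu") / torch.npu.is_available() pattern used elsewhere in this patch:

import torch


def is_npu() -> bool:
    # torch.npu only exists once the torch_npu plugin has been loaded,
    # so guard the attribute lookup before querying availability.
    return hasattr(torch, "npu") and torch.npu.is_available()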
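
For context, the value returned by get_compiler_backend() is meant to be handed to torch.compile, which accepts either a backend name string ("hpu_backend", "inductor") or a backend callable such as the one returned by torchair.get_npu_backend(). A minimal usage sketch follows; the compile_model wrapper is hypothetical and not part of SGLang:

import torch

from sglang.srt.utils import get_compiler_backend


def compile_model(model: torch.nn.Module):
    # On Ascend NPU this returns the torchair backend object configured above;
    # on HPU it returns "hpu_backend", and "inductor" everywhere else.
    backend = get_compiler_backend()
    return torch.compile(model, backend=backend)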