[feature] Ascend quantization support (#7791)

Co-authored-by: ichernob <ichernobnn@gmail.com>
Co-authored-by: liupeng <liupeng374@huawei.com>
This commit is contained in:
ronnie_zheng
2025-07-10 19:17:37 +03:00
committed by GitHub
parent 4a0d19198b
commit 766392c6bd
13 changed files with 889 additions and 34 deletions

View File

@@ -34,16 +34,18 @@ import torch
import torch.distributed as dist
import triton
import triton.language as tl
from sgl_kernel.kvcacheio import transfer_kv_per_layer, transfer_kv_per_layer_mla
from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.utils import get_bool_env_var, is_cuda, next_power_of_2
from sglang.srt.utils import get_bool_env_var, is_cuda, is_npu, next_power_of_2
logger = logging.getLogger(__name__)
GB = 1024 * 1024 * 1024
_is_cuda = is_cuda()
_is_npu = is_npu()
if not _is_npu:
from sgl_kernel.kvcacheio import transfer_kv_per_layer, transfer_kv_per_layer_mla
class ReqToTokenPool: