[feature] Ascend quantization support (#7791)

Co-authored-by: ichernob <ichernobnn@gmail.com>
Co-authored-by: liupeng <liupeng374@huawei.com>
This commit is contained in:
ronnie_zheng
2025-07-10 19:17:37 +03:00
committed by GitHub
parent 4a0d19198b
commit 766392c6bd
13 changed files with 889 additions and 34 deletions

View File

@@ -34,16 +34,18 @@ import torch
import torch.distributed as dist
import triton
import triton.language as tl
from sgl_kernel.kvcacheio import transfer_kv_per_layer, transfer_kv_per_layer_mla
from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.utils import get_bool_env_var, is_cuda, next_power_of_2
from sglang.srt.utils import get_bool_env_var, is_cuda, is_npu, next_power_of_2
logger = logging.getLogger(__name__)
GB = 1024 * 1024 * 1024
_is_cuda = is_cuda()
_is_npu = is_npu()
if not _is_npu:
from sgl_kernel.kvcacheio import transfer_kv_per_layer, transfer_kv_per_layer_mla
class ReqToTokenPool: