Files
xc-llm-ascend/vllm_ascend/ops/triton/activation/swiglu_quant.py
SILONG ZENG 78af0c30a3 [Lint]Style: Convert vllm-ascend/ to ruff format(Batch #12) (#6177)
### What this PR does / why we need it?
**Scope of Changes**:
| File Path |
| :--- |
| `vllm_ascend/ops/triton/activation/swiglu_quant.py` |
| `vllm_ascend/ops/triton/batch_invariant/matmul.py` |
| `vllm_ascend/ops/triton/batch_invariant/mean.py` |
| `vllm_ascend/ops/triton/batch_invariant/rmsnorm.py` |
| `vllm_ascend/ops/triton/fla/chunk.py` |
| `vllm_ascend/ops/triton/fla/chunk_delta_h.py` |
| `vllm_ascend/ops/triton/fla/chunk_o.py` |
| `vllm_ascend/ops/triton/fla/chunk_scaled_dot_kkt.py` |
| `vllm_ascend/ops/triton/fla/cumsum.py` |
| `vllm_ascend/ops/triton/fla/fused_qkvzba_split_reshape.py` |
| `vllm_ascend/ops/triton/fla/l2norm.py` |
| `vllm_ascend/ops/triton/fla/layernorm_guard.py` |
| `vllm_ascend/ops/triton/fla/sigmoid_gating.py` |
| `vllm_ascend/ops/triton/fla/solve_tril.py` |
| `vllm_ascend/ops/triton/fla/utils.py` |
| `vllm_ascend/ops/triton/fla/wy_fast.py` |
| `vllm_ascend/ops/triton/fused_gdn_gating.py` |
| `vllm_ascend/ops/triton/layernorm_gated.py` |
| `vllm_ascend/ops/triton/linearnorm/split_qkv_rmsnorm_rope.py` |
| `vllm_ascend/ops/triton/mamba/causal_conv1d.py` |
| `vllm_ascend/ops/triton/reject_sample.py` |
| `vllm_ascend/ops/triton/rope.py` |
| `vllm_ascend/ops/triton/spec_decode/utils.py` |
| `vllm_ascend/ops/triton/triton_utils.py` |

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.14.0
- vLLM main:
d68209402d

Signed-off-by: MrZ20 <2609716663@qq.com>
2026-01-23 14:59:19 +08:00

101 lines
3.7 KiB
Python

import torch
from vllm.triton_utils import tl, triton
from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
@triton.jit
def _swiglu_quant_kernel(
x_ptr,
group_list_ptr,
out_ptr,
scale_ptr,
TOTAL_COLS: tl.constexpr,
HALF_COLS: tl.constexpr,
COL_BLOCK_SIZE: tl.constexpr,
NUM_EXPERTS: tl.constexpr,
NUM_EXPERTS_ALGIN: tl.constexpr,
GROUP_LIST_TYPE: tl.constexpr,
NUM_CORES: tl.constexpr,
DTYPE_MAX: tl.constexpr,
SCALE: tl.constexpr,
):
# calc real total_rows
if GROUP_LIST_TYPE == 0: # cusum
total_rows = tl.load(group_list_ptr + NUM_EXPERTS).to(tl.int32)
else:
gl_offsets = tl.arange(0, NUM_EXPERTS_ALGIN)
gl_mask = gl_offsets < NUM_EXPERTS
group_list = tl.load(group_list_ptr + gl_offsets, gl_mask, other=0).to(tl.int32)
total_rows = tl.sum(group_list)
block_size = (total_rows - 1) // NUM_CORES + 1
pid = tl.program_id(0)
row_begin = pid * block_size
if row_begin >= total_rows:
return
row_end = tl.minimum((pid + 1) * block_size, total_rows)
for row_idx in range(row_begin, row_end):
# swiglu
x_offsets = row_idx * TOTAL_COLS + tl.arange(0, TOTAL_COLS)
cur_x = tl.load(x_ptr + x_offsets)
x1 = tl.extract_slice(cur_x, offsets=(0,), sizes=(HALF_COLS,), strides=(1,))
x2 = tl.extract_slice(cur_x, offsets=(HALF_COLS,), sizes=(HALF_COLS,), strides=(1,))
out = x1 * tl.sigmoid(x1) * x2
# quant
if SCALE:
scale = tl.max(tl.abs(out)).to(tl.float32) / DTYPE_MAX
# store scale
tl.store(scale_ptr + row_idx, scale.to(scale_ptr.dtype.element_ty))
for col_blk_idx in range(0, HALF_COLS, COL_BLOCK_SIZE):
tmp_out = tl.extract_slice(out, offsets=(col_blk_idx,), sizes=(COL_BLOCK_SIZE,), strides=(1,))
tmp_out = (tmp_out.to(tl.float32) / scale).to(x_ptr.dtype.element_ty)
tmp_out = tmp_out.cast(tl.int8, overflow_mode="saturate")
o_offsets = row_idx * HALF_COLS + col_blk_idx + tl.arange(0, COL_BLOCK_SIZE)
mask = (col_blk_idx + tl.arange(0, COL_BLOCK_SIZE)) < HALF_COLS
tl.store(out_ptr + o_offsets, tmp_out.to(out_ptr.dtype.element_ty), mask=mask)
else:
# store out
o_offsets = row_idx * HALF_COLS + tl.arange(0, HALF_COLS)
tl.store(out_ptr + o_offsets, out.to(out_ptr.dtype.element_ty))
def swiglu_quant(x, group_list, group_list_type, need_quant=True):
# group_list_type must be 0 cusum or 1 count
if group_list_type not in [0, 1]:
raise ValueError(f"group_list_type must be 0 or 1, but got {group_list_type}")
s, h = x.shape
out_dtype = torch.int8 if need_quant else x.dtype
out = torch.empty((s, h // 2), dtype=out_dtype, device=x.device)
scale = torch.empty((s,), dtype=torch.float32, device=x.device)
num_experts = group_list.shape[0]
# ub must be 32-byte aligned on npu
if group_list.dtype == torch.int64:
num_experts_algin = (num_experts + 7) // 8 * 8
elif group_list.dtype == torch.int32:
num_experts_algin = (num_experts + 15) // 16 * 16
else:
raise ValueError(f"group_list dtype must be torch.int32 or torch.int64, but got {group_list.dtype}")
num_vectorcore = get_vectorcore_num()
_swiglu_quant_kernel[(num_vectorcore,)](
x,
group_list,
out,
scale,
TOTAL_COLS=h,
HALF_COLS=h // 2,
COL_BLOCK_SIZE=1536,
NUM_EXPERTS=num_experts,
NUM_EXPERTS_ALGIN=num_experts_algin,
GROUP_LIST_TYPE=group_list_type,
NUM_CORES=num_vectorcore,
DTYPE_MAX=127,
SCALE=need_quant,
multibuffer=True,
)
return out, scale