forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
0
vllm-v0.6.2/tests/kernels/__init__.py
Normal file
0
vllm-v0.6.2/tests/kernels/__init__.py
Normal file
18
vllm-v0.6.2/tests/kernels/allclose_default.py
Normal file
18
vllm-v0.6.2/tests/kernels/allclose_default.py
Normal file
@@ -0,0 +1,18 @@
|
||||
import torch
|
||||
|
||||
# Reference default values of atol and rtol are from
|
||||
# https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67
|
||||
default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5}
|
||||
default_rtol = {
|
||||
torch.float16: 1e-3,
|
||||
torch.bfloat16: 1.6e-2,
|
||||
torch.float: 1.3e-6
|
||||
}
|
||||
|
||||
|
||||
def get_default_atol(output) -> float:
|
||||
return default_atol[output.dtype]
|
||||
|
||||
|
||||
def get_default_rtol(output) -> float:
|
||||
return default_rtol[output.dtype]
|
||||
127
vllm-v0.6.2/tests/kernels/bt_torch_ops/test_copy_blocks.py
Normal file
127
vllm-v0.6.2/tests/kernels/bt_torch_ops/test_copy_blocks.py
Normal file
@@ -0,0 +1,127 @@
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch_mlu
|
||||
from vllm import _mlu_ops as mlu_ops
|
||||
|
||||
from typing import List, Tuple
|
||||
|
||||
DTYPES = [torch.half, torch.float]
|
||||
if "3" not in torch.mlu.get_device_name(0):
|
||||
DTYPES = [torch.half, torch.float]
|
||||
NUM_TOKENS = [83] # Arbitrary values for testing
|
||||
NUM_LAYERS = [1] # Arbitrary values for testing
|
||||
NUM_HEADS = [8] # Arbitrary values for testing
|
||||
HEAD_SIZES = [64, 80, 96, 112, 128, 256, 512]
|
||||
BLOCK_SIZES = [8, 16, 32]
|
||||
NUM_BLOCKS = [1024, 3600] # Arbitrary values for testing
|
||||
NUM_MAPPINGS = [256] # Arbitrary values for testing
|
||||
SEEDS = [0]
|
||||
DEVICES = [i for i in range(1 if torch.mlu.device_count() == 1 else 2)]
|
||||
|
||||
|
||||
def create_kv_caches(
|
||||
num_blocks: int,
|
||||
block_size: int,
|
||||
num_layers: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int
|
||||
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
|
||||
|
||||
torch.random.manual_seed(seed)
|
||||
torch.cuda.manual_seed(seed)
|
||||
|
||||
scale = head_size**-0.5
|
||||
# vllm scale
|
||||
# x = 16 // torch.tensor([], dtype=dtype).element_size()
|
||||
# key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
|
||||
key_cache_shape = (num_blocks, num_heads, block_size, head_size)
|
||||
print("key_cache_shape: ", key_cache_shape)
|
||||
key_caches = []
|
||||
for _ in range(num_layers):
|
||||
key_cache = torch.empty(size=key_cache_shape,
|
||||
dtype=dtype).mlu()
|
||||
if dtype == torch.int32 or dtype == torch.int64:
|
||||
key_cache.random_(-100,100)
|
||||
else:
|
||||
key_cache.uniform_(-scale, scale)
|
||||
key_caches.append(key_cache)
|
||||
value_cache_shape = (num_blocks, num_heads, head_size, block_size)
|
||||
print("value_cache_shape: ", value_cache_shape)
|
||||
value_caches = []
|
||||
for _ in range(num_layers):
|
||||
value_cache = torch.empty(size=value_cache_shape,
|
||||
dtype=dtype).mlu()
|
||||
if dtype == torch.int32 or dtype == torch.int64:
|
||||
value_cache.random_(-100,100)
|
||||
else:
|
||||
value_cache.uniform_(-scale, scale)
|
||||
value_caches.append(value_cache)
|
||||
return key_caches, value_caches
|
||||
|
||||
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
|
||||
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
@torch.inference_mode()
|
||||
def test_copy_blocks(
|
||||
num_mappings: int,
|
||||
num_layers: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
block_size: int,
|
||||
num_blocks: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: int,
|
||||
) -> None:
|
||||
random.seed(seed)
|
||||
torch.random.manual_seed(seed)
|
||||
torch.mlu.manual_seed(seed)
|
||||
# Generate random block mappings where each source block is mapped to two
|
||||
# destination blocks.
|
||||
assert 3 * num_mappings <= num_blocks
|
||||
src_blocks = random.sample(range(num_blocks), num_mappings)
|
||||
remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
|
||||
dst_blocks = random.sample(remainig_blocks, 2 * num_mappings)
|
||||
block_mapping = torch.empty(num_mappings, 2, dtype=torch.int32)
|
||||
for i in range(num_mappings):
|
||||
src = src_blocks[i]
|
||||
dst = dst_blocks[2 * i]
|
||||
block_mapping[i] = torch.tensor([src, dst])
|
||||
block_mapping_mlu = block_mapping.mlu()
|
||||
|
||||
# Create the KV caches.
|
||||
key_caches, value_caches = create_kv_caches(num_blocks, block_size,
|
||||
num_layers, num_heads,
|
||||
head_size, dtype, seed)
|
||||
|
||||
# Clone the KV caches.
|
||||
cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
|
||||
cloned_value_caches = [value_cache.clone() for value_cache in value_caches]
|
||||
|
||||
# Call the copy blocks kernel.
|
||||
mlu_ops.copy_blocks(key_caches, value_caches, block_mapping_mlu)
|
||||
|
||||
# Run the reference implementation.
|
||||
for mapping in block_mapping:
|
||||
src, dst = mapping[0], mapping[1]
|
||||
for cloned_key_cache in cloned_key_caches:
|
||||
cloned_key_cache[dst].copy_(cloned_key_cache[src])
|
||||
for cloned_value_cache in cloned_value_caches:
|
||||
cloned_value_cache[dst].copy_(cloned_value_cache[src])
|
||||
|
||||
# Compare the results.
|
||||
for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
|
||||
assert torch.allclose(key_cache, cloned_key_cache)
|
||||
for value_cache, cloned_value_cache in zip(value_caches,
|
||||
cloned_value_caches):
|
||||
assert torch.allclose(value_cache, cloned_value_cache)
|
||||
99
vllm-v0.6.2/tests/kernels/bt_torch_ops/test_ffn.py
Normal file
99
vllm-v0.6.2/tests/kernels/bt_torch_ops/test_ffn.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch_mlu
|
||||
from vllm import _mlu_ops as mlu_ops
|
||||
|
||||
act_dict = {
|
||||
"relu": F.relu,
|
||||
"gelu": F.gelu,
|
||||
"silu": F.silu,
|
||||
}
|
||||
|
||||
def ref_ffn(
|
||||
hidden_states,
|
||||
up_fc_weight,
|
||||
up_fc_bias,
|
||||
down_proj_weight,
|
||||
down_proj_bias,
|
||||
gate_up_proj_weight,
|
||||
gate_up_proj_bias,
|
||||
layernorm_weight,
|
||||
layernorm_bias,
|
||||
act_mode):
|
||||
up_output = F.linear(hidden_states, up_fc_weight, bias=up_fc_bias)
|
||||
act_output = act_dict[act_mode](up_output)
|
||||
if not gate_up_proj_weight is None:
|
||||
gate_output = F.linear(hidden_states, gate_up_proj_weight, bias=gate_up_proj_bias)
|
||||
out = F.linear(act_output * gate_output, down_proj_weight, bias=down_proj_bias)
|
||||
else:
|
||||
out = F.linear(act_output, down_proj_weight, bias=down_proj_bias)
|
||||
return out
|
||||
|
||||
BATCH_SIZE = [1]
|
||||
SEQ_LENS = [1, 64, 1024]
|
||||
HIDDEN_SIZE = [16, 24]
|
||||
INTER_SIZE = [32]
|
||||
DTYPES = [torch.half, torch.float]
|
||||
if "3" not in torch.mlu.get_device_name(0):
|
||||
DTYPES = [torch.half, torch.float]
|
||||
@pytest.mark.parametrize("batch_size", BATCH_SIZE)
|
||||
@pytest.mark.parametrize("seq_len", SEQ_LENS)
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZE)
|
||||
@pytest.mark.parametrize("inter_size", INTER_SIZE)
|
||||
@pytest.mark.parametrize("act_name", ["relu", "silu"]) # gelu
|
||||
@pytest.mark.parametrize("use_gate", [True])
|
||||
@pytest.mark.parametrize("use_gate_bias", [False])
|
||||
@pytest.mark.parametrize("use_up_bias", [False])
|
||||
@pytest.mark.parametrize("use_down_bias", [False])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", [0])
|
||||
def test_attention_project(
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
hidden_size: int,
|
||||
inter_size: int,
|
||||
act_name: str,
|
||||
use_gate: bool,
|
||||
use_gate_bias: bool,
|
||||
use_up_bias: bool,
|
||||
use_down_bias: bool,
|
||||
dtype: torch.dtype,
|
||||
seed : int
|
||||
) -> None:
|
||||
device_id = "mlu:0"
|
||||
torch.random.manual_seed(seed)
|
||||
torch.mlu.manual_seed(seed)
|
||||
|
||||
hidden_states = torch.randn(batch_size, seq_len, hidden_size, dtype=dtype, device=device_id)
|
||||
up_proj_weight= torch.randn(inter_size, hidden_size, dtype=dtype, device=device_id)
|
||||
if use_gate:
|
||||
gate_proj_weight = torch.randn(inter_size, hidden_size, dtype=dtype, device=device_id)
|
||||
else:
|
||||
gate_proj_weight = None
|
||||
down_proj_weight = torch.randn(hidden_size, inter_size, dtype=dtype, device=device_id)
|
||||
|
||||
out = mlu_ops.ffn(hidden_states,
|
||||
up_proj_weight,
|
||||
None,
|
||||
down_proj_weight,
|
||||
None,
|
||||
gate_proj_weight,
|
||||
None,
|
||||
act_name)
|
||||
|
||||
ref_out = ref_ffn(
|
||||
hidden_states,
|
||||
up_proj_weight,
|
||||
None,
|
||||
down_proj_weight,
|
||||
None,
|
||||
gate_proj_weight,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
act_name
|
||||
)
|
||||
|
||||
assert torch.allclose(out, ref_out, atol=1e-1, rtol=1e-1)
|
||||
|
||||
185
vllm-v0.6.2/tests/kernels/bt_torch_ops/test_rotary_emb.py
Normal file
185
vllm-v0.6.2/tests/kernels/bt_torch_ops/test_rotary_emb.py
Normal file
@@ -0,0 +1,185 @@
|
||||
import numpy
|
||||
from typing import List, Optional
|
||||
from itertools import accumulate
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope, _ROPE_DICT, LinearScalingRotaryEmbedding
|
||||
from vllm_mlu.model_executor.layers.rotary_embedding import MLURotaryEmbedding
|
||||
|
||||
ROPE_THRESHOLD_DIFF1 = 5e-3
|
||||
ROPE_THRESHOLD_DIFF2 = 5e-3
|
||||
|
||||
def compute_diff(baseline: numpy.ndarray, compare: numpy.ndarray):
|
||||
'''add diff1 diff2 accuracy method'''
|
||||
error = numpy.abs(baseline - compare)
|
||||
diff1 = numpy.sum(error) / numpy.sum(numpy.abs(baseline))
|
||||
diff2 = numpy.sqrt(numpy.sum(error**2)/numpy.sum(baseline**2))
|
||||
return diff1, diff2
|
||||
|
||||
IS_NEOX_STYLE = [True, False]
|
||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||
HEAD_SIZES = [64, 80, 96, 112, 128, 256]
|
||||
ROTARY_DIMS = [32] # None means rotary dim == head size
|
||||
NUM_HEADS = [9, 17] # Arbitrary values for testing
|
||||
BATCH_SIZES = [1, 5] # Arbitrary values for testing
|
||||
SEQ_LENS = [11, 8192]
|
||||
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
|
||||
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
|
||||
@pytest.mark.parametrize("seq_len", SEQ_LENS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
|
||||
@torch.inference_mode()
|
||||
def test_rotary_embedding(
|
||||
is_neox_style: bool,
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
rotary_dim: Optional[int],
|
||||
dtype: torch.dtype,
|
||||
max_position: int = 128,
|
||||
base: int = 10000,
|
||||
) -> None:
|
||||
if rotary_dim is None:
|
||||
rotary_dim = head_size
|
||||
if rotary_dim is None:
|
||||
rotary_dim = head_size
|
||||
|
||||
total_seq_len = batch_size * seq_len
|
||||
|
||||
MLURotaryEmbedding.max_seq_len = max_position
|
||||
rope = MLURotaryEmbedding(head_size, rotary_dim, max_position, base, is_neox_style, dtype)
|
||||
rope = rope.to(dtype=dtype, device=0)
|
||||
|
||||
positions = torch.randint(0,
|
||||
max_position, ([batch_size*seq_len]),
|
||||
device=0).to(dtype=torch.int32)
|
||||
|
||||
context_shape = (total_seq_len, num_heads, head_size)
|
||||
context = torch.randn(size=context_shape, dtype=dtype).mlu()
|
||||
qk = context[..., 0 : num_heads, :]
|
||||
ref_qk = qk.clone()
|
||||
|
||||
cu_seq_lens = torch.arange(0, batch_size + 1, dtype=torch.int32).mlu() * seq_len
|
||||
MLURotaryEmbedding.set_cos_sin = False
|
||||
MLURotaryEmbedding.cu_seq_lens = cu_seq_lens
|
||||
MLURotaryEmbedding.is_prompt = False
|
||||
MLURotaryEmbedding.is_chunked = False
|
||||
qk_out = rope.forward(positions, qk)
|
||||
rope_base = MLURotaryEmbedding(head_size, rotary_dim, max_position, base, is_neox_style, dtype)
|
||||
# for simular CPU re_init_cos_sin_cache
|
||||
if "cos_sin_cache" in rope_base._buffers:
|
||||
del rope_base._buffers["cos_sin_cache"]
|
||||
cache = rope_base._compute_cos_sin_cache()
|
||||
cache = cache.to(dtype).mlu()
|
||||
rope_base.cos_sin_cache: torch.Tensor
|
||||
rope_base.register_buffer("cos_sin_cache", cache, persistent=False)
|
||||
|
||||
num_q_heads = num_heads - 1
|
||||
ref_q = ref_qk[:, :num_q_heads, :].reshape(-1,head_size)
|
||||
ref_k = ref_qk[:, num_q_heads:, :].reshape(-1,head_size)
|
||||
ref_q_o, ref_k_o = rope_base.forward_native(positions, ref_q, ref_k)
|
||||
ref_q_o_reshape = ref_q_o.reshape(-1, num_q_heads, head_size)
|
||||
ref_k_o_reshape = ref_k_o.reshape(-1, 1, head_size)
|
||||
ref_qk_out = torch.cat((ref_q_o_reshape, ref_k_o_reshape), dim=1).cpu()
|
||||
qk_out_cpu = qk_out.cpu()
|
||||
MLURotaryEmbedding.unset_mlu_var()
|
||||
diff1, diff2 = compute_diff(baseline=ref_qk_out.float().detach().numpy(),
|
||||
compare=qk_out_cpu.float().detach().numpy())
|
||||
assert diff1 <= ROPE_THRESHOLD_DIFF1 and diff2 <= ROPE_THRESHOLD_DIFF2
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
|
||||
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
|
||||
@pytest.mark.parametrize("seq_len", [1, 11, 1024])
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", [64, 80, 128])
|
||||
@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@torch.inference_mode()
|
||||
def test_batched_rotary_embedding_multi_lora(
|
||||
is_neox_style: bool,
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
rotary_dim: Optional[int],
|
||||
dtype: torch.dtype,
|
||||
device: str = "mlu",
|
||||
seed: int = 0,
|
||||
max_position: int = 4096,
|
||||
base: int = 10000,
|
||||
) -> None:
|
||||
"""test linear scaling rope kernel"""
|
||||
assert device == "mlu"
|
||||
assert torch.mlu.is_available()
|
||||
|
||||
torch.random.manual_seed(seed)
|
||||
torch.mlu.manual_seed(seed)
|
||||
torch.set_default_device(device)
|
||||
|
||||
if rotary_dim is None:
|
||||
rotary_dim = head_size
|
||||
is_prompt = seq_len == 1
|
||||
scaling_factors: List[int] = [1, 2, 4]
|
||||
|
||||
MLURotaryEmbedding.max_seq_len = max_position
|
||||
MLURotaryEmbedding.set_cos_sin = False
|
||||
MLURotaryEmbedding.is_prompt = is_prompt
|
||||
MLURotaryEmbedding.is_chunked = False
|
||||
MLURotaryEmbedding.positions_ = None
|
||||
MLURotaryEmbedding.cu_seq_lens = seq_len * torch.arange(
|
||||
0, batch_size + 1, dtype=torch.int32)
|
||||
|
||||
rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, {
|
||||
"rope_type": "linear",
|
||||
"factor": tuple(scaling_factors)
|
||||
})
|
||||
rope = rope.to(dtype=dtype)
|
||||
|
||||
positions = torch.randint(0, max_position, (batch_size * seq_len, ),
|
||||
dtype=torch.int32)
|
||||
query = torch.randn(batch_size,
|
||||
seq_len,
|
||||
num_heads * head_size,
|
||||
dtype=dtype)
|
||||
key = torch.randn_like(query)
|
||||
|
||||
offset_map = torch.tensor(
|
||||
list(
|
||||
accumulate([0] + [
|
||||
max_position * scaling_factor * 2
|
||||
for scaling_factor in scaling_factors[:-1]
|
||||
])), dtype=torch.int32)
|
||||
query_types = torch.randint(0,
|
||||
len(scaling_factors), (batch_size * seq_len, ))
|
||||
query_offsets = offset_map[query_types]
|
||||
|
||||
qk = torch.cat([query, key], dim=-1)
|
||||
qk = qk.view(batch_size * seq_len, num_heads + num_heads, head_size)
|
||||
|
||||
out_qk = rope.forward(positions, qk, query_offsets)
|
||||
|
||||
scaling_factor = tuple(scaling_factors)
|
||||
rope_base = LinearScalingRotaryEmbedding(head_size, rotary_dim,
|
||||
max_position, base,
|
||||
is_neox_style,
|
||||
scaling_factor,
|
||||
torch.get_default_dtype())
|
||||
|
||||
ref_query, ref_key = rope_base.forward_native(positions, query, key,
|
||||
query_offsets)
|
||||
ref_qk = torch.cat([ref_query, ref_key], dim=-1)
|
||||
ref_qk = ref_qk.view(batch_size * seq_len, num_heads + num_heads, head_size)
|
||||
|
||||
# delete rope cache to init rope instance every time
|
||||
_ROPE_DICT.clear()
|
||||
|
||||
# compare the results
|
||||
diff1, diff2 = compute_diff(baseline=ref_qk.cpu().float().detach().numpy(),
|
||||
compare=out_qk.cpu().float().detach().numpy())
|
||||
assert diff1 <= ROPE_THRESHOLD_DIFF1 and diff2 <= ROPE_THRESHOLD_DIFF2
|
||||
94
vllm-v0.6.2/tests/kernels/bt_torch_ops/test_swap_blocks.py
Normal file
94
vllm-v0.6.2/tests/kernels/bt_torch_ops/test_swap_blocks.py
Normal file
@@ -0,0 +1,94 @@
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch_mlu
|
||||
USE_CUDA=False
|
||||
USE_MLU=True
|
||||
if USE_CUDA:
|
||||
from vllm._C import cache_ops
|
||||
if USE_MLU:
|
||||
from vllm import _mlu_ops as mlu_ops
|
||||
|
||||
from typing import List, Tuple
|
||||
|
||||
DTYPES = [torch.half, torch.float]
|
||||
if "3" not in torch.mlu.get_device_name(0):
|
||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||
SEEDS = [0]
|
||||
DEVICES = [i for i in range(1 if torch.mlu.device_count() == 1 else 2)]
|
||||
num_N = [3600]
|
||||
num_C = [8]
|
||||
num_H = [32,128]
|
||||
num_W = [16]
|
||||
num_pairs = [3,256]
|
||||
cpys = ["mlu to mlu", "mlu to cpu", "cpu to mlu"]
|
||||
|
||||
class SwapBlocks(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def forward(self,
|
||||
dst: torch.Tensor,
|
||||
src: torch.Tensor,
|
||||
src_to_dst: dict):
|
||||
for key, value in src_to_dst.items():
|
||||
dst[value] = src[key]
|
||||
|
||||
@pytest.mark.parametrize("n", num_N)
|
||||
@pytest.mark.parametrize("c", num_C)
|
||||
@pytest.mark.parametrize("h", num_H)
|
||||
@pytest.mark.parametrize("w", num_W)
|
||||
@pytest.mark.parametrize("num_pair", num_pairs)
|
||||
@pytest.mark.parametrize("cpy", cpys)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
@torch.inference_mode()
|
||||
def test_copy_blocks(
|
||||
n,
|
||||
c: int,
|
||||
h: int,
|
||||
w: int,
|
||||
num_pair: int,
|
||||
cpy: str,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: int,
|
||||
) -> None:
|
||||
random.seed(seed)
|
||||
torch.random.manual_seed(seed)
|
||||
torch.mlu.manual_seed(seed)
|
||||
|
||||
if cpy == "mlu to mlu":
|
||||
src = torch.randn(n, c, h, w, dtype=dtype).mlu()
|
||||
dst = torch.randn(n, c, h, w, dtype=dtype).mlu()
|
||||
elif cpy == "mlu to cpu":
|
||||
src = torch.randn(n, c, h, w, dtype=dtype).mlu()
|
||||
dst = torch.randn(n, c, h, w, dtype=dtype).cpu()
|
||||
elif cpy == "cpu to mlu":
|
||||
src = torch.randn(n, c, h, w, dtype=dtype).cpu()
|
||||
dst = torch.randn(n, c, h, w, dtype=dtype).mlu()
|
||||
else:
|
||||
print("unkown copy direction.")
|
||||
exit(1)
|
||||
|
||||
values = list(range(num_pair))
|
||||
random.shuffle(values)
|
||||
src_to_dst = {key: value for key, value in zip(range(num_pair), values)}
|
||||
|
||||
mapping_data = []
|
||||
for k, v in src_to_dst.items():
|
||||
mapping_data.append([k, v])
|
||||
src_to_dst_tensor = torch.tensor(mapping_data, dtype=torch.int32).mlu()
|
||||
|
||||
ref_src, ref_dst = src.clone(), dst.clone()
|
||||
swap_blocks = SwapBlocks()
|
||||
# Call the swap blocks kernel.
|
||||
# cpu
|
||||
swap_blocks(ref_dst, ref_src, src_to_dst)
|
||||
# mlu
|
||||
mlu_ops.swap_blocks(dst, src, src_to_dst_tensor)
|
||||
# diff
|
||||
assert torch.allclose(src, ref_src)
|
||||
assert torch.allclose(dst, ref_dst)
|
||||
14
vllm-v0.6.2/tests/kernels/conftest.py
Normal file
14
vllm-v0.6.2/tests/kernels/conftest.py
Normal file
@@ -0,0 +1,14 @@
|
||||
import pytest
|
||||
|
||||
from vllm.utils import (create_kv_caches_with_random,
|
||||
create_kv_caches_with_random_flash)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def kv_cache_factory():
|
||||
return create_kv_caches_with_random
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def kv_cache_factory_flashinfer():
|
||||
return create_kv_caches_with_random_flash
|
||||
88
vllm-v0.6.2/tests/kernels/quant_utils.py
Normal file
88
vllm-v0.6.2/tests/kernels/quant_utils.py
Normal file
@@ -0,0 +1,88 @@
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Using the default value (240.0) from pytorch will cause accuracy
|
||||
# issue on dynamic quantization models. Here use 224.0 for rocm.
|
||||
ROCM_FP8_MAX = 224.0
|
||||
FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm() \
|
||||
else torch.float8_e4m3fn
|
||||
|
||||
|
||||
def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
|
||||
return torch.as_tensor(x, dtype=torch.float32, device='cuda')
|
||||
|
||||
def ref_dynamic_per_token_quant(x: torch.tensor,
|
||||
quant_dtype: torch.dtype,
|
||||
scale_ub: Optional[torch.tensor] = None) \
|
||||
-> Tuple[torch.tensor, torch.tensor]:
|
||||
|
||||
assert quant_dtype in [torch.int8, FP8_DTYPE]
|
||||
if scale_ub is not None:
|
||||
assert quant_dtype == FP8_DTYPE
|
||||
|
||||
qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \
|
||||
else torch.finfo(quant_dtype)
|
||||
qtype_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \
|
||||
else qtype_traits.max
|
||||
qtype_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \
|
||||
else qtype_traits.min
|
||||
qtype_max = as_float32_tensor(qtype_traits_max)
|
||||
s_1 = as_float32_tensor(1.0)
|
||||
s_512 = as_float32_tensor(512.0)
|
||||
|
||||
# For fp8, in order to match the cuda kernel output, we have to do exactly
|
||||
# the same operations as in the corresponding fp8 kernel to prevent
|
||||
# rounding errors.
|
||||
|
||||
# Compute scales
|
||||
x_token_max, _ = x.abs().max(dim=-1)
|
||||
x_token_max = as_float32_tensor(x_token_max)
|
||||
if scale_ub is not None:
|
||||
x_token_max = x_token_max.clamp(max=scale_ub)
|
||||
scales = (x_token_max / qtype_max)[:, None]
|
||||
|
||||
# Quant
|
||||
if quant_dtype == torch.int8:
|
||||
iscales = as_float32_tensor(s_1 / scales)
|
||||
torch_out = as_float32_tensor(x) * iscales
|
||||
torch_out = torch_out.round()
|
||||
torch_out = torch_out.clamp(qtype_traits_min,
|
||||
qtype_traits_max).to(quant_dtype)
|
||||
else:
|
||||
assert quant_dtype == FP8_DTYPE
|
||||
min_scaling_factor = s_1 / (qtype_max * s_512)
|
||||
scales = scales.clamp(min=min_scaling_factor)
|
||||
torch_out = as_float32_tensor(x) / scales
|
||||
torch_out = torch_out.clamp(qtype_traits_min,
|
||||
qtype_traits_max).to(quant_dtype)
|
||||
|
||||
return torch_out, scales
|
||||
|
||||
|
||||
# The int8 version is very similar. Incorporate the int8 version, like in
|
||||
# ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant
|
||||
# kernel
|
||||
def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
|
||||
-> Tuple[torch.tensor, torch.tensor]:
|
||||
|
||||
fp8_traits = torch.finfo(FP8_DTYPE)
|
||||
fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \
|
||||
else fp8_traits.max
|
||||
fp8_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \
|
||||
else fp8_traits.min
|
||||
fp8_max = as_float32_tensor(fp8_traits_max)
|
||||
one = as_float32_tensor(1.0)
|
||||
|
||||
# For fp8, in order to match the cuda kernel output, we have to do exactly
|
||||
# the same operations as in the corresponding fp8 kernel to prevent
|
||||
# rounding errors.
|
||||
|
||||
x_max = as_float32_tensor(x.abs().max())
|
||||
ref_scale = x_max / fp8_max
|
||||
ref_iscale = one / ref_scale
|
||||
ref_out = (as_float32_tensor(x) * ref_iscale).clamp(
|
||||
fp8_traits_min, fp8_traits_max).to(FP8_DTYPE)
|
||||
return ref_out, ref_scale.view((1, ))
|
||||
101
vllm-v0.6.2/tests/kernels/test_activation.py
Normal file
101
vllm-v0.6.2/tests/kernels/test_activation.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import random
|
||||
from typing import Type
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul,
|
||||
GeluAndMul, NewGELU,
|
||||
QuickGELU, SiluAndMul)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from .allclose_default import get_default_atol, get_default_rtol
|
||||
|
||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||
NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing
|
||||
D = [512, 13824] # Arbitrary values for testing
|
||||
SEEDS = [0]
|
||||
CUDA_DEVICES = [
|
||||
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("activation",
|
||||
["silu", "gelu", "gelu_tanh", "fatrelu"])
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("d", D)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@torch.inference_mode()
|
||||
def test_act_and_mul(
|
||||
activation: str,
|
||||
num_tokens: int,
|
||||
d: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
x = torch.randn(num_tokens, 2 * d, dtype=dtype)
|
||||
if activation == "silu":
|
||||
layer = SiluAndMul()
|
||||
fn = torch.ops._C.silu_and_mul
|
||||
elif activation == "gelu":
|
||||
layer = GeluAndMul(approximate="none")
|
||||
fn = torch.ops._C.gelu_and_mul
|
||||
elif activation == "gelu_tanh":
|
||||
layer = GeluAndMul(approximate="tanh")
|
||||
fn = torch.ops._C.gelu_tanh_and_mul
|
||||
elif activation == "fatrelu":
|
||||
threshold = random.uniform(0, 1)
|
||||
layer = FatreluAndMul(threshold)
|
||||
fn = torch.ops._C.fatrelu_and_mul
|
||||
out = layer(x)
|
||||
ref_out = layer.forward_native(x)
|
||||
# The SiLU, GELU and FatReLU implementations are equivalent to the native
|
||||
# PyTorch implementations, so we can do exact comparison.
|
||||
torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
|
||||
|
||||
d = x.shape[-1] // 2
|
||||
output_shape = (x.shape[:-1] + (d, ))
|
||||
out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
|
||||
if activation == "fatrelu":
|
||||
opcheck(fn, (out, x, threshold))
|
||||
else:
|
||||
opcheck(fn, (out, x))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("activation", [(FastGELU, torch.ops._C.gelu_fast),
|
||||
(NewGELU, torch.ops._C.gelu_new),
|
||||
(QuickGELU, torch.ops._C.gelu_quick)])
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("d", D)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@torch.inference_mode()
|
||||
def test_activation(
|
||||
activation: Type[torch.nn.Module],
|
||||
num_tokens: int,
|
||||
d: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
x = torch.randn(num_tokens, d, dtype=dtype)
|
||||
layer = activation[0]()
|
||||
fn = activation[1]
|
||||
out = layer(x)
|
||||
ref_out = layer.forward_native(x)
|
||||
torch.testing.assert_close(out,
|
||||
ref_out,
|
||||
atol=get_default_atol(out),
|
||||
rtol=get_default_rtol(out))
|
||||
|
||||
out = torch.empty_like(x)
|
||||
opcheck(fn, (out, x))
|
||||
90
vllm-v0.6.2/tests/kernels/test_advance_step.py
Normal file
90
vllm-v0.6.2/tests/kernels/test_advance_step.py
Normal file
@@ -0,0 +1,90 @@
|
||||
import pytest
|
||||
import torch
|
||||
import torch_mlu
|
||||
from vllm import _mlu_ops as mlu_ops
|
||||
|
||||
from typing import Tuple
|
||||
|
||||
@pytest.mark.parametrize("num_seqs, num_queries", [(20, 17), (17, 20), (256, 224)])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("TILE_SIZE", [8, 64, 256])
|
||||
@pytest.mark.parametrize("device", ["mlu"])
|
||||
def test_advance_step(num_seqs, num_queries, block_size, TILE_SIZE, device):
|
||||
if num_seqs < num_queries:
|
||||
pytest.skip(
|
||||
f"Skipping invalid case since num_seqs({num_seqs}) "
|
||||
f"is smaller than num_queries({num_queries})."
|
||||
)
|
||||
|
||||
def torch_impl(input_tokens: torch.Tensor,
|
||||
sampled_token_ids: torch.Tensor,
|
||||
input_positions: torch.Tensor,
|
||||
seq_lens: torch.Tensor,
|
||||
slot_mapping: torch.Tensor,
|
||||
block_tables: torch.Tensor,
|
||||
num_seqs: int,
|
||||
num_queries: int,
|
||||
block_size: int,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
# Get updated input_tokens.
|
||||
sampled_token_ids = sampled_token_ids[:num_queries]
|
||||
torch_input_tokens = torch.clone(input_tokens)
|
||||
torch_input_tokens[:num_queries] = sampled_token_ids
|
||||
|
||||
# Get updated seq_lens.
|
||||
torch_seq_lens = torch.clone(seq_lens)
|
||||
torch_seq_lens[:num_queries] += 1
|
||||
|
||||
# Get updated input_positions.
|
||||
torch_input_positions = torch.clone(input_positions)
|
||||
torch_input_positions[:num_queries] = torch_seq_lens[:num_queries] - 1
|
||||
|
||||
# Get updated slot_mapping.
|
||||
torch_slot_mapping = torch.clone(slot_mapping)
|
||||
block_index = torch_input_positions[:num_queries] // block_size
|
||||
block_offset = torch_input_positions[:num_queries] % block_size
|
||||
indices = [slice(0, num_queries)] + [0] * (block_tables.ndim - 1)
|
||||
intermediate_block_table = block_tables[tuple(indices)]
|
||||
slot_num = intermediate_block_table * block_size + block_offset
|
||||
torch_slot_mapping[:num_queries] = slot_num
|
||||
|
||||
return (torch_input_tokens, torch_seq_lens, torch_input_positions, torch_slot_mapping)
|
||||
|
||||
block_tables_inner_size = 2
|
||||
input_tokens = torch.zeros(num_seqs, dtype=torch.int64, device=device)
|
||||
sampled_token_ids = torch.arange(num_queries, dtype=torch.int64, device=device)
|
||||
input_positions = torch.empty(num_seqs, dtype=torch.int32, device=device)
|
||||
seq_lens = torch.ones(num_seqs, dtype=torch.int32, device=device)
|
||||
slot_mapping = torch.empty(num_seqs, dtype=torch.int32, device=device)
|
||||
block_tables = torch.arange(
|
||||
num_seqs * block_tables_inner_size,
|
||||
dtype=torch.int32,
|
||||
device=device
|
||||
).view(num_seqs, block_tables_inner_size)
|
||||
torch_input_tokens, torch_seq_lens, torch_input_positions, torch_slot_mapping = torch_impl(
|
||||
input_tokens,
|
||||
sampled_token_ids,
|
||||
input_positions,
|
||||
seq_lens,
|
||||
slot_mapping,
|
||||
block_tables,
|
||||
num_seqs,
|
||||
num_queries,
|
||||
block_size
|
||||
)
|
||||
mlu_ops.advance_step(
|
||||
num_seqs,
|
||||
num_queries,
|
||||
block_size,
|
||||
input_tokens,
|
||||
sampled_token_ids.view(-1, 1),
|
||||
input_positions,
|
||||
seq_lens,
|
||||
slot_mapping,
|
||||
block_tables,
|
||||
TILE_SIZE=TILE_SIZE
|
||||
)
|
||||
assert torch.allclose(torch_input_tokens, input_tokens)
|
||||
assert torch.allclose(torch_seq_lens, seq_lens)
|
||||
assert torch.allclose(torch_input_positions, input_positions)
|
||||
assert torch.allclose(torch_slot_mapping, slot_mapping)
|
||||
37
vllm-v0.6.2/tests/kernels/test_aqlm.py
Normal file
37
vllm-v0.6.2/tests/kernels/test_aqlm.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm import _custom_ops as ops # noqa: F401
|
||||
|
||||
|
||||
def test_aqlm_dequant_opcheck():
|
||||
codes = torch.randint(-32768,
|
||||
32767, (22016, 512, 1),
|
||||
device='cuda',
|
||||
dtype=torch.int16)
|
||||
codebooks = torch.rand((2, 65536, 1, 8),
|
||||
device='cuda',
|
||||
dtype=torch.float16)
|
||||
codebook_partition_sizes = [11008, 11008]
|
||||
|
||||
opcheck(torch.ops._C.aqlm_dequant,
|
||||
(codes, codebooks, codebook_partition_sizes))
|
||||
|
||||
|
||||
def test_aqlm_gemm_opcheck():
|
||||
input = torch.rand((4, 4096), device='cuda', dtype=torch.float16)
|
||||
codes = torch.randint(-32768,
|
||||
32767, (12288, 512, 1),
|
||||
device='cuda',
|
||||
dtype=torch.int16)
|
||||
codebooks = torch.rand((3, 65536, 1, 8),
|
||||
device='cuda',
|
||||
dtype=torch.float16)
|
||||
scales = torch.rand((12288, 1, 1, 1), device='cuda', dtype=torch.float16)
|
||||
codebook_partition_sizes = [4096, 4096, 4096]
|
||||
bias = None
|
||||
|
||||
opcheck(torch.ops._C.aqlm_gemm,
|
||||
(input, codes, codebooks, scales, codebook_partition_sizes, None))
|
||||
opcheck(torch.ops._C.aqlm_gemm,
|
||||
(input, codes, codebooks, scales, codebook_partition_sizes, bias))
|
||||
433
vllm-v0.6.2/tests/kernels/test_attention.py
Normal file
433
vllm-v0.6.2/tests/kernels/test_attention.py
Normal file
@@ -0,0 +1,433 @@
|
||||
import random
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import get_max_shared_memory_bytes
|
||||
|
||||
from .allclose_default import get_default_atol, get_default_rtol
|
||||
|
||||
if not current_platform.is_rocm():
|
||||
from xformers import ops as xops
|
||||
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
|
||||
|
||||
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
|
||||
# This will change depending on the compute capability.
|
||||
# - 512 as a buffer
|
||||
MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
|
||||
# There may not be enough gpu memory due to large NUM_BLOCKS.
|
||||
# Reduce NUM_BLOCKS when it happens.
|
||||
NUM_BLOCKS = 4321 # Arbitrary values for testing
|
||||
PARTITION_SIZE = 512
|
||||
# flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16}
|
||||
DTYPES = [
|
||||
torch.half, torch.bfloat16, torch.float
|
||||
] if not current_platform.is_rocm() else [torch.half, torch.bfloat16]
|
||||
NUM_GEN_SEQS = [7] # Arbitrary values for testing
|
||||
NUM_PREFILL_SEQS = [3] # Arbitrary values for testing
|
||||
NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
|
||||
|
||||
# FlashAttention forward only supports head dimension at most 128
|
||||
# https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
|
||||
HEAD_SIZES = [64, 80, 120, 256]
|
||||
|
||||
BLOCK_SIZES = [16, 32]
|
||||
USE_ALIBI = [False, True]
|
||||
KV_CACHE_DTYPE = ["auto", "fp8"]
|
||||
SEEDS = [0]
|
||||
CUDA_DEVICES = [
|
||||
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||
]
|
||||
|
||||
|
||||
def ref_masked_attention(
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
scale: float,
|
||||
attn_mask: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
|
||||
if attn_mask is not None:
|
||||
attn_weights = attn_weights + attn_mask.float()
|
||||
attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)
|
||||
out = torch.einsum("hqk,khd->qhd", attn_weights, value)
|
||||
return out
|
||||
|
||||
|
||||
def ref_single_query_cached_kv_attention(
|
||||
output: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
num_queries_per_kv: int,
|
||||
key_cache: torch.Tensor,
|
||||
value_cache: torch.Tensor,
|
||||
block_tables: torch.Tensor,
|
||||
seq_lens: torch.Tensor,
|
||||
scale: float,
|
||||
alibi_slopes: Optional[torch.Tensor],
|
||||
) -> None:
|
||||
num_query_heads = query.shape[1]
|
||||
num_kv_heads = value_cache.shape[1]
|
||||
head_size = value_cache.shape[2]
|
||||
block_size = value_cache.shape[3]
|
||||
num_seqs = query.shape[0]
|
||||
|
||||
block_tables_lst = block_tables.cpu().tolist()
|
||||
seq_lens_lst = seq_lens.cpu().tolist()
|
||||
for i in range(num_seqs):
|
||||
q = query[i].unsqueeze(0)
|
||||
block_table = block_tables_lst[i]
|
||||
seq_len = int(seq_lens_lst[i])
|
||||
|
||||
keys_lst: List[torch.Tensor] = []
|
||||
values_lst: List[torch.Tensor] = []
|
||||
for j in range(seq_len):
|
||||
block_number = int(block_table[j // block_size])
|
||||
block_offset = j % block_size
|
||||
|
||||
k = key_cache[block_number, :, :, block_offset, :]
|
||||
k = k.reshape(num_kv_heads, head_size)
|
||||
keys_lst.append(k)
|
||||
|
||||
v = value_cache[block_number, :, :, block_offset]
|
||||
values_lst.append(v)
|
||||
keys = torch.stack(keys_lst, dim=0)
|
||||
values = torch.stack(values_lst, dim=0)
|
||||
if num_queries_per_kv > 1:
|
||||
# Handle MQA and GQA
|
||||
keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)
|
||||
values = torch.repeat_interleave(values, num_queries_per_kv, dim=1)
|
||||
|
||||
alibi_bias = None
|
||||
if alibi_slopes is not None:
|
||||
# Create the ALiBi bias used in the paged attention kernel.
|
||||
position_ids = torch.arange(seq_len).int()
|
||||
alibi_bias = (position_ids - seq_len + 1).float()
|
||||
alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
|
||||
1, 1, -1)
|
||||
|
||||
out = ref_masked_attention(q, keys, values, scale, alibi_bias)
|
||||
out = out.view(num_query_heads, head_size)
|
||||
output[i].copy_(out, non_blocking=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"version",
|
||||
["v1", "v2"] if not current_platform.is_rocm() else ["v1", "v2", "rocm"])
|
||||
@pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("use_alibi", USE_ALIBI)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
def test_paged_attention(
|
||||
kv_cache_factory,
|
||||
version: str,
|
||||
num_seqs: int,
|
||||
num_heads: Tuple[int, int],
|
||||
head_size: int,
|
||||
use_alibi: bool,
|
||||
block_size: int,
|
||||
dtype: torch.dtype,
|
||||
kv_cache_dtype: str,
|
||||
seed: int,
|
||||
device: str,
|
||||
) -> None:
|
||||
if ((kv_cache_dtype == "fp8" and head_size % 16)
|
||||
or (version == "rocm" and head_size not in (64, 128))):
|
||||
pytest.skip()
|
||||
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
scale = float(1.0 / (head_size**0.5))
|
||||
num_query_heads, num_kv_heads = num_heads
|
||||
query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype)
|
||||
query.uniform_(-scale, scale)
|
||||
|
||||
assert num_query_heads % num_kv_heads == 0
|
||||
num_queries_per_kv = num_query_heads // num_kv_heads
|
||||
alibi_slopes = None
|
||||
if use_alibi:
|
||||
alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
|
||||
|
||||
seq_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
|
||||
seq_lens[-1] = MAX_SEQ_LEN
|
||||
max_seq_len = max(seq_lens)
|
||||
seq_lens = torch.tensor(seq_lens, dtype=torch.int)
|
||||
|
||||
# Create the block tables.
|
||||
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
|
||||
block_tables_lst: List[List[int]] = []
|
||||
for _ in range(num_seqs):
|
||||
block_table = [
|
||||
random.randint(0, NUM_BLOCKS - 1)
|
||||
for _ in range(max_num_blocks_per_seq)
|
||||
]
|
||||
block_tables_lst.append(block_table)
|
||||
|
||||
block_tables = torch.tensor(block_tables_lst, dtype=torch.int)
|
||||
|
||||
# Create the KV caches.
|
||||
key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
|
||||
num_kv_heads, head_size,
|
||||
kv_cache_dtype, dtype, seed,
|
||||
device)
|
||||
key_cache, value_cache = key_caches[0], value_caches[0]
|
||||
|
||||
# Using default kv_scale
|
||||
k_scale = v_scale = 1.0
|
||||
|
||||
# Call the paged attention kernel.
|
||||
output = torch.empty_like(query)
|
||||
if version == "v1":
|
||||
ops.paged_attention_v1(
|
||||
output,
|
||||
query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
num_kv_heads,
|
||||
scale,
|
||||
block_tables,
|
||||
seq_lens,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
alibi_slopes,
|
||||
kv_cache_dtype,
|
||||
k_scale,
|
||||
v_scale,
|
||||
)
|
||||
|
||||
opcheck(torch.ops._C.paged_attention_v1,
|
||||
(output, query, key_cache, value_cache, num_kv_heads, scale,
|
||||
block_tables, seq_lens, block_size, max_seq_len, alibi_slopes,
|
||||
kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
|
||||
cond=(head_size == HEAD_SIZES[0]
|
||||
and block_size == BLOCK_SIZES[0]))
|
||||
|
||||
elif version in ("v2", "rocm"):
|
||||
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
|
||||
assert PARTITION_SIZE % block_size == 0
|
||||
num_seqs, num_heads, head_size = output.shape
|
||||
tmp_output = torch.empty(
|
||||
size=(num_seqs, num_heads, num_partitions, head_size),
|
||||
dtype=output.dtype,
|
||||
)
|
||||
exp_sums = torch.empty(
|
||||
size=(num_seqs, num_heads, num_partitions),
|
||||
dtype=torch.float32,
|
||||
)
|
||||
max_logits = torch.empty_like(exp_sums)
|
||||
if version == "v2":
|
||||
ops.paged_attention_v2(
|
||||
output,
|
||||
exp_sums,
|
||||
max_logits,
|
||||
tmp_output,
|
||||
query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
num_kv_heads,
|
||||
scale,
|
||||
block_tables,
|
||||
seq_lens,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
alibi_slopes,
|
||||
kv_cache_dtype,
|
||||
k_scale,
|
||||
v_scale,
|
||||
)
|
||||
|
||||
opcheck(torch.ops._C.paged_attention_v2,
|
||||
(output, exp_sums, max_logits, tmp_output, query,
|
||||
key_cache, value_cache, num_kv_heads, scale, block_tables,
|
||||
seq_lens, block_size, max_seq_len, alibi_slopes,
|
||||
kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
|
||||
cond=(head_size == HEAD_SIZES[0]
|
||||
and block_size == BLOCK_SIZES[0]))
|
||||
|
||||
else:
|
||||
ops.paged_attention_rocm(
|
||||
output,
|
||||
exp_sums,
|
||||
max_logits,
|
||||
tmp_output,
|
||||
query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
num_kv_heads,
|
||||
scale,
|
||||
block_tables,
|
||||
seq_lens,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
alibi_slopes,
|
||||
kv_cache_dtype,
|
||||
k_scale,
|
||||
v_scale,
|
||||
)
|
||||
|
||||
opcheck(torch.ops._rocm_C.paged_attention,
|
||||
(output, exp_sums, max_logits, tmp_output, query,
|
||||
key_cache, value_cache, num_kv_heads, scale, block_tables,
|
||||
seq_lens, block_size, max_seq_len, alibi_slopes,
|
||||
kv_cache_dtype, k_scale, v_scale),
|
||||
cond=(head_size == HEAD_SIZES[0]
|
||||
and block_size == BLOCK_SIZES[0]))
|
||||
|
||||
else:
|
||||
raise AssertionError(f"Unknown version: {version}")
|
||||
|
||||
# Run the reference implementation.
|
||||
if kv_cache_dtype == "fp8":
|
||||
# Convert cache data back to dtype.
|
||||
x = 16 // torch.tensor([], dtype=dtype).element_size()
|
||||
key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x,
|
||||
block_size, x)
|
||||
dequantized_key_cache = torch.empty(size=key_cache_shape,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
ops.convert_fp8(dequantized_key_cache, key_cache)
|
||||
key_cache = dequantized_key_cache
|
||||
|
||||
value_cache_shape = value_cache.shape
|
||||
dequantized_value_cache = torch.empty(size=value_cache_shape,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
ops.convert_fp8(dequantized_value_cache, value_cache)
|
||||
value_cache = dequantized_value_cache
|
||||
|
||||
ref_output = torch.empty_like(query)
|
||||
ref_single_query_cached_kv_attention(
|
||||
ref_output,
|
||||
query,
|
||||
num_queries_per_kv,
|
||||
key_cache,
|
||||
value_cache,
|
||||
block_tables,
|
||||
seq_lens,
|
||||
scale,
|
||||
alibi_slopes,
|
||||
)
|
||||
|
||||
# NOTE(woosuk): Due to the kernel-level differences in the two
|
||||
# implementations, there is a small numerical difference in the two
|
||||
# outputs. Thus, we use a relaxed tolerance for the test.
|
||||
atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3
|
||||
rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5
|
||||
|
||||
# NOTE(zhaoyang): FP8 KV Cache will introduce quantization error,
|
||||
# so we use a relaxed tolerance for the test.
|
||||
atol, rtol = 1e-3, 1e-5
|
||||
if kv_cache_dtype == "fp8":
|
||||
atol, rtol = 1e-2, 1e-5
|
||||
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
|
||||
|
||||
|
||||
def ref_multi_query_kv_attention(
|
||||
cu_seq_lens: List[int],
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
scale: float,
|
||||
dtype: torch.dtype,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(cu_seq_lens) - 1
|
||||
ref_outputs: List[torch.Tensor] = []
|
||||
for i in range(num_seqs):
|
||||
start_idx = cu_seq_lens[i]
|
||||
end_idx = cu_seq_lens[i + 1]
|
||||
seq_len = end_idx - start_idx
|
||||
|
||||
# Create attention mask.
|
||||
attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype),
|
||||
diagonal=1)
|
||||
attn_mask = attn_mask * torch.finfo(dtype).min
|
||||
attn_mask = attn_mask.to(dtype=dtype)
|
||||
|
||||
ref_output = ref_masked_attention(
|
||||
query[start_idx:end_idx],
|
||||
key[start_idx:end_idx],
|
||||
value[start_idx:end_idx],
|
||||
scale,
|
||||
attn_mask=attn_mask,
|
||||
)
|
||||
ref_outputs.append(ref_output)
|
||||
|
||||
return torch.cat(ref_outputs, dim=0)
|
||||
|
||||
|
||||
# TODO(woosuk): Add tests for USE_ALIBI=True.
|
||||
@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.skipif(current_platform.is_rocm(),
|
||||
reason="Xformers backend is not supported on ROCm.")
|
||||
@torch.inference_mode()
|
||||
def test_multi_query_kv_attention(
|
||||
num_seqs: int,
|
||||
num_heads: Tuple[int, int],
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
|
||||
# As the xformers library is already tested with its own tests, we can use
|
||||
# a smaller MAX_SEQ_LEN here.
|
||||
max_len = min(MAX_SEQ_LEN, 4096)
|
||||
seq_lens = random.sample(range(1, max_len), num_seqs)
|
||||
num_tokens = sum(seq_lens)
|
||||
|
||||
scale = float(1.0 / (head_size**0.5))
|
||||
num_query_heads, num_kv_heads = num_heads
|
||||
qkv = torch.empty(num_tokens,
|
||||
num_query_heads + 2 * num_kv_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
qkv.uniform_(-scale, scale)
|
||||
query, key, value = qkv.split(
|
||||
[num_query_heads, num_kv_heads, num_kv_heads], dim=1)
|
||||
|
||||
num_queries_per_kv = num_query_heads // num_kv_heads
|
||||
if num_queries_per_kv > 1:
|
||||
# Handle MQA and GQA
|
||||
key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
|
||||
value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
|
||||
attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
|
||||
output = xops.memory_efficient_attention_forward(
|
||||
query.unsqueeze(0),
|
||||
key.unsqueeze(0),
|
||||
value.unsqueeze(0),
|
||||
attn_bias=attn_bias,
|
||||
p=0.0,
|
||||
scale=scale,
|
||||
)
|
||||
output = output.squeeze(0)
|
||||
|
||||
cu_seq_lens = [0]
|
||||
for seq_len in seq_lens:
|
||||
cu_seq_lens.append(cu_seq_lens[-1] + seq_len)
|
||||
ref_output = ref_multi_query_kv_attention(
|
||||
cu_seq_lens,
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
scale,
|
||||
dtype,
|
||||
)
|
||||
atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3
|
||||
rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5
|
||||
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
|
||||
87
vllm-v0.6.2/tests/kernels/test_attention_selector.py
Normal file
87
vllm-v0.6.2/tests/kernels/test_attention_selector.py
Normal file
@@ -0,0 +1,87 @@
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import override_backend_env_variable
|
||||
from vllm.attention.selector import which_attn_to_use
|
||||
from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
|
||||
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
|
||||
def test_env(name: str, device: str, monkeypatch):
|
||||
"""Test that the attention selector can be set via environment variable.
|
||||
Note that we do not test FlashAttn because it is the default backend.
|
||||
"""
|
||||
|
||||
override_backend_env_variable(monkeypatch, name)
|
||||
|
||||
if device == "cpu":
|
||||
with patch("vllm.attention.selector.current_platform.is_cpu",
|
||||
return_value=True):
|
||||
backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
|
||||
False)
|
||||
assert backend.name == "TORCH_SDPA"
|
||||
elif device == "hip":
|
||||
with patch("vllm.attention.selector.current_platform.is_rocm",
|
||||
return_value=True):
|
||||
backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
|
||||
False)
|
||||
assert backend.name == "ROCM_FLASH"
|
||||
elif device == "openvino":
|
||||
with patch("vllm.attention.selector.current_platform.is_openvino",
|
||||
return_value=True):
|
||||
backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
|
||||
False)
|
||||
assert backend.name == "OPENVINO"
|
||||
else:
|
||||
backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
|
||||
False)
|
||||
assert backend.name == name
|
||||
|
||||
|
||||
def test_flash_attn(monkeypatch):
|
||||
"""Test FlashAttn validation."""
|
||||
# TODO: When testing for v1, pipe in `use_v1` as an argument to
|
||||
# which_attn_to_use
|
||||
|
||||
override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
|
||||
|
||||
# Unsupported CUDA arch
|
||||
with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
|
||||
backend = which_attn_to_use(16, torch.float16, None, 16, False)
|
||||
assert backend.name != STR_FLASH_ATTN_VAL
|
||||
|
||||
# Unsupported data type
|
||||
backend = which_attn_to_use(16, torch.float8_e4m3fn, None, 16, False)
|
||||
assert backend.name != STR_FLASH_ATTN_VAL
|
||||
|
||||
# Unsupported kv cache data type
|
||||
backend = which_attn_to_use(16, torch.float16, "fp8", 16, False)
|
||||
assert backend.name != STR_FLASH_ATTN_VAL
|
||||
|
||||
# Unsupported block size
|
||||
backend = which_attn_to_use(16, torch.float16, None, 8, False)
|
||||
assert backend.name != STR_FLASH_ATTN_VAL
|
||||
|
||||
# flash-attn is not installed
|
||||
with patch.dict('sys.modules', {'vllm_flash_attn': None}):
|
||||
backend = which_attn_to_use(16, torch.float16, None, 16, False)
|
||||
assert backend.name != STR_FLASH_ATTN_VAL
|
||||
|
||||
# Unsupported head size
|
||||
backend = which_attn_to_use(17, torch.float16, None, 16, False)
|
||||
assert backend.name != STR_FLASH_ATTN_VAL
|
||||
|
||||
# Attention-free models should bypass env and use PlaceholderAttention
|
||||
backend = which_attn_to_use(16, torch.float16, torch.float16, 16, True)
|
||||
assert backend.name != STR_FLASH_ATTN_VAL
|
||||
|
||||
|
||||
def test_invalid_env(monkeypatch):
|
||||
"""Throw an exception if the backend name is invalid."""
|
||||
override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
|
||||
with pytest.raises(ValueError):
|
||||
which_attn_to_use(16, torch.float16, None, 16, False)
|
||||
43
vllm-v0.6.2/tests/kernels/test_awq.py
Normal file
43
vllm-v0.6.2/tests/kernels/test_awq.py
Normal file
@@ -0,0 +1,43 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm import _custom_ops as ops # noqa: F401
|
||||
|
||||
|
||||
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
|
||||
reason="AWQ is not supported on this GPU type.")
|
||||
def test_awq_dequantize_opcheck():
|
||||
os.environ["VLLM_USE_TRITON_AWQ"] = "0"
|
||||
qweight = torch.randint(-2000000000,
|
||||
2000000000, (8192, 256),
|
||||
device='cuda',
|
||||
dtype=torch.int32)
|
||||
scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16)
|
||||
zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32)
|
||||
split_k_iters = 0
|
||||
thx = 0
|
||||
thy = 0
|
||||
opcheck(torch.ops._C.awq_dequantize,
|
||||
(qweight, scales, zeros, split_k_iters, thx, thy))
|
||||
|
||||
|
||||
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
|
||||
reason="AWQ is not supported on this GPU type.")
|
||||
def test_awq_gemm_opcheck():
|
||||
os.environ["VLLM_USE_TRITON_AWQ"] = "0"
|
||||
input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
|
||||
qweight = torch.randint(-2000000000,
|
||||
2000000000, (8192, 256),
|
||||
device='cuda',
|
||||
dtype=torch.int32)
|
||||
scales = torch.randint(-2000000000,
|
||||
2000000000, (64, 256),
|
||||
device='cuda',
|
||||
dtype=torch.int32)
|
||||
qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16)
|
||||
split_k_iters = 8
|
||||
opcheck(torch.ops._C.awq_gemm,
|
||||
(input, qweight, qzeros, scales, split_k_iters))
|
||||
167
vllm-v0.6.2/tests/kernels/test_awq_marlin.py
Normal file
167
vllm-v0.6.2/tests/kernels/test_awq_marlin.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""Test AWQ with fused MoE Marlin kernels.
|
||||
|
||||
Run `pytest tests/kernels/test_awq_marlin.py`.
|
||||
"""
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.layers.fused_moe # noqa
|
||||
from tests.kernels.utils import (compute_max_diff, stack_and_dev, torch_moe,
|
||||
torch_moe_single)
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
|
||||
awq_marlin_quantize)
|
||||
from vllm.scalar_type import scalar_types
|
||||
|
||||
NUM_EXPERTS = [8, 64]
|
||||
TOP_KS = [2, 6]
|
||||
GROUP_SIZES = [-1, 32, 128]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", [1, 33, 64, 222])
|
||||
@pytest.mark.parametrize("n", [128, 2048])
|
||||
@pytest.mark.parametrize("k", [128, 1024])
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("group_size", GROUP_SIZES)
|
||||
@pytest.mark.skipif(not (ops.supports_moe_ops
|
||||
and hasattr(torch.ops._moe_C, "marlin_gemm_moe")),
|
||||
reason="Marlin is not supported on this GPU type.")
|
||||
def test_fused_marlin_moe_awq(
|
||||
m: int,
|
||||
n: int,
|
||||
k: int,
|
||||
e: int,
|
||||
topk: int,
|
||||
group_size: int,
|
||||
):
|
||||
torch.manual_seed(7)
|
||||
|
||||
num_bits = 4
|
||||
quant_type = scalar_types.uint4
|
||||
dtype = torch.float16
|
||||
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
|
||||
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
|
||||
|
||||
w_ref1_l = []
|
||||
qweights1_l = []
|
||||
scales1_l = []
|
||||
zp1_l = []
|
||||
|
||||
for i in range(w1.shape[0]):
|
||||
w_ref1, qweight1, scales1, zp1 = awq_marlin_quantize(
|
||||
w1[i].transpose(1, 0), quant_type, group_size)
|
||||
w_ref1_l.append(w_ref1)
|
||||
qweights1_l.append(qweight1)
|
||||
scales1_l.append(scales1)
|
||||
zp1_l.append(zp1)
|
||||
|
||||
w_ref1 = stack_and_dev(w_ref1_l)
|
||||
qweight1 = stack_and_dev(qweights1_l).contiguous()
|
||||
scales1 = stack_and_dev(scales1_l)
|
||||
zp1 = stack_and_dev(zp1_l)
|
||||
|
||||
w_ref2_l = []
|
||||
qweights2_l = []
|
||||
scales2_l = []
|
||||
zp2_l = []
|
||||
|
||||
for i in range(w2.shape[0]):
|
||||
w_ref2, qweight2, scales2, zp2 = awq_marlin_quantize(
|
||||
w2[i].transpose(1, 0), quant_type, group_size)
|
||||
w_ref2_l.append(w_ref2)
|
||||
qweights2_l.append(qweight2)
|
||||
scales2_l.append(scales2)
|
||||
zp2_l.append(zp2)
|
||||
|
||||
w_ref2 = stack_and_dev(w_ref2_l)
|
||||
qweight2 = stack_and_dev(qweights2_l).contiguous()
|
||||
scales2 = stack_and_dev(scales2_l)
|
||||
zp2 = stack_and_dev(zp2_l)
|
||||
|
||||
score = torch.randn((m, e), device="cuda", dtype=dtype)
|
||||
|
||||
topk_weights, topk_ids = fused_topk(a, score, topk, False)
|
||||
marlin_output = torch.ops.vllm.fused_marlin_moe(
|
||||
a,
|
||||
qweight1,
|
||||
qweight2,
|
||||
scales1,
|
||||
scales2,
|
||||
score,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
w1_zeros=zp1,
|
||||
w2_zeros=zp2,
|
||||
num_bits=num_bits,
|
||||
)
|
||||
|
||||
torch_output = torch_moe(
|
||||
a,
|
||||
w_ref1.transpose(1, 2),
|
||||
w_ref2.transpose(1, 2),
|
||||
score,
|
||||
topk,
|
||||
)
|
||||
|
||||
assert compute_max_diff(marlin_output, torch_output) < 4e-2
|
||||
|
||||
|
||||
@pytest.mark.skip("This test is here for the sake of debugging, "
|
||||
"don't run it in automated tests.")
|
||||
@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
|
||||
@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
|
||||
@pytest.mark.parametrize("k", [128, 1024, 512])
|
||||
@pytest.mark.parametrize("e", [8, 64])
|
||||
@pytest.mark.parametrize("topk", [2, 6])
|
||||
@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
|
||||
def test_single_marlin_moe_multiply_awq(
|
||||
m: int,
|
||||
n: int,
|
||||
k: int,
|
||||
e: int,
|
||||
topk: int,
|
||||
group_size: int,
|
||||
):
|
||||
torch.manual_seed(7)
|
||||
|
||||
num_bits = 4
|
||||
quant_type = scalar_types.uint4
|
||||
dtype = torch.float16
|
||||
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||
w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
|
||||
|
||||
w_ref_l = []
|
||||
qweights_l = []
|
||||
scales_l = []
|
||||
zp_l = []
|
||||
|
||||
for i in range(w.shape[0]):
|
||||
w_ref, qweight, scales, zp = awq_marlin_quantize(
|
||||
w[i].transpose(1, 0), quant_type, group_size)
|
||||
w_ref_l.append(w_ref)
|
||||
qweights_l.append(qweight)
|
||||
scales_l.append(scales)
|
||||
zp_l.append(zp)
|
||||
|
||||
w_ref = stack_and_dev(w_ref_l)
|
||||
qweight = stack_and_dev(qweights_l).contiguous()
|
||||
scales = stack_and_dev(scales_l).contiguous()
|
||||
zp = stack_and_dev(zp_l).contiguous()
|
||||
|
||||
score = torch.randn((m, e), device="cuda", dtype=dtype)
|
||||
|
||||
marlin_output = torch.ops.vllm.single_marlin_moe(a,
|
||||
qweight,
|
||||
scales,
|
||||
score,
|
||||
topk,
|
||||
renormalize=False,
|
||||
w_zeros=zp,
|
||||
num_bits=num_bits)
|
||||
|
||||
torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
|
||||
|
||||
assert compute_max_diff(marlin_output, torch_output) < 1e-2
|
||||
170
vllm-v0.6.2/tests/kernels/test_awq_triton.py
Normal file
170
vllm-v0.6.2/tests/kernels/test_awq_triton.py
Normal file
@@ -0,0 +1,170 @@
|
||||
"""Tests for the AWQ Triton kernel.
|
||||
|
||||
Run `pytest tests/kernels/test_awq_triton.py`.
|
||||
"""
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.quantization.awq_triton import (
|
||||
AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
device = "cuda"
|
||||
|
||||
|
||||
def reverse_awq_order(t: torch.Tensor):
|
||||
bits = 4
|
||||
AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
|
||||
reverse_order_tensor = torch.arange(
|
||||
t.shape[-1],
|
||||
dtype=torch.int32,
|
||||
device=t.device,
|
||||
)
|
||||
reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits)
|
||||
reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER]
|
||||
reverse_order_tensor = reverse_order_tensor.view(-1)
|
||||
|
||||
t = t[:, reverse_order_tensor] & 0xF
|
||||
return t
|
||||
|
||||
|
||||
# qweights - [R , C // 8], int32
|
||||
# scales - [R // G, C ], float16
|
||||
# zeros - [R // G, C // 8], int32
|
||||
def awq_dequantize_torch(qweight: torch.Tensor, scales: torch.Tensor,
|
||||
qzeros: torch.Tensor,
|
||||
group_size: int) -> torch.Tensor:
|
||||
|
||||
if group_size == -1:
|
||||
group_size = qweight.shape[0]
|
||||
|
||||
bits = 4
|
||||
shifts = torch.arange(0, 32, bits, device=qzeros.device)
|
||||
|
||||
iweights = torch.bitwise_right_shift(qweight[:, :, None],
|
||||
shifts[None, None, :]).to(torch.int8)
|
||||
|
||||
iweights = iweights.view(iweights.shape[0], -1)
|
||||
|
||||
zeros = torch.bitwise_right_shift(qzeros[:, :, None],
|
||||
shifts[None, None, :]).to(torch.int8)
|
||||
zeros = zeros.view(qzeros.shape[0], -1)
|
||||
zeros = reverse_awq_order(zeros)
|
||||
|
||||
iweights = reverse_awq_order(iweights)
|
||||
|
||||
iweights = torch.bitwise_and(iweights, (2**bits) - 1)
|
||||
zeros = torch.bitwise_and(zeros, (2**bits) - 1)
|
||||
|
||||
scales = scales.repeat_interleave(group_size, dim=0)
|
||||
zeros = zeros.repeat_interleave(group_size, dim=0)
|
||||
return (iweights - zeros) * scales
|
||||
|
||||
|
||||
# qweights - [R , C // 8], int32
|
||||
# scales - [R // G, C ], float16
|
||||
# zeros - [R // G, C // 8], int32
|
||||
@pytest.mark.parametrize("qweight_rows", [3584, 18944, 128, 256, 512, 1024])
|
||||
@pytest.mark.parametrize("qweight_cols", [448, 576, 4736, 16, 32, 64, 128])
|
||||
@pytest.mark.parametrize("group_size", AWQ_TRITON_SUPPORTED_GROUP_SIZES)
|
||||
def test_dequantize(qweight_rows, qweight_cols, group_size):
|
||||
|
||||
if group_size == -1:
|
||||
group_size = qweight_rows
|
||||
|
||||
qweight_dtype = torch.int32
|
||||
scales_rows = qweight_rows // group_size
|
||||
scales_cols = qweight_cols * 8
|
||||
scales_dtype = torch.float16
|
||||
zeros_rows = scales_rows
|
||||
zeros_cols = qweight_cols
|
||||
zeros_dtype = torch.int32
|
||||
|
||||
current_platform.seed_everything(0)
|
||||
|
||||
qweight = torch.randint(0,
|
||||
torch.iinfo(torch.int32).max,
|
||||
(qweight_rows, qweight_cols),
|
||||
dtype=qweight_dtype,
|
||||
device=device)
|
||||
scales = torch.rand(scales_rows,
|
||||
scales_cols,
|
||||
dtype=scales_dtype,
|
||||
device=device)
|
||||
zeros = torch.randint(0,
|
||||
torch.iinfo(torch.int32).max,
|
||||
(zeros_rows, zeros_cols),
|
||||
dtype=zeros_dtype,
|
||||
device=device)
|
||||
|
||||
iweights_triton = awq_dequantize_triton(qweight, scales, zeros)
|
||||
|
||||
assert (not torch.any(torch.isinf(iweights_triton))
|
||||
and not torch.any(torch.isnan(iweights_triton)))
|
||||
|
||||
iweights_torch = awq_dequantize_torch(qweight, scales, zeros, group_size)
|
||||
|
||||
torch.testing.assert_close(iweights_triton, iweights_torch)
|
||||
|
||||
|
||||
# input - [N, K]
|
||||
# qweight - [K, M // 8]
|
||||
# qzeros - [K // G, M // 8]
|
||||
# scales - [K // G, M]
|
||||
@pytest.mark.parametrize("N", [1, 2, 4, 8, 14, 17, 23, 32])
|
||||
@pytest.mark.parametrize("K", [128])
|
||||
@pytest.mark.parametrize("M", [16, 24, 32])
|
||||
@pytest.mark.parametrize("group_size", AWQ_TRITON_SUPPORTED_GROUP_SIZES)
|
||||
@pytest.mark.parametrize("splitK", [1, 8])
|
||||
def test_gemm(N, K, M, splitK, group_size):
|
||||
|
||||
if group_size == -1:
|
||||
group_size = K
|
||||
|
||||
split_k_iters = splitK
|
||||
|
||||
input_rows = N
|
||||
input_cols = K
|
||||
input_dtype = torch.float32
|
||||
qweight_rows = input_cols
|
||||
qweight_cols = M // 8
|
||||
scales_rows = qweight_rows // group_size
|
||||
scales_cols = M
|
||||
scales_dtype = torch.float32
|
||||
qzeros_rows = scales_rows
|
||||
qzeros_cols = qweight_cols
|
||||
|
||||
current_platform.seed_everything(0)
|
||||
|
||||
input = torch.rand((input_rows, input_cols),
|
||||
dtype=input_dtype,
|
||||
device=device)
|
||||
qweight = torch.randint(0,
|
||||
torch.iinfo(torch.int32).max,
|
||||
(qweight_rows, qweight_cols),
|
||||
device=device)
|
||||
qzeros = torch.randint(0,
|
||||
torch.iinfo(torch.int32).max,
|
||||
(qzeros_rows, qzeros_cols),
|
||||
device=device)
|
||||
scales = torch.rand((scales_rows, scales_cols),
|
||||
dtype=scales_dtype,
|
||||
device=device)
|
||||
|
||||
output_triton = awq_gemm_triton(input, qweight, scales, qzeros,
|
||||
split_k_iters)
|
||||
|
||||
assert (not torch.any(torch.isinf(output_triton))
|
||||
and not torch.any(torch.isnan(output_triton)))
|
||||
|
||||
dequantized_weights = awq_dequantize_triton(qweight, scales, qzeros)
|
||||
|
||||
output_torch = torch.matmul(input, dequantized_weights)
|
||||
|
||||
assert (not torch.any(torch.isinf(output_torch))
|
||||
and not torch.any(torch.isnan(output_torch)))
|
||||
|
||||
torch.testing.assert_close(output_triton.cpu(),
|
||||
output_torch.cpu(),
|
||||
atol=1e-1,
|
||||
rtol=1e-1)
|
||||
439
vllm-v0.6.2/tests/kernels/test_blocksparse_attention.py
Normal file
439
vllm-v0.6.2/tests/kernels/test_blocksparse_attention.py
Normal file
@@ -0,0 +1,439 @@
|
||||
import random
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.attention.ops.blocksparse_attention.interface import (
|
||||
LocalStridedBlockSparseAttn)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import get_max_shared_memory_bytes
|
||||
|
||||
from .allclose_default import get_default_atol, get_default_rtol
|
||||
|
||||
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
|
||||
# This will change depending on the compute capability.
|
||||
# - 512 as a buffer
|
||||
MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
|
||||
# MAX_SEQ_LEN = 2771
|
||||
|
||||
# There may not be enough gpu memory due to large NUM_BLOCKS.
|
||||
# Reduce NUM_BLOCKS when it happens.
|
||||
NUM_BLOCKS = 4321 # Arbitrary values for testing
|
||||
PARTITION_SIZE = 512
|
||||
DTYPES = [torch.half, torch.bfloat16]
|
||||
NUM_GEN_SEQS = [3] # Arbitrary values for testing
|
||||
NUM_PREFILL_SEQS = [3] # Arbitrary values for testing
|
||||
NUM_HEADS = [(40, 40)] # Arbitrary values for testing
|
||||
|
||||
HEAD_SIZES = [64, 112]
|
||||
BLOCK_SIZES = [16]
|
||||
USE_ALIBI = [False, True]
|
||||
KV_CACHE_DTYPE = ["auto", "fp8"]
|
||||
SEEDS = [0]
|
||||
CUDA_DEVICES = ['cuda:0']
|
||||
BLOCKSPARSE_LOCAL_BLOCKS = [16]
|
||||
BLOCKSPARSE_VERT_STRIDES = [8]
|
||||
|
||||
BLOCKSPARSE_BLOCK_SIZES = [64]
|
||||
BLOCKSPARSE_HEADS_SLIDINGS = [2, -1]
|
||||
BLOCKSPARSE_HOMO_HEADS = [True, False]
|
||||
|
||||
|
||||
def ref_masked_attention(
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
scale: float,
|
||||
attn_mask: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
|
||||
if attn_mask is not None:
|
||||
attn_weights = attn_weights + attn_mask.float()
|
||||
attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)
|
||||
out = torch.einsum("hqk,khd->qhd", attn_weights, value)
|
||||
return out
|
||||
|
||||
|
||||
def ref_single_query_cached_kv_attention(
|
||||
output: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
num_queries_per_kv: int,
|
||||
key_cache: torch.Tensor,
|
||||
value_cache: torch.Tensor,
|
||||
block_tables: torch.Tensor,
|
||||
seq_lens: torch.Tensor,
|
||||
scale: float,
|
||||
alibi_slopes: Optional[torch.Tensor],
|
||||
tp_rank: int = 0,
|
||||
blocksparse_local_blocks: int = 0,
|
||||
blocksparse_vert_stride: int = 1,
|
||||
blocksparse_block_size: int = 64,
|
||||
blocksparse_head_sliding_step: int = 0,
|
||||
) -> None:
|
||||
num_query_heads = query.shape[1]
|
||||
num_kv_heads = value_cache.shape[1]
|
||||
head_size = value_cache.shape[2]
|
||||
block_size = value_cache.shape[3]
|
||||
num_seqs = query.shape[0]
|
||||
|
||||
block_tables_lst = block_tables.cpu().tolist()
|
||||
seq_lens_lst = seq_lens.cpu().tolist()
|
||||
for i in range(num_seqs):
|
||||
q = query[i].unsqueeze(0)
|
||||
block_table = block_tables_lst[i]
|
||||
seq_len = int(seq_lens_lst[i])
|
||||
|
||||
keys_lst: List[torch.Tensor] = []
|
||||
values_lst: List[torch.Tensor] = []
|
||||
for j in range(seq_len):
|
||||
block_number = int(block_table[j // block_size])
|
||||
block_offset = j % block_size
|
||||
|
||||
k = key_cache[block_number, :, :, block_offset, :]
|
||||
k = k.reshape(num_kv_heads, head_size)
|
||||
keys_lst.append(k)
|
||||
|
||||
v = value_cache[block_number, :, :, block_offset]
|
||||
values_lst.append(v)
|
||||
keys = torch.stack(keys_lst, dim=0)
|
||||
values = torch.stack(values_lst, dim=0)
|
||||
if num_queries_per_kv > 1:
|
||||
# Handle MQA and GQA
|
||||
keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)
|
||||
values = torch.repeat_interleave(values, num_queries_per_kv, dim=1)
|
||||
|
||||
alibi_bias = None
|
||||
if alibi_slopes is not None:
|
||||
# Create the ALiBi bias used in the paged attention kernel.
|
||||
position_ids = torch.arange(seq_len).int()
|
||||
alibi_bias = (position_ids - seq_len + 1).float()
|
||||
alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
|
||||
1, 1, -1)
|
||||
|
||||
if blocksparse_vert_stride >= 1:
|
||||
bsize = blocksparse_block_size
|
||||
hsliding = blocksparse_head_sliding_step
|
||||
vert = blocksparse_vert_stride
|
||||
locals = blocksparse_local_blocks
|
||||
qb = (seq_len - 1) // bsize
|
||||
attn_mask = q.new_zeros(
|
||||
(num_query_heads, 1, seq_len)).float() - torch.inf
|
||||
for h in range(num_query_heads):
|
||||
if hsliding >= 0: # slide with q heads
|
||||
bs_offset = (tp_rank * num_query_heads + h) * hsliding + 1
|
||||
else: # slide with kv heads
|
||||
bs_offset = (tp_rank * num_kv_heads +
|
||||
h // num_queries_per_kv) * (-hsliding) + 1
|
||||
for kb in range(qb + 1):
|
||||
kj = kb * bsize
|
||||
if (qb - kb) < locals or \
|
||||
(kb + bs_offset) % vert == 0:
|
||||
attn_mask[h, 0, kj:min(kj + bsize, seq_len)] = 0
|
||||
if alibi_bias is not None:
|
||||
attn_mask += alibi_bias
|
||||
else:
|
||||
attn_mask = alibi_bias
|
||||
|
||||
out = ref_masked_attention(q, keys, values, scale, attn_mask=attn_mask)
|
||||
out = out.view(num_query_heads, head_size)
|
||||
output[i].copy_(out, non_blocking=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("version", ["v1", "v2"])
|
||||
@pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("use_alibi", USE_ALIBI)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.parametrize("blocksparse_local_blocks", BLOCKSPARSE_LOCAL_BLOCKS)
|
||||
@pytest.mark.parametrize("blocksparse_vert_stride", BLOCKSPARSE_VERT_STRIDES)
|
||||
@pytest.mark.parametrize("blocksparse_block_size", BLOCKSPARSE_BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("blocksparse_head_sliding_step",
|
||||
BLOCKSPARSE_HEADS_SLIDINGS)
|
||||
def test_paged_attention(
|
||||
kv_cache_factory,
|
||||
version: str,
|
||||
num_seqs: int,
|
||||
num_heads: Tuple[int, int],
|
||||
head_size: int,
|
||||
use_alibi: bool,
|
||||
block_size: int,
|
||||
dtype: torch.dtype,
|
||||
kv_cache_dtype: str,
|
||||
seed: int,
|
||||
device: str,
|
||||
blocksparse_local_blocks: int,
|
||||
blocksparse_vert_stride: int,
|
||||
blocksparse_block_size: int,
|
||||
blocksparse_head_sliding_step: int,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
scale = float(1.0 / (head_size**0.5))
|
||||
num_query_heads, num_kv_heads = num_heads
|
||||
query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype)
|
||||
query.uniform_(-scale, scale)
|
||||
|
||||
assert num_query_heads % num_kv_heads == 0
|
||||
num_queries_per_kv = num_query_heads // num_kv_heads
|
||||
alibi_slopes = None
|
||||
if use_alibi:
|
||||
alibi_slopes = torch.rand(num_query_heads, dtype=torch.float)
|
||||
|
||||
seq_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
|
||||
seq_lens[-1] = MAX_SEQ_LEN
|
||||
max_seq_len = max(seq_lens)
|
||||
seq_lens = torch.tensor(seq_lens, dtype=torch.int)
|
||||
|
||||
# Create the block tables.
|
||||
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
|
||||
block_tables = []
|
||||
for _ in range(num_seqs):
|
||||
block_table = [
|
||||
random.randint(0, NUM_BLOCKS - 1)
|
||||
for _ in range(max_num_blocks_per_seq)
|
||||
]
|
||||
block_tables.append(block_table)
|
||||
block_tables = torch.tensor(block_tables, dtype=torch.int)
|
||||
|
||||
# Create the KV caches.
|
||||
key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
|
||||
num_kv_heads, head_size,
|
||||
kv_cache_dtype, dtype, seed,
|
||||
device)
|
||||
key_cache, value_cache = key_caches[0], value_caches[0]
|
||||
|
||||
# Using default kv_scale
|
||||
k_scale = v_scale = 1.0
|
||||
tp_rank = 0
|
||||
|
||||
# Call the paged attention kernel.
|
||||
output = torch.empty_like(query)
|
||||
if version == "v1":
|
||||
ops.paged_attention_v1(
|
||||
output,
|
||||
query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
num_kv_heads,
|
||||
scale,
|
||||
block_tables,
|
||||
seq_lens,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
alibi_slopes,
|
||||
kv_cache_dtype,
|
||||
k_scale,
|
||||
v_scale,
|
||||
tp_rank=tp_rank,
|
||||
blocksparse_local_blocks=blocksparse_local_blocks,
|
||||
blocksparse_vert_stride=blocksparse_vert_stride,
|
||||
blocksparse_block_size=blocksparse_block_size,
|
||||
blocksparse_head_sliding_step=blocksparse_head_sliding_step,
|
||||
)
|
||||
elif version == "v2":
|
||||
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
|
||||
assert PARTITION_SIZE % block_size == 0
|
||||
num_seqs, num_heads, head_size = output.shape
|
||||
tmp_output = torch.empty(
|
||||
size=(num_seqs, num_heads, num_partitions, head_size),
|
||||
dtype=output.dtype,
|
||||
)
|
||||
exp_sums = torch.empty(
|
||||
size=(num_seqs, num_heads, num_partitions),
|
||||
dtype=torch.float32,
|
||||
)
|
||||
max_logits = torch.empty_like(exp_sums)
|
||||
ops.paged_attention_v2(
|
||||
output,
|
||||
exp_sums,
|
||||
max_logits,
|
||||
tmp_output,
|
||||
query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
num_kv_heads,
|
||||
scale,
|
||||
block_tables,
|
||||
seq_lens,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
alibi_slopes,
|
||||
kv_cache_dtype,
|
||||
k_scale,
|
||||
v_scale,
|
||||
tp_rank=tp_rank,
|
||||
blocksparse_local_blocks=blocksparse_local_blocks,
|
||||
blocksparse_vert_stride=blocksparse_vert_stride,
|
||||
blocksparse_block_size=blocksparse_block_size,
|
||||
blocksparse_head_sliding_step=blocksparse_head_sliding_step,
|
||||
)
|
||||
else:
|
||||
raise AssertionError(f"Unknown version: {version}")
|
||||
|
||||
# Run the reference implementation.
|
||||
if kv_cache_dtype == "fp8":
|
||||
# Convert cache data back to dtype.
|
||||
x = 16 // torch.tensor([], dtype=dtype).element_size()
|
||||
key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x,
|
||||
block_size, x)
|
||||
dequantized_key_cache = torch.empty(size=key_cache_shape,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
ops.convert_fp8(dequantized_key_cache, key_cache)
|
||||
key_cache = dequantized_key_cache
|
||||
|
||||
value_cache_shape = value_cache.shape
|
||||
dequantized_value_cache = torch.empty(size=value_cache_shape,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
ops.convert_fp8(dequantized_value_cache, value_cache)
|
||||
value_cache = dequantized_value_cache
|
||||
|
||||
ref_output = torch.empty_like(query)
|
||||
ref_single_query_cached_kv_attention(
|
||||
ref_output,
|
||||
query,
|
||||
num_queries_per_kv,
|
||||
key_cache,
|
||||
value_cache,
|
||||
block_tables,
|
||||
seq_lens,
|
||||
scale,
|
||||
alibi_slopes,
|
||||
tp_rank,
|
||||
blocksparse_local_blocks,
|
||||
blocksparse_vert_stride,
|
||||
blocksparse_block_size,
|
||||
blocksparse_head_sliding_step,
|
||||
)
|
||||
|
||||
# NOTE(woosuk): Due to the kernel-level differences in the two
|
||||
# implementations, there is a small numerical difference in the two
|
||||
# outputs. Thus, we use a relaxed tolerance for the test.
|
||||
atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3
|
||||
rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5
|
||||
|
||||
# NOTE(zhaoyang): FP8 KV Cache will introduce quantization error,
|
||||
# so we use a relaxed tolerance for the test.
|
||||
atol, rtol = 1e-3, 1e-5
|
||||
if kv_cache_dtype == "fp8":
|
||||
atol, rtol = 1e-2, 1e-5
|
||||
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
|
||||
|
||||
|
||||
def ref_multi_query_kv_attention(
|
||||
cu_seq_lens: List[int],
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
scale: float,
|
||||
dtype: torch.dtype,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(cu_seq_lens) - 1
|
||||
ref_outputs = []
|
||||
for i in range(num_seqs):
|
||||
start_idx = cu_seq_lens[i]
|
||||
end_idx = cu_seq_lens[i + 1]
|
||||
seq_len = end_idx - start_idx
|
||||
|
||||
# Create attention mask.
|
||||
attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype),
|
||||
diagonal=1)
|
||||
attn_mask = attn_mask * torch.finfo(dtype).min
|
||||
attn_mask = attn_mask.to(dtype=dtype)
|
||||
|
||||
ref_output = ref_masked_attention(
|
||||
query[start_idx:end_idx],
|
||||
key[start_idx:end_idx],
|
||||
value[start_idx:end_idx],
|
||||
scale,
|
||||
attn_mask=attn_mask,
|
||||
)
|
||||
ref_outputs.append(ref_output)
|
||||
ref_output = torch.cat(ref_outputs, dim=0)
|
||||
return ref_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("blocksparse_local_blocks", BLOCKSPARSE_LOCAL_BLOCKS)
|
||||
@pytest.mark.parametrize("blocksparse_vert_stride", BLOCKSPARSE_VERT_STRIDES)
|
||||
@pytest.mark.parametrize("blocksparse_block_size", BLOCKSPARSE_BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("blocksparse_homo_heads", BLOCKSPARSE_HOMO_HEADS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@torch.inference_mode()
|
||||
def test_varlen_blocksparse_attention_prefill(
|
||||
num_seqs: int,
|
||||
num_heads: Tuple[int, int],
|
||||
head_size: int,
|
||||
blocksparse_local_blocks: int,
|
||||
blocksparse_vert_stride: int,
|
||||
blocksparse_block_size: int,
|
||||
blocksparse_homo_heads: bool,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
|
||||
# As the xformers library is already tested with its own tests, we can use
|
||||
# a smaller MAX_SEQ_LEN here.
|
||||
max_len = min(MAX_SEQ_LEN, 4096)
|
||||
seq_lens = random.sample(range(1, max_len), num_seqs)
|
||||
cu_seq_lens = torch.cumsum(torch.tensor([0] + seq_lens), dim=0)
|
||||
num_tokens = sum(seq_lens)
|
||||
|
||||
scale = float(1.0 / (head_size**0.5))
|
||||
num_query_heads, num_kv_heads = num_heads
|
||||
assert num_query_heads % num_kv_heads == 0
|
||||
num_queries_per_kv = num_query_heads // num_kv_heads
|
||||
|
||||
qkv = torch.empty(num_tokens,
|
||||
num_query_heads + 2 * num_kv_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
qkv.uniform_(-scale, scale)
|
||||
query, key, value = qkv.split(
|
||||
[num_query_heads, num_kv_heads, num_kv_heads], dim=1)
|
||||
|
||||
bs_attn_op = LocalStridedBlockSparseAttn(
|
||||
num_query_heads,
|
||||
max_len,
|
||||
local_blocks=blocksparse_local_blocks,
|
||||
vert_stride=blocksparse_vert_stride,
|
||||
block_size=blocksparse_block_size,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
homo_head=blocksparse_homo_heads)
|
||||
|
||||
output = bs_attn_op(query,
|
||||
key,
|
||||
value,
|
||||
cu_seq_lens.to(device),
|
||||
sm_scale=scale)
|
||||
|
||||
if num_queries_per_kv > 1:
|
||||
# Handle MQA and GQA
|
||||
key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
|
||||
value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
|
||||
|
||||
ref_output = ref_multi_query_kv_attention(
|
||||
cu_seq_lens.tolist(),
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
scale,
|
||||
dtype,
|
||||
)
|
||||
torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
|
||||
432
vllm-v0.6.2/tests/kernels/test_cache.py
Normal file
432
vllm-v0.6.2/tests/kernels/test_cache.py
Normal file
@@ -0,0 +1,432 @@
|
||||
import random
|
||||
from typing import List, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
|
||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||
NUM_TOKENS = [42] # Arbitrary values for testing
|
||||
NUM_LAYERS = [1] # Arbitrary values for testing
|
||||
NUM_HEADS = [8] # Arbitrary values for testing
|
||||
HEAD_SIZES = [64, 80, 120, 256]
|
||||
BLOCK_SIZES = [8, 16, 32]
|
||||
|
||||
# Arbitrary values for testing
|
||||
# don't make it too large. e.g. [1024, 36000] will OOM
|
||||
NUM_BLOCKS = [1024, 10000]
|
||||
|
||||
NUM_MAPPINGS = [256] # Arbitrary values for testing
|
||||
SEEDS = [0]
|
||||
CUDA_DEVICES = [
|
||||
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||
]
|
||||
|
||||
# We assume fp8 is always enabled for testing.
|
||||
KV_CACHE_DTYPE = ["auto", "fp8"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
|
||||
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
|
||||
@torch.inference_mode()
|
||||
def test_copy_blocks(
|
||||
kv_cache_factory,
|
||||
num_mappings: int,
|
||||
num_layers: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
block_size: int,
|
||||
num_blocks: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
kv_cache_dtype: str,
|
||||
device: str,
|
||||
) -> None:
|
||||
if kv_cache_dtype == "fp8" and head_size % 16:
|
||||
pytest.skip()
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
# Generate random block mappings where each source block is mapped to two
|
||||
# destination blocks.
|
||||
assert 2 * num_mappings <= num_blocks
|
||||
src_blocks = random.sample(range(num_blocks), num_mappings)
|
||||
remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
|
||||
dst_blocks = random.sample(remainig_blocks, 2 * num_mappings)
|
||||
block_mapping: List[Tuple[int, int]] = []
|
||||
for i in range(num_mappings):
|
||||
src = src_blocks[i]
|
||||
dst1 = dst_blocks[2 * i]
|
||||
dst2 = dst_blocks[2 * i + 1]
|
||||
block_mapping.append((src, dst1))
|
||||
block_mapping.append((src, dst2))
|
||||
|
||||
# Create the KV caches.
|
||||
key_caches, value_caches = kv_cache_factory(num_blocks, block_size,
|
||||
num_layers, num_heads,
|
||||
head_size, kv_cache_dtype,
|
||||
dtype, seed, device)
|
||||
|
||||
# Clone the KV caches.
|
||||
cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
|
||||
cloned_value_caches = [value_cache.clone() for value_cache in value_caches]
|
||||
|
||||
# Call the copy blocks kernel.
|
||||
block_mapping_tensor = torch.tensor(block_mapping,
|
||||
dtype=torch.int64,
|
||||
device=device).view(-1, 2)
|
||||
|
||||
opcheck(torch.ops._C_cache_ops.copy_blocks,
|
||||
(key_caches, value_caches, block_mapping_tensor),
|
||||
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
|
||||
cond=(head_size == HEAD_SIZES[0]))
|
||||
ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
|
||||
|
||||
# Run the reference implementation.
|
||||
for src, dst in block_mapping:
|
||||
for cloned_key_cache in cloned_key_caches:
|
||||
cloned_key_cache[dst].copy_(cloned_key_cache[src])
|
||||
for cloned_value_cache in cloned_value_caches:
|
||||
cloned_value_cache[dst].copy_(cloned_value_cache[src])
|
||||
|
||||
# Compare the results.
|
||||
for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
|
||||
torch.testing.assert_close(key_cache, cloned_key_cache)
|
||||
for value_cache, cloned_value_cache in zip(value_caches,
|
||||
cloned_value_caches):
|
||||
torch.testing.assert_close(value_cache, cloned_value_cache)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
|
||||
@torch.inference_mode()
|
||||
def test_reshape_and_cache(
|
||||
kv_cache_factory,
|
||||
num_tokens: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
block_size: int,
|
||||
num_blocks: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
kv_cache_dtype: str,
|
||||
) -> None:
|
||||
if kv_cache_dtype == "fp8" and head_size % 16:
|
||||
pytest.skip()
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
# Create a random slot mapping.
|
||||
num_slots = block_size * num_blocks
|
||||
slot_mapping_lst = random.sample(range(num_slots), num_tokens)
|
||||
slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long)
|
||||
|
||||
qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype)
|
||||
_, key, value = qkv.unbind(dim=1)
|
||||
|
||||
# Create the KV caches.
|
||||
key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1,
|
||||
num_heads, head_size,
|
||||
kv_cache_dtype, dtype, seed,
|
||||
device)
|
||||
key_cache, value_cache = key_caches[0], value_caches[0]
|
||||
|
||||
# Clone the KV caches.
|
||||
if kv_cache_dtype == "fp8":
|
||||
cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
|
||||
ops.convert_fp8(cloned_key_cache, key_cache)
|
||||
cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
|
||||
ops.convert_fp8(cloned_value_cache, value_cache)
|
||||
else:
|
||||
cloned_key_cache = key_cache.clone()
|
||||
cloned_value_cache = value_cache.clone()
|
||||
|
||||
# Using default kv_scale
|
||||
k_scale = v_scale = 1.0
|
||||
|
||||
# Call the reshape_and_cache kernel.
|
||||
opcheck(torch.ops._C_cache_ops.reshape_and_cache,
|
||||
(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
|
||||
k_scale, v_scale),
|
||||
cond=(head_size == HEAD_SIZES[0]))
|
||||
ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
|
||||
kv_cache_dtype, k_scale, v_scale)
|
||||
|
||||
if kv_cache_dtype == "fp8":
|
||||
result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
|
||||
ops.convert_fp8(result_key_cache, key_cache)
|
||||
result_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
|
||||
ops.convert_fp8(result_value_cache, value_cache)
|
||||
|
||||
# Run the reference implementation.
|
||||
reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
|
||||
block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
|
||||
block_indicies_lst = block_indicies.cpu().tolist()
|
||||
block_offsets = slot_mapping % block_size
|
||||
block_offsets_lst = block_offsets.cpu().tolist()
|
||||
for i in range(num_tokens):
|
||||
block_idx = block_indicies_lst[i]
|
||||
block_offset = block_offsets_lst[i]
|
||||
cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
|
||||
cloned_value_cache[block_idx, :, :, block_offset] = value[i]
|
||||
|
||||
if kv_cache_dtype == "fp8":
|
||||
torch.testing.assert_close(result_key_cache,
|
||||
cloned_key_cache,
|
||||
atol=0.001,
|
||||
rtol=0.1)
|
||||
torch.testing.assert_close(result_value_cache,
|
||||
cloned_value_cache,
|
||||
atol=0.001,
|
||||
rtol=0.1)
|
||||
else:
|
||||
torch.testing.assert_close(key_cache, cloned_key_cache)
|
||||
torch.testing.assert_close(value_cache, cloned_value_cache)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
|
||||
@torch.inference_mode()
|
||||
def test_reshape_and_cache_flash(
|
||||
kv_cache_factory_flashinfer,
|
||||
num_tokens: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
block_size: int,
|
||||
num_blocks: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
kv_cache_dtype: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
|
||||
# Create a random slot mapping.
|
||||
num_slots = block_size * num_blocks
|
||||
slot_mapping_lst = random.sample(range(num_slots), num_tokens)
|
||||
slot_mapping = torch.tensor(slot_mapping_lst,
|
||||
dtype=torch.long,
|
||||
device=device)
|
||||
|
||||
qkv = torch.randn(num_tokens,
|
||||
3,
|
||||
num_heads,
|
||||
head_size,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
_, key, value = qkv.unbind(dim=1)
|
||||
|
||||
# Create the KV caches.
|
||||
key_caches, value_caches = kv_cache_factory_flashinfer(
|
||||
num_blocks,
|
||||
block_size,
|
||||
1,
|
||||
num_heads,
|
||||
head_size,
|
||||
kv_cache_dtype,
|
||||
dtype,
|
||||
device=device,
|
||||
)
|
||||
key_cache, value_cache = key_caches[0].contiguous(
|
||||
), value_caches[0].contiguous()
|
||||
del key_caches
|
||||
del value_caches
|
||||
|
||||
k_scale = key.amax().item() / 256
|
||||
v_scale = value.amax().item() / 256
|
||||
|
||||
# Clone the KV caches.
|
||||
if kv_cache_dtype == "fp8":
|
||||
cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
|
||||
ops.convert_fp8(cloned_key_cache, key_cache, k_scale, kv_cache_dtype)
|
||||
cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
|
||||
ops.convert_fp8(cloned_value_cache, value_cache, v_scale,
|
||||
kv_cache_dtype)
|
||||
else:
|
||||
cloned_key_cache = key_cache.clone()
|
||||
cloned_value_cache = value_cache.clone()
|
||||
|
||||
# Call the reshape_and_cache kernel.
|
||||
opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash,
|
||||
(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
|
||||
k_scale, v_scale),
|
||||
cond=(head_size == HEAD_SIZES[0]))
|
||||
ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
|
||||
slot_mapping, kv_cache_dtype, k_scale, v_scale)
|
||||
|
||||
if kv_cache_dtype == "fp8":
|
||||
result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
|
||||
ops.convert_fp8(result_key_cache,
|
||||
key_cache,
|
||||
k_scale,
|
||||
kv_dtype=kv_cache_dtype)
|
||||
result_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
|
||||
ops.convert_fp8(result_value_cache,
|
||||
value_cache,
|
||||
v_scale,
|
||||
kv_dtype=kv_cache_dtype)
|
||||
|
||||
# Run the reference implementation.
|
||||
block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
|
||||
block_indicies_lst = block_indicies.cpu().tolist()
|
||||
block_offsets = slot_mapping % block_size
|
||||
block_offsets_lst = block_offsets.cpu().tolist()
|
||||
for i in range(num_tokens):
|
||||
block_idx = block_indicies_lst[i]
|
||||
block_offset = block_offsets_lst[i]
|
||||
cloned_key_cache[block_idx, block_offset, :, :] = key[i]
|
||||
cloned_value_cache[block_idx, block_offset, :, :] = value[i]
|
||||
|
||||
if kv_cache_dtype == "fp8":
|
||||
torch.testing.assert_close(result_key_cache,
|
||||
cloned_key_cache,
|
||||
atol=0.001,
|
||||
rtol=0.1)
|
||||
torch.testing.assert_close(result_value_cache,
|
||||
cloned_value_cache,
|
||||
atol=0.001,
|
||||
rtol=0.1)
|
||||
else:
|
||||
torch.testing.assert_close(key_cache, cloned_key_cache)
|
||||
torch.testing.assert_close(value_cache, cloned_value_cache)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("direction", COPYING_DIRECTION)
|
||||
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
|
||||
@torch.inference_mode()
|
||||
def test_swap_blocks(
|
||||
kv_cache_factory,
|
||||
direction: Tuple[str, str],
|
||||
num_mappings: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
block_size: int,
|
||||
num_blocks: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
kv_cache_dtype: str,
|
||||
) -> None:
|
||||
if kv_cache_dtype == "fp8" and "cpu" in direction:
|
||||
pytest.skip()
|
||||
if kv_cache_dtype == "fp8" and head_size % 16:
|
||||
pytest.skip()
|
||||
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
src_device = device if direction[0] == "cuda" else 'cpu'
|
||||
dst_device = device if direction[1] == "cuda" else 'cpu'
|
||||
|
||||
src_blocks = random.sample(range(num_blocks), num_mappings)
|
||||
# For the same device, mapping must not overlap
|
||||
if src_device == dst_device:
|
||||
remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
|
||||
dst_blocks = random.sample(remaining_blocks, num_mappings)
|
||||
else:
|
||||
dst_blocks = random.sample(range(num_blocks), num_mappings)
|
||||
|
||||
block_mapping = list(zip(src_blocks, dst_blocks))
|
||||
block_mapping_tensor = torch.tensor(block_mapping,
|
||||
dtype=torch.int64,
|
||||
device="cpu").view(-1, 2)
|
||||
|
||||
# Create the KV caches on the first device.
|
||||
src_key_caches, src_value_caches = kv_cache_factory(
|
||||
num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype,
|
||||
seed, src_device)
|
||||
|
||||
# Create the KV caches on the second device.
|
||||
dist_key_caches, dist_value_caches = kv_cache_factory(
|
||||
num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype,
|
||||
seed, dst_device)
|
||||
|
||||
src_key_caches_clone = src_key_caches[0].clone()
|
||||
src_value_caches_clone = src_value_caches[0].clone()
|
||||
|
||||
# Call the swap_blocks kernel.
|
||||
do_opcheck = (head_size == HEAD_SIZES[0])
|
||||
opcheck(torch.ops._C_cache_ops.swap_blocks,
|
||||
(src_key_caches[0], dist_key_caches[0], block_mapping_tensor),
|
||||
cond=do_opcheck)
|
||||
opcheck(torch.ops._C_cache_ops.swap_blocks,
|
||||
(src_value_caches[0], dist_value_caches[0], block_mapping_tensor),
|
||||
cond=do_opcheck)
|
||||
|
||||
ops.swap_blocks(src_key_caches[0], dist_key_caches[0],
|
||||
block_mapping_tensor)
|
||||
ops.swap_blocks(src_value_caches[0], dist_value_caches[0],
|
||||
block_mapping_tensor)
|
||||
|
||||
for src, dst in block_mapping:
|
||||
torch.testing.assert_close(src_key_caches_clone[src].cpu(),
|
||||
dist_key_caches[0][dst].cpu())
|
||||
torch.testing.assert_close(src_value_caches_clone[src].cpu(),
|
||||
dist_value_caches[0][dst].cpu())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@torch.inference_mode()
|
||||
def test_fp8_e4m3_conversion(
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
block_size: int,
|
||||
num_blocks: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
low = -224.0
|
||||
high = 224.0
|
||||
shape = (num_blocks, num_heads, head_size, block_size)
|
||||
cache = torch.empty(shape, dtype=dtype, device=device)
|
||||
cache.uniform_(low, high)
|
||||
|
||||
cache_fp8 = torch.empty_like(cache, dtype=torch.uint8)
|
||||
ops.convert_fp8(cache_fp8, cache)
|
||||
|
||||
converted_cache = torch.empty_like(cache)
|
||||
ops.convert_fp8(converted_cache, cache_fp8)
|
||||
|
||||
torch.testing.assert_close(cache, converted_cache, atol=0.001, rtol=0.1)
|
||||
430
vllm-v0.6.2/tests/kernels/test_causal_conv1d.py
Normal file
430
vllm-v0.6.2/tests/kernels/test_causal_conv1d.py
Normal file
@@ -0,0 +1,430 @@
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm import _custom_ops as ops # noqa: F401
|
||||
from vllm.attention.backends.utils import PAD_SLOT_ID
|
||||
from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
|
||||
causal_conv1d_fn, causal_conv1d_update)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def causal_conv1d_ref(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
bias: Optional[torch.Tensor] = None,
|
||||
initial_states: Optional[torch.Tensor] = None,
|
||||
return_final_states: bool = False,
|
||||
final_states_out: Optional[torch.Tensor] = None,
|
||||
activation: Optional[str] = "silu",
|
||||
):
|
||||
"""
|
||||
x: (batch, dim, seqlen)
|
||||
weight: (dim, width)
|
||||
bias: (dim,)
|
||||
initial_states: (batch, dim, width - 1)
|
||||
final_states_out: (batch, dim, width - 1)
|
||||
|
||||
out: (batch, dim, seqlen)
|
||||
"""
|
||||
if activation not in [None, "silu", "swish"]:
|
||||
raise NotImplementedError("activation must be None, silu, or swish")
|
||||
dtype_in = x.dtype
|
||||
x = x.to(weight.dtype)
|
||||
seqlen = x.shape[-1]
|
||||
dim, width = weight.shape
|
||||
if initial_states is None:
|
||||
out = F.conv1d(x,
|
||||
weight.unsqueeze(1),
|
||||
bias,
|
||||
padding=width - 1,
|
||||
groups=dim)
|
||||
else:
|
||||
x = torch.cat([initial_states, x], dim=-1)
|
||||
out = F.conv1d(x, weight.unsqueeze(1), bias, padding=0, groups=dim)
|
||||
out = out[..., :seqlen]
|
||||
if return_final_states:
|
||||
final_states = F.pad(x, (width - 1 - x.shape[-1], 0)).to(
|
||||
dtype_in) # (batch, dim, width - 1)
|
||||
if final_states_out is not None:
|
||||
final_states_out.copy_(final_states)
|
||||
else:
|
||||
final_states_out = final_states
|
||||
out = (out if activation is None else F.silu(out)).to(dtype=dtype_in)
|
||||
return (out, None) if not return_final_states else (out, final_states_out)
|
||||
|
||||
|
||||
def causal_conv1d_update_ref(x,
|
||||
conv_state,
|
||||
weight,
|
||||
bias=None,
|
||||
activation=None,
|
||||
cache_seqlens=None):
|
||||
"""
|
||||
x: (batch, dim) or (batch, dim, seqlen)
|
||||
conv_state: (batch, dim, state_len), where state_len >= width - 1
|
||||
weight: (dim, width)
|
||||
bias: (dim,)
|
||||
cache_seqlens: (batch,), dtype int32.
|
||||
If not None, the conv_state is treated as a circular buffer.
|
||||
The conv_state will be updated by copying x to the
|
||||
conv_state starting at the index
|
||||
@cache_seqlens % state_len before performing the convolution.
|
||||
|
||||
out: (batch, dim) or (batch, dim, seqlen)
|
||||
"""
|
||||
if activation not in [None, "silu", "swish"]:
|
||||
raise NotImplementedError("activation must be None, silu, or swish")
|
||||
dtype_in = x.dtype
|
||||
unsqueeze = x.dim() == 2
|
||||
if unsqueeze:
|
||||
x = x.unsqueeze(-1)
|
||||
batch, dim, seqlen = x.shape
|
||||
width = weight.shape[1]
|
||||
state_len = conv_state.shape[-1]
|
||||
assert conv_state.shape == (batch, dim, state_len)
|
||||
assert weight.shape == (dim, width)
|
||||
if cache_seqlens is None:
|
||||
x_new = torch.cat([conv_state, x], dim=-1).to(
|
||||
weight.dtype) # (batch, dim, state_len + seqlen)
|
||||
conv_state.copy_(x_new[:, :, -state_len:])
|
||||
else:
|
||||
width_idx = torch.arange(
|
||||
-(width - 1), 0, dtype=torch.long,
|
||||
device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1)
|
||||
width_idx = torch.remainder(width_idx, state_len).unsqueeze(1).expand(
|
||||
-1, dim, -1)
|
||||
x_new = torch.cat([conv_state.gather(2, width_idx), x],
|
||||
dim=-1).to(weight.dtype)
|
||||
copy_idx = torch.arange(
|
||||
seqlen, dtype=torch.long,
|
||||
device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1)
|
||||
copy_idx = torch.remainder(copy_idx,
|
||||
state_len).unsqueeze(1).expand(-1, dim, -1)
|
||||
conv_state.scatter_(2, copy_idx, x)
|
||||
out = F.conv1d(x_new, weight.unsqueeze(1), bias, padding=0,
|
||||
groups=dim)[:, :, -seqlen:]
|
||||
if unsqueeze:
|
||||
out = out.squeeze(-1)
|
||||
return (out if activation is None else F.silu(out)).to(dtype=dtype_in)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype", [torch.bfloat16, torch.float])
|
||||
@pytest.mark.parametrize("silu_activation", [True])
|
||||
@pytest.mark.parametrize("has_bias", [True])
|
||||
def causal_conv1d_opcheck_fn(x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
bias: Optional[torch.Tensor] = None,
|
||||
cu_seq_len: Optional[torch.Tensor] = None,
|
||||
cache_indices: Optional[torch.Tensor] = None,
|
||||
has_initial_state: Optional[torch.Tensor] = None,
|
||||
conv_states: Optional[torch.Tensor] = None,
|
||||
activation: Optional[str] = "silu",
|
||||
pad_slot_id: int = PAD_SLOT_ID):
|
||||
"""
|
||||
x: (batch, dim, seqlen)
|
||||
weight: (dim, width)
|
||||
bias: (dim,)
|
||||
seq_idx: (batch, seqlen)
|
||||
initial_states: (batch, dim, width - 1)
|
||||
final_states_out: (batch, dim, width - 1), to be written to
|
||||
activation: either None or "silu" or "swish"
|
||||
|
||||
out: (batch, dim, seqlen)
|
||||
"""
|
||||
if activation not in [None, "silu", "swish"]:
|
||||
raise NotImplementedError("activation must be None, silu, or swish")
|
||||
if x.stride(-1) != 1:
|
||||
x = x.contiguous()
|
||||
bias = bias.contiguous() if bias is not None else None
|
||||
|
||||
opcheck(torch.ops._C.causal_conv1d_fwd,
|
||||
(x, weight, bias, conv_states, cu_seq_len, cache_indices,
|
||||
has_initial_state, activation in ["silu", "swish"], pad_slot_id))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype", [torch.bfloat16, torch.float])
|
||||
@pytest.mark.parametrize("silu_activation", [True])
|
||||
@pytest.mark.parametrize("has_bias", [True])
|
||||
@pytest.mark.parametrize("width", [4])
|
||||
@pytest.mark.parametrize(
|
||||
'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 1025, 2048, 4096])
|
||||
@pytest.mark.parametrize('dim', [64])
|
||||
@pytest.mark.parametrize('batch', [1])
|
||||
def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
|
||||
itype):
|
||||
device = "cuda"
|
||||
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
|
||||
if itype == torch.bfloat16:
|
||||
rtol, atol = 1e-2, 5e-2
|
||||
# set seed
|
||||
current_platform.seed_everything(0)
|
||||
x = torch.randn(batch, dim, seqlen, device=device,
|
||||
dtype=itype).contiguous()
|
||||
|
||||
weight = torch.randn(dim, width, device=device, dtype=itype)
|
||||
bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
|
||||
initial_states = torch.randn(batch,
|
||||
dim,
|
||||
width - 1,
|
||||
device=device,
|
||||
dtype=itype)
|
||||
x_ref = x.clone()
|
||||
weight_ref = weight.clone()
|
||||
bias_ref = bias.clone() if bias is not None else None
|
||||
initial_states_ref = initial_states.clone(
|
||||
) if initial_states is not None else None
|
||||
activation = None if not silu_activation else "silu"
|
||||
out = causal_conv1d_fn(x,
|
||||
weight,
|
||||
bias,
|
||||
activation=activation,
|
||||
conv_states=initial_states,
|
||||
has_initial_state=torch.ones(batch,
|
||||
dtype=torch.bool,
|
||||
device=x.device))
|
||||
out_ref, final_states_ref = causal_conv1d_ref(
|
||||
x_ref,
|
||||
weight_ref,
|
||||
bias_ref,
|
||||
initial_states=initial_states_ref,
|
||||
return_final_states=True,
|
||||
activation=activation)
|
||||
assert initial_states is not None and final_states_ref is not None
|
||||
assert torch.allclose(initial_states,
|
||||
final_states_ref,
|
||||
rtol=rtol,
|
||||
atol=atol)
|
||||
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
|
||||
|
||||
causal_conv1d_opcheck_fn(x,
|
||||
weight,
|
||||
bias,
|
||||
activation=activation,
|
||||
conv_states=initial_states,
|
||||
has_initial_state=torch.ones(batch,
|
||||
dtype=torch.bool,
|
||||
device=x.device))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("silu_activation", [False, True])
|
||||
@pytest.mark.parametrize("has_bias", [False, True])
|
||||
@pytest.mark.parametrize("seqlen", [1])
|
||||
@pytest.mark.parametrize("width", [4])
|
||||
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
|
||||
def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation,
|
||||
itype):
|
||||
device = "cuda"
|
||||
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
|
||||
if itype == torch.bfloat16:
|
||||
rtol, atol = 1e-2, 5e-2
|
||||
# set seed
|
||||
current_platform.seed_everything(0)
|
||||
batch = 2
|
||||
x = torch.randn(batch, dim, seqlen, device=device, dtype=itype)
|
||||
x_ref = x.clone()
|
||||
conv_state = torch.randn(batch, dim, width - 1, device=device, dtype=itype)
|
||||
|
||||
weight = torch.randn(dim, width, device=device, dtype=itype)
|
||||
bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
|
||||
conv_state_ref = conv_state.detach().clone()
|
||||
activation = None if not silu_activation else "silu"
|
||||
out = causal_conv1d_update(x,
|
||||
conv_state,
|
||||
weight,
|
||||
bias,
|
||||
activation=activation)
|
||||
out_ref = causal_conv1d_update_ref(x_ref,
|
||||
conv_state_ref,
|
||||
weight,
|
||||
bias,
|
||||
activation=activation)
|
||||
|
||||
assert torch.equal(conv_state, conv_state_ref)
|
||||
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
|
||||
|
||||
opcheck(torch.ops._C.causal_conv1d_update,
|
||||
(x, conv_state, weight, bias, activation
|
||||
in ["silu", "swish"], None, None, PAD_SLOT_ID))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype",
|
||||
[torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("silu_activation", [False, True])
|
||||
@pytest.mark.parametrize("has_bias", [False, True])
|
||||
@pytest.mark.parametrize("seqlen", [1, 4, 5])
|
||||
@pytest.mark.parametrize("width", [2, 3, 4])
|
||||
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
|
||||
# tests correctness in case subset of the sequences are padded
|
||||
@pytest.mark.parametrize("with_padding", [True, False])
|
||||
def test_causal_conv1d_update_with_batch_gather(with_padding, dim, width,
|
||||
seqlen, has_bias,
|
||||
silu_activation, itype):
|
||||
device = "cuda"
|
||||
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
|
||||
if itype == torch.bfloat16:
|
||||
rtol, atol = 1e-2, 5e-2
|
||||
|
||||
# set seed
|
||||
current_platform.seed_everything(0)
|
||||
|
||||
batch_size = 3
|
||||
padding = 5 if with_padding else 0
|
||||
padded_batch_size = batch_size + padding
|
||||
total_entries = 10 * batch_size
|
||||
|
||||
x = torch.randn(padded_batch_size, dim, 1, device=device, dtype=itype)
|
||||
x_ref = x.clone()
|
||||
|
||||
conv_state_indices = torch.randperm(total_entries)[:batch_size].to(
|
||||
dtype=torch.int32, device=device)
|
||||
unused_states_bool = torch.ones(total_entries,
|
||||
dtype=torch.bool,
|
||||
device=device)
|
||||
unused_states_bool[conv_state_indices] = False
|
||||
padded_state_indices = torch.concat([
|
||||
conv_state_indices,
|
||||
torch.as_tensor(
|
||||
[PAD_SLOT_ID] * padding, dtype=torch.int32, device=device)
|
||||
],
|
||||
dim=0)
|
||||
conv_state = torch.randn(total_entries,
|
||||
dim,
|
||||
width - 1,
|
||||
device=device,
|
||||
dtype=itype)
|
||||
conv_state_for_padding_test = conv_state.clone()
|
||||
|
||||
weight = torch.randn(dim, width, device=device, dtype=itype)
|
||||
bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
|
||||
conv_state_ref = conv_state[conv_state_indices, :].detach().clone()
|
||||
activation = None if not silu_activation else "silu"
|
||||
out = causal_conv1d_update(x,
|
||||
conv_state,
|
||||
weight,
|
||||
bias,
|
||||
activation=activation,
|
||||
conv_state_indices=padded_state_indices,
|
||||
pad_slot_id=PAD_SLOT_ID)
|
||||
out_ref = causal_conv1d_update_ref(x_ref[:batch_size],
|
||||
conv_state_ref,
|
||||
weight,
|
||||
bias,
|
||||
activation=activation)
|
||||
|
||||
assert torch.equal(conv_state[conv_state_indices, :], conv_state_ref)
|
||||
assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol)
|
||||
assert torch.equal(conv_state[unused_states_bool],
|
||||
conv_state_for_padding_test[unused_states_bool])
|
||||
|
||||
opcheck(torch.ops._C.causal_conv1d_update,
|
||||
(x, conv_state, weight, bias, activation
|
||||
in ["silu", "swish"], None, padded_state_indices, PAD_SLOT_ID))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("silu_activation", [True])
|
||||
@pytest.mark.parametrize("has_bias", [True])
|
||||
@pytest.mark.parametrize("width", [4])
|
||||
@pytest.mark.parametrize(
|
||||
'seqlen', [8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 2049, 4096])
|
||||
@pytest.mark.parametrize('dim', [64, 4096])
|
||||
# tests correctness in case subset of the sequences are padded
|
||||
@pytest.mark.parametrize('with_padding', [True, False])
|
||||
def test_causal_conv1d_varlen(with_padding, dim, seqlen, width, has_bias,
|
||||
silu_activation, itype):
|
||||
device = "cuda"
|
||||
torch.cuda.empty_cache()
|
||||
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
|
||||
if itype == torch.bfloat16:
|
||||
rtol, atol = 1e-2, 5e-2
|
||||
# set seed
|
||||
current_platform.seed_everything(0)
|
||||
seqlens = []
|
||||
batch_size = 4
|
||||
if seqlen < 10:
|
||||
batch_size = 1
|
||||
padding = 3 if with_padding else 0
|
||||
padded_batch_size = batch_size + padding
|
||||
nsplits = padded_batch_size - 1
|
||||
|
||||
eos_pos = torch.randperm(seqlen - 1)[:nsplits].sort().values
|
||||
seqlens.append(
|
||||
torch.diff(
|
||||
torch.cat(
|
||||
[torch.tensor([-1]), eos_pos,
|
||||
torch.tensor([seqlen - 1])])).tolist())
|
||||
assert sum(seqlens[-1]) == seqlen
|
||||
assert all(s > 0 for s in seqlens[-1])
|
||||
|
||||
total_entries = batch_size * 10
|
||||
cumsum = torch.cumsum(torch.tensor(seqlens[0]), dim=0).to(torch.int32)
|
||||
cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum],
|
||||
dim=0)
|
||||
x = torch.randn(1, 4096 + dim + 64, seqlen, device=device,
|
||||
dtype=itype)[:, 4096:4096 + dim, :]
|
||||
weight = torch.randn(dim, width, device=device, dtype=itype)
|
||||
bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
|
||||
x_ref = x.clone()
|
||||
weight_ref = weight.clone()
|
||||
bias_ref = bias.clone() if bias is not None else None
|
||||
activation = None if not silu_activation else "silu"
|
||||
final_states = torch.randn(total_entries,
|
||||
dim,
|
||||
width - 1,
|
||||
device=x.device,
|
||||
dtype=x.dtype)
|
||||
final_states_ref = final_states.clone()
|
||||
has_initial_states = torch.randint(0,
|
||||
2, (cumsum.shape[0] - 1, ),
|
||||
dtype=torch.bool,
|
||||
device=x.device)
|
||||
state_indices = torch.randperm(total_entries,
|
||||
dtype=torch.int32,
|
||||
device=x.device)[:batch_size]
|
||||
padded_state_indices = torch.concat([
|
||||
state_indices,
|
||||
torch.as_tensor(
|
||||
[PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
|
||||
],
|
||||
dim=-1)
|
||||
|
||||
out = causal_conv1d_fn(x.squeeze(0), weight, bias, cumsum.cuda(),
|
||||
padded_state_indices, has_initial_states,
|
||||
final_states, activation, PAD_SLOT_ID)
|
||||
out_ref = []
|
||||
out_ref_b = []
|
||||
|
||||
splits = [torch.split(var, seqlens[0], dim=-1) for var in (x_ref)]
|
||||
for i in range(len(seqlens[0])):
|
||||
x_s = [v[i].unsqueeze(0) for v in splits][0]
|
||||
if padded_state_indices[i] == PAD_SLOT_ID:
|
||||
continue
|
||||
out_ref_b.append(
|
||||
causal_conv1d_ref(
|
||||
x_s,
|
||||
weight_ref,
|
||||
bias_ref,
|
||||
activation=activation,
|
||||
return_final_states=True,
|
||||
final_states_out=final_states_ref[
|
||||
padded_state_indices[i]].unsqueeze(0),
|
||||
initial_states=final_states_ref[padded_state_indices[i]].
|
||||
unsqueeze(0) if has_initial_states[i] else None))
|
||||
out_ref.append(torch.cat([t[0] for t in out_ref_b], dim=2))
|
||||
out_ref_tensor = torch.cat(out_ref, dim=0)
|
||||
|
||||
unpadded_out = out[:, :out_ref_tensor.shape[-1]]
|
||||
assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol)
|
||||
assert torch.allclose(final_states[state_indices],
|
||||
final_states_ref[state_indices],
|
||||
rtol=rtol,
|
||||
atol=atol)
|
||||
|
||||
causal_conv1d_opcheck_fn(x.squeeze(0), weight, bias, cumsum.cuda(),
|
||||
padded_state_indices, has_initial_states,
|
||||
final_states, activation)
|
||||
455
vllm-v0.6.2/tests/kernels/test_cutlass.py
Normal file
455
vllm-v0.6.2/tests/kernels/test_cutlass.py
Normal file
@@ -0,0 +1,455 @@
|
||||
"""Tests for cutlass kernels
|
||||
|
||||
Run `pytest tests/kernels/test_cutlass.py`.
|
||||
"""
|
||||
from typing import Optional, Type
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MNK_FACTORS = [
|
||||
(1, 256, 128),
|
||||
(1, 16384, 1024),
|
||||
(1, 24576, 496),
|
||||
(16, 256, 496),
|
||||
(16, 16384, 128),
|
||||
(16, 24576, 4096),
|
||||
(32, 8192, 4096),
|
||||
(32, 16384, 4096),
|
||||
(33, 1024, 1024),
|
||||
(33, 8192, 128),
|
||||
(64, 2048, 496),
|
||||
(64, 16384, 1024),
|
||||
(100, 8192, 496),
|
||||
(128, 32768, 4096),
|
||||
(256, 4096, 4096),
|
||||
(512, 256, 1024),
|
||||
(512, 8192, 4096),
|
||||
(512, 16384, 128),
|
||||
(512, 24576, 128),
|
||||
]
|
||||
|
||||
CUDA_DEVICES = [
|
||||
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||
]
|
||||
|
||||
capability = current_platform.get_device_capability()
|
||||
capability = capability[0] * 10 + capability[1]
|
||||
|
||||
|
||||
def to_fp8(tensor: torch.Tensor):
|
||||
finfo = torch.finfo(torch.float8_e4m3fn)
|
||||
return torch.round(tensor.clamp(
|
||||
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
|
||||
|
||||
|
||||
def to_int8(tensor: torch.Tensor):
|
||||
return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
|
||||
|
||||
|
||||
def rand_int8(shape: tuple, device: str = "cuda"):
|
||||
return to_int8(torch.rand(shape, device=device) * 255 - 128)
|
||||
|
||||
|
||||
def baseline_scaled_mm(a: torch.Tensor,
|
||||
b: torch.Tensor,
|
||||
scale_a: torch.Tensor,
|
||||
scale_b: torch.Tensor,
|
||||
out_dtype: Type[torch.dtype],
|
||||
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
|
||||
output = (scale_a * (scale_b * (torch.mm(
|
||||
a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype)
|
||||
if bias is not None:
|
||||
output = output + bias
|
||||
|
||||
return output
|
||||
|
||||
|
||||
def cutlass_fp8_gemm_helper(m: int,
|
||||
n: int,
|
||||
k: int,
|
||||
per_token_act_quant: bool,
|
||||
per_out_channel_weight_quant: bool,
|
||||
use_bias: bool,
|
||||
out_dtype: Type[torch.dtype] = torch.bfloat16,
|
||||
device: str = "cuda"):
|
||||
# Test for a cutlass kernel with per-token activation quantization
|
||||
# and per-output channel weight quantization.
|
||||
a = to_fp8(torch.randn((m, k), device=device))
|
||||
b = to_fp8(torch.randn((n, k), device=device).t())
|
||||
|
||||
m_a_scales = m if per_token_act_quant else 1
|
||||
n_b_scales = n if per_out_channel_weight_quant else 1
|
||||
|
||||
scale_a = (torch.randn((m_a_scales, 1), device=device,
|
||||
dtype=torch.float32))
|
||||
scale_b = (torch.randn((1, n_b_scales), device=device,
|
||||
dtype=torch.float32))
|
||||
if use_bias:
|
||||
bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
|
||||
else:
|
||||
bias = None
|
||||
|
||||
out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
|
||||
baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
|
||||
|
||||
torch.testing.assert_close(out, baseline, rtol=1e-2, atol=5e-2)
|
||||
|
||||
opcheck(torch.ops._C.cutlass_scaled_mm,
|
||||
(out, a, b, scale_a, scale_b, bias))
|
||||
|
||||
|
||||
def cutlass_int8_gemm_helper(m: int,
|
||||
n: int,
|
||||
k: int,
|
||||
per_token_act_quant: bool,
|
||||
per_out_channel_weight_quant: bool,
|
||||
use_bias: bool,
|
||||
out_dtype: Type[torch.dtype] = torch.bfloat16,
|
||||
device: str = "cuda"):
|
||||
# Test for a cutlass kernel with per-token activation quantization
|
||||
# and per-output channel weight quantization.
|
||||
a = to_int8(torch.randn((m, k), device=device) * 5)
|
||||
b = to_int8(torch.randn((n, k), device=device).t() * 5)
|
||||
|
||||
m_a_scales = m if per_token_act_quant else 1
|
||||
n_b_scales = n if per_out_channel_weight_quant else 1
|
||||
|
||||
scale_a = (torch.randn((m_a_scales, 1), device=device,
|
||||
dtype=torch.float32))
|
||||
scale_b = (torch.randn((1, n_b_scales), device=device,
|
||||
dtype=torch.float32))
|
||||
|
||||
if use_bias:
|
||||
bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
|
||||
else:
|
||||
bias = None
|
||||
|
||||
out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
|
||||
baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
|
||||
|
||||
torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
|
||||
|
||||
opcheck(torch.ops._C.cutlass_scaled_mm,
|
||||
(out, a, b, scale_a, scale_b, bias))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
|
||||
@pytest.mark.parametrize("per_act_token", [True, False])
|
||||
@pytest.mark.parametrize("per_out_ch", [True, False])
|
||||
@pytest.mark.parametrize("use_bias", [True, False])
|
||||
@pytest.mark.skipif(not current_platform.has_device_capability(89),
|
||||
reason="FP8 is not supported on this GPU type.")
|
||||
def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
|
||||
per_out_ch: bool, use_bias: bool):
|
||||
cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
|
||||
@pytest.mark.parametrize("per_act_token", [True, False])
|
||||
@pytest.mark.parametrize("per_out_ch", [True, False])
|
||||
@pytest.mark.parametrize("use_bias", [True, False])
|
||||
def test_cutlass_int8_gemm(m: int, n: int, k: int, per_act_token: bool,
|
||||
per_out_ch: bool, use_bias: bool):
|
||||
cutlass_int8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("per_act_token", [True, False])
|
||||
@pytest.mark.parametrize("per_out_ch", [True, False])
|
||||
@pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
|
||||
@pytest.mark.parametrize("use_bias", [True, False])
|
||||
def test_cutlass_int8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
|
||||
out_dtype: Type[torch.dtype],
|
||||
use_bias: bool):
|
||||
cutlass_int8_gemm_helper(512,
|
||||
512,
|
||||
512,
|
||||
per_act_token,
|
||||
per_out_ch,
|
||||
use_bias,
|
||||
out_dtype=out_dtype)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("per_act_token", [True, False])
|
||||
@pytest.mark.parametrize("per_out_ch", [True, False])
|
||||
@pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
|
||||
@pytest.mark.parametrize("use_bias", [True, False])
|
||||
@pytest.mark.skipif(not current_platform.has_device_capability(89),
|
||||
reason="FP8 is not supported on this GPU type.")
|
||||
def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
|
||||
out_dtype: Type[torch.dtype],
|
||||
use_bias: bool):
|
||||
cutlass_fp8_gemm_helper(512,
|
||||
512,
|
||||
512,
|
||||
per_act_token,
|
||||
per_out_ch,
|
||||
use_bias,
|
||||
out_dtype=out_dtype)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("per_act_token", [True, False])
|
||||
@pytest.mark.parametrize("per_out_ch", [True, False])
|
||||
@pytest.mark.parametrize("use_bias", [True, False])
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.skipif(not current_platform.has_device_capability(89),
|
||||
reason="FP8 is not supported on this GPU type.")
|
||||
def test_cutlass_fp8_gemm_devices(per_act_token: bool, per_out_ch: bool,
|
||||
use_bias: bool, device: str):
|
||||
cutlass_fp8_gemm_helper(512, 512, 512, per_act_token, per_out_ch, use_bias,
|
||||
torch.bfloat16, device)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("per_act_token", [True, False])
|
||||
@pytest.mark.parametrize("per_out_ch", [True, False])
|
||||
@pytest.mark.parametrize("use_bias", [True, False])
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool,
|
||||
use_bias: bool, device: str):
|
||||
cutlass_int8_gemm_helper(512,
|
||||
512,
|
||||
512,
|
||||
per_act_token,
|
||||
per_out_ch,
|
||||
use_bias,
|
||||
out_dtype=torch.bfloat16,
|
||||
device=device)
|
||||
|
||||
|
||||
# For the following two tests:
|
||||
# N and K correspond to the size of the weight matrix and likely to be multiples
|
||||
# of a large power of two. In any case, the kernel will have a naive fallback
|
||||
# when N and K are not divisible by 16. But M is the number of tokens and the
|
||||
# kernel must handle any M thrown at it.
|
||||
@pytest.mark.parametrize("per_act_token", [True, False])
|
||||
@pytest.mark.parametrize("per_out_ch", [True, False])
|
||||
@pytest.mark.parametrize("use_bias", [True, False])
|
||||
@pytest.mark.skipif(not current_platform.has_device_capability(89),
|
||||
reason="FP8 is not supported on this GPU type.")
|
||||
def test_cutlass_fp8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool,
|
||||
use_bias: bool):
|
||||
for nk in range(32, 128, 32):
|
||||
for m in range(1, 128):
|
||||
cutlass_fp8_gemm_helper(m, nk, nk, per_act_token, per_out_ch,
|
||||
use_bias)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("per_act_token", [True, False])
|
||||
@pytest.mark.parametrize("per_out_ch", [True, False])
|
||||
@pytest.mark.parametrize("use_bias", [True, False])
|
||||
def test_cutlass_int8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool,
|
||||
use_bias: bool):
|
||||
for nk in range(32, 128, 32):
|
||||
for m in range(1, 128):
|
||||
cutlass_int8_gemm_helper(m, nk, nk, per_act_token, per_out_ch,
|
||||
use_bias)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", [32, 64, 128])
|
||||
@pytest.mark.parametrize("n", [16, 32, 64])
|
||||
@pytest.mark.parametrize("k", [64, 128, 256])
|
||||
@pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
|
||||
@pytest.mark.skip
|
||||
def test_cutlass_int8_azp_bias_fold(m: int, n: int, k: int,
|
||||
out_dtype: torch.dtype):
|
||||
# Currently, the test is failing because folding azp into
|
||||
# 16-bit bias loses too much precision
|
||||
scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
|
||||
scale_b = torch.randn((1, n), device="cuda", dtype=torch.float32) / 10
|
||||
|
||||
aq_i8 = rand_int8((m, k))
|
||||
bq_i8 = rand_int8((n, k)).t()
|
||||
|
||||
aq_i32 = aq_i8.to(dtype=torch.int32)
|
||||
bq_i32 = bq_i8.to(dtype=torch.int32)
|
||||
|
||||
aq_f32 = aq_i8.to(dtype=torch.float32)
|
||||
bq_f32 = bq_i8.to(dtype=torch.float32)
|
||||
|
||||
b_dq = scale_b * bq_f32
|
||||
|
||||
azp_a = torch.rand((1, ), device="cuda", dtype=torch.float32) * 10 + 1.5
|
||||
azp_aq_i8 = (azp_a / scale_a).to(dtype=torch.int8)
|
||||
azp_a = azp_aq_i8.to(dtype=torch.float32) * scale_a # correct for rounding
|
||||
|
||||
a_dq = scale_a * (aq_i32 + azp_aq_i8).to(dtype=torch.float32)
|
||||
torch.testing.assert_close(a_dq, scale_a * aq_f32 + azp_a)
|
||||
|
||||
baseline_dq = torch.mm(a_dq, b_dq).to(out_dtype)
|
||||
|
||||
J = torch.ones((1, k), device="cuda", dtype=torch.float32)
|
||||
azp_bias = (azp_a * scale_b * (J @ bq_f32)).to(out_dtype)
|
||||
assert azp_bias.shape == (1, n)
|
||||
assert azp_bias[0, :].shape == (n, )
|
||||
|
||||
baseline_q = (scale_a.to(device='cpu') * scale_b.to(device='cpu') * (
|
||||
(aq_i32 + azp_aq_i8).to(device='cpu') @ bq_i32.to(device='cpu'))).to(
|
||||
dtype=out_dtype, device='cuda')
|
||||
|
||||
out = ops.cutlass_scaled_mm(aq_i8,
|
||||
bq_i8,
|
||||
scale_a,
|
||||
scale_b,
|
||||
out_dtype=out_dtype,
|
||||
bias=azp_bias[0, :])
|
||||
torch.testing.assert_close(out, baseline_dq, rtol=1e-2, atol=1e0)
|
||||
torch.testing.assert_close(out, baseline_q, rtol=1e-2, atol=1e0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", [32, 64, 128])
|
||||
@pytest.mark.parametrize("n", [16, 32, 64])
|
||||
@pytest.mark.parametrize("k", [64, 128, 256])
|
||||
@pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
|
||||
@pytest.mark.parametrize("use_bias", [True, False])
|
||||
@pytest.mark.parametrize("azp_per_token", [True, False])
|
||||
def test_cutlass_int8_azp(m: int, n: int, k: int, out_dtype: torch.dtype,
|
||||
use_bias: bool, azp_per_token: bool):
|
||||
m_azp = m if azp_per_token else 1
|
||||
scale_a = torch.randn((m_azp, 1), device="cuda", dtype=torch.float32) / 10
|
||||
scale_b = torch.randn((1, n), device="cuda", dtype=torch.float32) / 10
|
||||
|
||||
aq_i8 = rand_int8((m, k))
|
||||
aq_i32 = aq_i8.to(dtype=torch.int32)
|
||||
aq_f32 = aq_i8.to(dtype=torch.float32)
|
||||
|
||||
bq_i8 = rand_int8((n, k)).t()
|
||||
bq_i32 = bq_i8.to(dtype=torch.int32)
|
||||
bq_f32 = bq_i8.to(dtype=torch.float32)
|
||||
b_dq = scale_b * bq_f32
|
||||
|
||||
azp_a = torch.rand(
|
||||
(m_azp, 1), device="cuda", dtype=torch.float32) * 10 + 1.5
|
||||
azp_aq_i8 = (azp_a / scale_a).to(dtype=torch.int8)
|
||||
azp_a = azp_aq_i8.to(dtype=torch.float32) * scale_a # correct for rounding
|
||||
|
||||
a_dq = scale_a * (aq_i32 - azp_aq_i8).to(dtype=torch.float32)
|
||||
torch.testing.assert_close(a_dq,
|
||||
scale_a * aq_f32 - azp_a,
|
||||
rtol=1e-4,
|
||||
atol=1e-3)
|
||||
|
||||
if use_bias:
|
||||
bias = torch.rand((1, n), device="cuda", dtype=out_dtype) * 10 + 2.5
|
||||
else:
|
||||
bias = torch.zeros((1, n), device="cuda", dtype=out_dtype)
|
||||
|
||||
baseline_dq = (torch.mm(a_dq, b_dq) + bias).to(out_dtype)
|
||||
|
||||
# int32 mm not supported on CUDA
|
||||
a_noazp_i32_cpu = (aq_i32 - azp_aq_i8).to(device='cpu')
|
||||
cq = (a_noazp_i32_cpu @ bq_i32.to(device='cpu')).to(device='cuda')
|
||||
baseline_q = (scale_a * scale_b * cq + bias).to(dtype=out_dtype)
|
||||
|
||||
# Hadamard is just the sum of the cols
|
||||
azp_adj_i32 = bq_i32.sum(dim=0, keepdim=True, dtype=torch.int32)
|
||||
azp_i32 = azp_aq_i8.to(dtype=torch.int32)
|
||||
func_bias = bias if use_bias else None
|
||||
|
||||
if azp_per_token:
|
||||
out = ops.cutlass_scaled_mm_azp(aq_i8, bq_i8, scale_a, scale_b,
|
||||
out_dtype, azp_adj_i32, azp_i32,
|
||||
func_bias)
|
||||
else:
|
||||
azp_with_adj_i32 = azp_i32 * azp_adj_i32
|
||||
out = ops.cutlass_scaled_mm_azp(aq_i8, bq_i8, scale_a, scale_b,
|
||||
out_dtype, azp_with_adj_i32, None,
|
||||
func_bias)
|
||||
|
||||
# bfloat16 precision is 7-bit mantissa -> 2^-8 ~ 0.4%
|
||||
# float16 precision is 10-bit mantissa -> 2^-11 ~ 0.05%
|
||||
rtol = 1e-2 if out_dtype == torch.bfloat16 else 1e-3
|
||||
atol = 1e-3
|
||||
torch.testing.assert_close(out, baseline_dq, rtol=rtol, atol=atol)
|
||||
torch.testing.assert_close(out, baseline_q, rtol=rtol, atol=atol)
|
||||
|
||||
if azp_per_token:
|
||||
opcheck(torch.ops._C.cutlass_scaled_mm_azp,
|
||||
(out, aq_i8, bq_i8, scale_a, scale_b, azp_adj_i32, azp_i32,
|
||||
func_bias))
|
||||
else:
|
||||
opcheck(torch.ops._C.cutlass_scaled_mm_azp,
|
||||
(out, aq_i8, bq_i8, scale_a, scale_b, azp_with_adj_i32, None,
|
||||
func_bias))
|
||||
|
||||
|
||||
# Test working with a subset of A and B
|
||||
def test_cutlass_subset():
|
||||
big_m, big_n, big_k = 1024, 1024, 1024
|
||||
m, n, k = 512, 512, 512
|
||||
|
||||
whole_a = to_int8(torch.randn((big_m, big_k), device="cuda") * 5)
|
||||
whole_b = to_int8(torch.randn((big_n, big_k), device="cuda").t() * 5)
|
||||
a = whole_a[0:m, 0:k]
|
||||
b = whole_b[0:k, 0:n]
|
||||
|
||||
scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
|
||||
scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
|
||||
|
||||
out = ops.cutlass_scaled_mm(a,
|
||||
b,
|
||||
scale_a,
|
||||
scale_b,
|
||||
out_dtype=torch.bfloat16)
|
||||
baseline = baseline_scaled_mm(a,
|
||||
b,
|
||||
scale_a,
|
||||
scale_b,
|
||||
out_dtype=torch.bfloat16)
|
||||
|
||||
torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
|
||||
|
||||
|
||||
# Test to make sure cuda graphs work
|
||||
class CutlassLayer(torch.nn.Module):
|
||||
|
||||
def __init__(self, b, scale_a, scale_b, out_dtype):
|
||||
super().__init__()
|
||||
self.b = b
|
||||
self.scale_a = scale_a
|
||||
self.scale_b = scale_b
|
||||
self.out_dtype = out_dtype
|
||||
|
||||
def forward(self, a):
|
||||
return ops.cutlass_scaled_mm(a, self.b, self.scale_a, self.scale_b,
|
||||
self.out_dtype)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("per_act_token", [True, False])
|
||||
@pytest.mark.parametrize("per_out_ch", [True, False])
|
||||
def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool):
|
||||
m, n, k = 512, 512, 512
|
||||
|
||||
a = to_int8(torch.randn((m, k), device="cuda"))
|
||||
b = to_int8(torch.randn((n, k), device="cuda").t())
|
||||
|
||||
m_a_scales = m if per_act_token else 1
|
||||
n_b_scales = n if per_out_ch else 1
|
||||
|
||||
scale_a = (torch.randn(
|
||||
(m_a_scales, 1), device="cuda", dtype=torch.float32) / 10)
|
||||
scale_b = (torch.randn(
|
||||
(1, n_b_scales), device="cuda", dtype=torch.float32) / 10)
|
||||
|
||||
# Construct a trivial model with a single layer that calls a CUTLASS kernel
|
||||
model = CutlassLayer(b, scale_a, scale_b, torch.bfloat16)
|
||||
|
||||
# Run the model with a cuda graph
|
||||
stream = torch.cuda.Stream()
|
||||
with torch.cuda.stream(stream):
|
||||
g = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(g):
|
||||
out = model(a)
|
||||
out.zero_()
|
||||
g.replay()
|
||||
|
||||
baseline = torch.mm(scale_a * a.to(dtype=torch.float32),
|
||||
scale_b * b.to(dtype=torch.float32)).to(torch.bfloat16)
|
||||
torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
|
||||
|
||||
|
||||
def test_cutlass_support_opcheck():
|
||||
opcheck(torch.ops._C.cutlass_scaled_mm_supports_fp8, (capability, ))
|
||||
1083
vllm-v0.6.2/tests/kernels/test_encoder_decoder_attn.py
Normal file
1083
vllm-v0.6.2/tests/kernels/test_encoder_decoder_attn.py
Normal file
File diff suppressed because it is too large
Load Diff
93
vllm-v0.6.2/tests/kernels/test_feed_forward.py
Normal file
93
vllm-v0.6.2/tests/kernels/test_feed_forward.py
Normal file
@@ -0,0 +1,93 @@
|
||||
import pytest
|
||||
import numpy
|
||||
import torch
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader, initialize_dummy_weights
|
||||
from vllm_mlu.model_executor.layers.feed_forward import FeedForward
|
||||
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
from ..utils import init_test_distributed_environment
|
||||
|
||||
|
||||
def compute_diff(baseline: numpy.ndarray, compare: numpy.ndarray):
|
||||
error = numpy.abs(baseline - compare)
|
||||
diff1 = numpy.sum(error) / numpy.sum(numpy.abs(baseline))
|
||||
diff2 = numpy.sqrt(numpy.sum(error**2)/numpy.sum(baseline**2))
|
||||
return diff1, diff2
|
||||
|
||||
|
||||
BATCH_SIZE = [1]
|
||||
SEQ_LENS = [128]
|
||||
HIDDEN_SIZE = [32]
|
||||
INTERMEDIATE_SIZE = [64]
|
||||
HIDDEN_ACT = ['silu', 'gelu']
|
||||
IS_GATED = [True, False]
|
||||
BIAS = [True, False]
|
||||
UP_PROJ_NAME = ['up_proj']
|
||||
DOWN_PROJ_NAME = ['down_proj']
|
||||
DTYPE = [torch.float16]
|
||||
SEED = [0]
|
||||
|
||||
@pytest.mark.parametrize("batch_size", BATCH_SIZE)
|
||||
@pytest.mark.parametrize("seq_len", SEQ_LENS)
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZE)
|
||||
@pytest.mark.parametrize("intermediate_size", INTERMEDIATE_SIZE)
|
||||
@pytest.mark.parametrize("hidden_act", HIDDEN_ACT)
|
||||
@pytest.mark.parametrize("is_gated", IS_GATED)
|
||||
@pytest.mark.parametrize("bias", BIAS)
|
||||
@pytest.mark.parametrize("up_proj_name", UP_PROJ_NAME)
|
||||
@pytest.mark.parametrize("down_proj_name", DOWN_PROJ_NAME)
|
||||
@pytest.mark.parametrize("dtype", DTYPE)
|
||||
@pytest.mark.parametrize("seed", SEED)
|
||||
@torch.inference_mode()
|
||||
def test_feed_forward(
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
hidden_size: int,
|
||||
intermediate_size: int,
|
||||
hidden_act: str,
|
||||
is_gated: bool,
|
||||
bias: bool,
|
||||
up_proj_name: str,
|
||||
down_proj_name: str,
|
||||
dtype: torch.dtype,
|
||||
seed : int
|
||||
) -> None:
|
||||
device = torch.device("mlu:0")
|
||||
set_random_seed(seed)
|
||||
|
||||
# init distributed environment
|
||||
# now only support tensor_parallel_size=1 and pipeline_parallel_size=1
|
||||
if not torch.distributed.is_initialized():
|
||||
init_test_distributed_environment(pp_size=1,
|
||||
tp_size=1,
|
||||
rank=0,
|
||||
distributed_init_port="3000",
|
||||
local_rank=0)
|
||||
|
||||
with set_default_torch_dtype(dtype):
|
||||
# create ffn and initialize weights
|
||||
ffn = FeedForward(hidden_size=hidden_size,
|
||||
intermediate_size=intermediate_size,
|
||||
hidden_act=hidden_act,
|
||||
up_proj_name=up_proj_name,
|
||||
is_gated=is_gated,
|
||||
down_proj_name=down_proj_name,
|
||||
bias=bias).to(device)
|
||||
initialize_dummy_weights(ffn, low=-1e-1, high=1e-1)
|
||||
|
||||
# create input
|
||||
hidden_states = torch.randn(batch_size, seq_len, hidden_size, dtype=dtype, device=device)
|
||||
|
||||
# ffn forward
|
||||
out = ffn(hidden_states)
|
||||
# reference ffn forward
|
||||
ref_out = ffn._forward(hidden_states)
|
||||
|
||||
# compute the diff1 and diff2 value, for fp16, the threshold is 5e-3
|
||||
diff1, diff2 = compute_diff(baseline=ref_out.cpu().float().detach().numpy(),
|
||||
compare=out.cpu().float().detach().numpy())
|
||||
|
||||
del ffn, hidden_states, out, ref_out
|
||||
|
||||
assert diff1 <= 5e-3 and diff2 <= 5e-3
|
||||
233
vllm-v0.6.2/tests/kernels/test_flash_attention.py
Normal file
233
vllm-v0.6.2/tests/kernels/test_flash_attention.py
Normal file
@@ -0,0 +1,233 @@
|
||||
import numpy as np
|
||||
import math
|
||||
import random
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm_mlu.attention.ops.triton_flash_attention import triton_attention
|
||||
class SelfAttention(torch.nn.Module):
|
||||
"""Implement the scaled dot product attention with softmax.
|
||||
Arguments
|
||||
---------
|
||||
softmax_scale: The temperature to use for the softmax attention.
|
||||
(default: 1/sqrt(d_keys) where d_keys is computed at
|
||||
runtime)
|
||||
attention_dropout: The dropout rate to apply to the attention
|
||||
(default: 0.0)
|
||||
"""
|
||||
def __init__(self, causal=False, softmax_scale=None):
|
||||
super().__init__()
|
||||
self.causal = causal
|
||||
self.softmax_scale = softmax_scale
|
||||
|
||||
# special alibi only support causal
|
||||
def build_alibi(self, slopes, block_size, n_heads, dtype):
|
||||
device ='mlu'
|
||||
tril = torch.tril(torch.ones(1,1 , block_size, block_size, device = device))
|
||||
bias_rows = torch.arange( block_size, device=device).view(1, -1)
|
||||
bias_cols = torch.arange( block_size, device=device).view(-1, 1)
|
||||
bias = - torch.sqrt(bias_cols - bias_rows)
|
||||
bias = bias.view(1, block_size, block_size) * slopes.view(-1, 1, 1)
|
||||
bias = bias.masked_fill(tril == 0, float('-inf'))
|
||||
return bias.type(dtype)
|
||||
|
||||
def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, cur_seq_len_t:torch.Tensor, alibi_slope:torch.Tensor, attn_bias:torch.Tensor):
|
||||
"""Implements the multihead softmax attention.
|
||||
Arguments
|
||||
---------
|
||||
q: The tensor containing the query. (B, T, H, D)
|
||||
k: The tensor containing the key. (B, T, H, D)
|
||||
v: The tensor containing the value. (B, T, H, D)
|
||||
cur_seq_len_t: true_seq_lens. (B+1)
|
||||
alibi_slope: (H) or (B, H)
|
||||
attn_bias: (B,H,T,T) or (B,T,T)
|
||||
"""
|
||||
batch = q.shape[0]
|
||||
seq_q = q.shape[1]
|
||||
seq_k = k.shape[1]
|
||||
head = q.shape[2]
|
||||
scores = torch.einsum('bthd,bshd->bhts', q, k )* self.softmax_scale
|
||||
# mask
|
||||
if alibi_slope is not None:
|
||||
slope = torch.zeros((batch, head)).mlu()
|
||||
if len(alibi_slope.shape) == 1 :
|
||||
slope[:,]=alibi_slope
|
||||
else:
|
||||
slope=alibi_slope
|
||||
slope = slope.reshape(batch, head, 1, 1)
|
||||
slope_bias = torch.zeros(batch, head, seq_q, seq_k).mlu()
|
||||
if self.causal:
|
||||
relative_pos = torch.arange(-seq_k + 1, 1, dtype=torch.float32).mlu()
|
||||
slope_bias = relative_pos * slope
|
||||
else:
|
||||
row_idx = torch.arange(seq_q, dtype=torch.long).reshape(-1, 1)
|
||||
col_idx = torch.arange(seq_k, dtype=torch.long)
|
||||
relative_pos = torch.abs(row_idx + seq_k - seq_q - col_idx).mlu()
|
||||
slope_bias = -slope * relative_pos.to(dtype=slope.dtype)
|
||||
# if use special alibi
|
||||
# slope_bias = self.build_alibi(alibi_slope, seq_k, head, dtype=torch.float32)
|
||||
|
||||
scores += slope_bias
|
||||
if attn_bias is not None:
|
||||
if len(attn_bias.shape) == 3:
|
||||
scores += attn_bias.unsqueeze(1)
|
||||
else:
|
||||
scores +=attn_bias
|
||||
if self.causal:
|
||||
causal_mask = torch.triu(torch.full((seq_q, seq_k), -10000.0, device=scores.device), 1)
|
||||
scores = scores + causal_mask.to(dtype=scores.dtype)
|
||||
else: # fill -inf in pad_area
|
||||
for b in range(batch):
|
||||
true_seq_len = cur_seq_len_t[b + 1] - cur_seq_len_t[b]
|
||||
scores[b, ..., true_seq_len:] = -10000.0
|
||||
scores[b, :, true_seq_len:, :] = -10000.0
|
||||
attention = torch.softmax(scores, dim=-1, dtype=v.dtype)
|
||||
output = torch.einsum('bhts,bshd->bthd', attention, v)
|
||||
return output.contiguous()
|
||||
|
||||
|
||||
NUM_HEADS = [64, 256]
|
||||
NUM_QUERIES_PER_KV = [1]
|
||||
HEAD_SIZES = [96]
|
||||
DTYPES = [torch.float16, torch.float32]
|
||||
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@torch.inference_mode()
|
||||
def test_contexted_kv_attention(num_heads: int, num_queries_per_kv: int,
|
||||
head_size: int, dtype: torch.dtype) -> None:
|
||||
"""
|
||||
split test case head_size 96 cause multi tests in one pytest will conflict memory.
|
||||
"""
|
||||
device="cuda"
|
||||
|
||||
MAX_SEQ_LEN = 1024
|
||||
MAX_CTX_LEN = 1024
|
||||
BS = 10
|
||||
cache_size = 640
|
||||
block_size = 32
|
||||
max_block_per_request = 64
|
||||
random.seed(1)
|
||||
query_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
|
||||
ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)]
|
||||
seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
|
||||
num_kv_heads = num_heads
|
||||
|
||||
num_tokens = sum(query_lens)
|
||||
max_seqlens_q = max(query_lens)
|
||||
max_seqlens_k = max(query_lens)
|
||||
cu_seqlens = [0]
|
||||
for value in query_lens:
|
||||
cu_seqlens.append(cu_seqlens[-1] + value)
|
||||
cu_seqlens_q = torch.tensor(cu_seqlens, dtype=torch.int, device=device)
|
||||
cu_seqlens_k = torch.tensor(cu_seqlens, dtype=torch.int, device=device)
|
||||
|
||||
query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype,device=device)
|
||||
query.uniform_(-1e-3, 1e-3)
|
||||
triton_output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype,device=device)
|
||||
|
||||
kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype,device=device)
|
||||
kv.uniform_(-1e-3, 1e-3)
|
||||
key, value = kv.unbind(dim=1)
|
||||
|
||||
k_cache = torch.zeros(cache_size,
|
||||
block_size, num_kv_heads, head_size, dtype=dtype, device=device)
|
||||
v_cache = torch.zeros(cache_size, block_size, num_kv_heads, head_size,
|
||||
dtype=dtype, device=device)
|
||||
k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype,device=device)
|
||||
v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype,device=device)
|
||||
values = torch.arange(0, cache_size, dtype=torch.long,device=device)
|
||||
values = values[torch.randperm(cache_size,device=device)]
|
||||
block_table = values[:BS * max_block_per_request].view(BS, max_block_per_request)
|
||||
b_seq_len = torch.tensor(seq_lens, dtype=torch.long,device=device)
|
||||
b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long,device=device)
|
||||
b_start_loc = torch.cumsum(torch.tensor(
|
||||
[0] + query_lens[:-1], dtype=torch.long,device=device), dim=0)
|
||||
max_input_len = MAX_SEQ_LEN
|
||||
# copy kv to cache
|
||||
b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
|
||||
dtype=torch.long,device=device), dim=0)
|
||||
for i in range(BS):
|
||||
for j in range(query_lens[i]):
|
||||
k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + j])
|
||||
v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + b_ctx_len[i] + j])
|
||||
cur_ctx = 0
|
||||
block_id = 0
|
||||
while cur_ctx < b_ctx_len[i]:
|
||||
start_loc = b_seq_start_loc[i] + cur_ctx
|
||||
if cur_ctx + block_size > b_ctx_len[i]:
|
||||
end_loc = b_seq_start_loc[i] + b_ctx_len[i]
|
||||
else:
|
||||
end_loc = start_loc + block_size
|
||||
start_slot = block_table[i, block_id] * block_size
|
||||
end_slot = start_slot + end_loc - start_loc
|
||||
k_cache.view(-1, num_kv_heads,
|
||||
head_size)[start_slot:end_slot].copy_(key[start_loc:end_loc])
|
||||
v_cache.view(-1, num_kv_heads,
|
||||
head_size)[start_slot:end_slot].copy_(value[start_loc:end_loc])
|
||||
cur_ctx += block_size
|
||||
block_id += 1
|
||||
# transpose K_cache[num_blocks, block_size, num_kv_heads, head_size]
|
||||
# to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8]
|
||||
k_cache = k_cache.view(-1, block_size, num_kv_heads, head_size // 8,
|
||||
8).permute(0, 2, 3, 1, 4).contiguous()
|
||||
# transpose V_cache[num_blocks, block_size, num_kv_heads, head_size]
|
||||
# to V_cache[num_blocks, num_kv_heads, head_size, block_size]
|
||||
v_cache = v_cache.view(-1, block_size, num_kv_heads,
|
||||
head_size).permute(0, 2, 3, 1).contiguous()
|
||||
|
||||
triton_output,_ = triton_attention(query,
|
||||
k,
|
||||
v,
|
||||
None,
|
||||
cu_seqlens_q,
|
||||
cu_seqlens_k,
|
||||
max_seqlens_q,
|
||||
max_seqlens_k)
|
||||
triton_output_cpu = triton_output.to(device='cpu')
|
||||
def copy_pack_data_to_pad_data(pad_input: torch.Tensor,
|
||||
packed_input_list: list,
|
||||
t_len_sequence: list,
|
||||
max_sequence_len: int):
|
||||
end_index1 = 0
|
||||
for index in range(len(t_len_sequence)):
|
||||
start_index1 = end_index1
|
||||
end_index1 = end_index1 + t_len_sequence[index]
|
||||
start_index = index * max_sequence_len
|
||||
end_index = start_index + t_len_sequence[index]
|
||||
pad_input[start_index:end_index, ...] = packed_input_list[start_index1:end_index1, ...]
|
||||
|
||||
pad_input_q = torch.zeros((MAX_SEQ_LEN * BS, num_heads, head_size)).mlu().half()
|
||||
pad_input_k = torch.zeros((MAX_SEQ_LEN * BS, num_kv_heads, head_size)).mlu().half()
|
||||
pad_input_v = torch.zeros((MAX_SEQ_LEN * BS, num_kv_heads, head_size)).mlu().half()
|
||||
copy_pack_data_to_pad_data(pad_input_q, query, query_lens, MAX_SEQ_LEN)
|
||||
copy_pack_data_to_pad_data(pad_input_k, k, query_lens, MAX_SEQ_LEN)
|
||||
copy_pack_data_to_pad_data(pad_input_v, v, query_lens, MAX_SEQ_LEN)
|
||||
softmax_scale = 1 / math.sqrt(head_size)
|
||||
attention = SelfAttention(causal = False, softmax_scale=softmax_scale)
|
||||
torch_output = attention(pad_input_q.view(BS, MAX_SEQ_LEN, num_heads, head_size),
|
||||
pad_input_k.view(BS, MAX_SEQ_LEN, num_kv_heads, head_size),
|
||||
pad_input_v.view(BS, MAX_SEQ_LEN, num_kv_heads, head_size),
|
||||
cu_seqlens_q,None,None)
|
||||
pad_triton_output = torch_output.clone().view(BS * MAX_SEQ_LEN, num_heads, head_size)
|
||||
copy_pack_data_to_pad_data(pad_triton_output, triton_output_cpu, query_lens, MAX_SEQ_LEN)
|
||||
view_triton_output = pad_triton_output.view(BS, MAX_SEQ_LEN, num_heads, head_size)
|
||||
torch.testing.assert_close(view_triton_output, torch_output)
|
||||
|
||||
|
||||
HEAD_SIZES = [24, 128]
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@torch.inference_mode()
|
||||
def test_contexted_kv_attention_1(num_heads: int, num_queries_per_kv: int, head_size: int,
|
||||
dtype: torch.dtype) -> None:
|
||||
"""
|
||||
split test case ihead_size 24, 128 cause multi tests in one pytest will conflict memory.
|
||||
"""
|
||||
test_contexted_kv_attention(num_heads, num_queries_per_kv, head_size, dtype)
|
||||
227
vllm-v0.6.2/tests/kernels/test_flash_attn.py
Normal file
227
vllm-v0.6.2/tests/kernels/test_flash_attn.py
Normal file
@@ -0,0 +1,227 @@
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.vllm_flash_attn import (flash_attn_varlen_func,
|
||||
flash_attn_with_kvcache)
|
||||
|
||||
NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
|
||||
HEAD_SIZES = [128, 256]
|
||||
BLOCK_SIZES = [16, 32]
|
||||
DTYPES = [torch.float16, torch.bfloat16]
|
||||
# one value large enough to test overflow in index calculation.
|
||||
# one value small enough to test the schema op check
|
||||
NUM_BLOCKS = [32768, 2048]
|
||||
|
||||
|
||||
def ref_paged_attn(
|
||||
query: torch.Tensor,
|
||||
key_cache: torch.Tensor,
|
||||
value_cache: torch.Tensor,
|
||||
query_lens: List[int],
|
||||
kv_lens: List[int],
|
||||
block_tables: torch.Tensor,
|
||||
scale: float,
|
||||
sliding_window: Optional[int] = None,
|
||||
soft_cap: Optional[float] = None,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(query_lens)
|
||||
block_tables = block_tables.cpu().numpy()
|
||||
_, block_size, num_kv_heads, head_size = key_cache.shape
|
||||
|
||||
outputs: List[torch.Tensor] = []
|
||||
start_idx = 0
|
||||
for i in range(num_seqs):
|
||||
query_len = query_lens[i]
|
||||
kv_len = kv_lens[i]
|
||||
q = query[start_idx:start_idx + query_len]
|
||||
q *= scale
|
||||
|
||||
num_kv_blocks = (kv_len + block_size - 1) // block_size
|
||||
block_indices = block_tables[i, :num_kv_blocks]
|
||||
|
||||
k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
|
||||
k = k[:kv_len]
|
||||
v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
|
||||
v = v[:kv_len]
|
||||
|
||||
if q.shape[1] != k.shape[1]:
|
||||
k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
|
||||
v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
|
||||
attn = torch.einsum("qhd,khd->hqk", q, k).float()
|
||||
empty_mask = torch.ones(query_len, kv_len)
|
||||
mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
|
||||
if sliding_window is not None:
|
||||
sliding_window_mask = torch.triu(empty_mask,
|
||||
diagonal=kv_len -
|
||||
(query_len + sliding_window) +
|
||||
1).bool().logical_not()
|
||||
mask |= sliding_window_mask
|
||||
if soft_cap is not None:
|
||||
attn = soft_cap * torch.tanh(attn / soft_cap)
|
||||
attn.masked_fill_(mask, float("-inf"))
|
||||
attn = torch.softmax(attn, dim=-1).to(v.dtype)
|
||||
out = torch.einsum("hqk,khd->qhd", attn, v)
|
||||
|
||||
outputs.append(out)
|
||||
start_idx += query_len
|
||||
|
||||
return torch.cat(outputs, dim=0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("sliding_window", [None, 256])
|
||||
@torch.inference_mode()
|
||||
def test_flash_attn_with_paged_kv(
|
||||
kv_lens: List[int],
|
||||
num_heads: Tuple[int, int],
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
num_blocks: int,
|
||||
sliding_window: Optional[int],
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
num_seqs = len(kv_lens)
|
||||
num_query_heads = num_heads[0]
|
||||
num_kv_heads = num_heads[1]
|
||||
assert num_query_heads % num_kv_heads == 0
|
||||
max_kv_len = max(kv_lens)
|
||||
scale = head_size**-0.5
|
||||
window_size = ((sliding_window - 1, 0) if sliding_window is not None else
|
||||
(-1, -1))
|
||||
|
||||
query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
|
||||
key_cache = torch.randn(num_blocks,
|
||||
block_size,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
value_cache = torch.randn_like(key_cache)
|
||||
kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32)
|
||||
|
||||
max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
|
||||
block_tables = torch.randint(0,
|
||||
num_blocks,
|
||||
(num_seqs, max_num_blocks_per_seq),
|
||||
dtype=torch.int32)
|
||||
|
||||
output = flash_attn_with_kvcache(
|
||||
q=query.unsqueeze(1),
|
||||
k_cache=key_cache,
|
||||
v_cache=value_cache,
|
||||
softmax_scale=scale,
|
||||
causal=True,
|
||||
block_table=block_tables,
|
||||
cache_seqlens=kv_lens_tensor,
|
||||
softcap=soft_cap if soft_cap is not None else 0,
|
||||
window_size=window_size,
|
||||
).squeeze(1)
|
||||
|
||||
ref_output = ref_paged_attn(query=query,
|
||||
key_cache=key_cache,
|
||||
value_cache=value_cache,
|
||||
query_lens=[1] * num_seqs,
|
||||
kv_lens=kv_lens,
|
||||
block_tables=block_tables,
|
||||
scale=scale,
|
||||
soft_cap=soft_cap,
|
||||
sliding_window=sliding_window)
|
||||
torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \
|
||||
f"{torch.max(torch.abs(output - ref_output))}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_lens", [[(1, 1328), (5, 18), (129, 463)]])
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("sliding_window", [None, 256])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@torch.inference_mode()
|
||||
def test_varlen_with_paged_kv(
|
||||
seq_lens: List[Tuple[int, int]],
|
||||
num_heads: Tuple[int, int],
|
||||
head_size: int,
|
||||
sliding_window: Optional[int],
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
num_blocks: int,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
num_seqs = len(seq_lens)
|
||||
query_lens = [x[0] for x in seq_lens]
|
||||
kv_lens = [x[1] for x in seq_lens]
|
||||
num_query_heads = num_heads[0]
|
||||
num_kv_heads = num_heads[1]
|
||||
assert num_query_heads % num_kv_heads == 0
|
||||
max_query_len = max(query_lens)
|
||||
max_kv_len = max(kv_lens)
|
||||
window_size = ((sliding_window - 1, 0) if sliding_window is not None else
|
||||
(-1, -1))
|
||||
scale = head_size**-0.5
|
||||
|
||||
query = torch.randn(sum(query_lens),
|
||||
num_query_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
key_cache = torch.randn(num_blocks,
|
||||
block_size,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
value_cache = torch.randn_like(key_cache)
|
||||
cu_query_lens = torch.tensor([0] + query_lens,
|
||||
dtype=torch.int32).cumsum(dim=0,
|
||||
dtype=torch.int32)
|
||||
cu_kv_lens = torch.tensor([0] + kv_lens,
|
||||
dtype=torch.int32).cumsum(dim=0,
|
||||
dtype=torch.int32)
|
||||
|
||||
max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
|
||||
block_tables = torch.randint(0,
|
||||
num_blocks,
|
||||
(num_seqs, max_num_blocks_per_seq),
|
||||
dtype=torch.int32)
|
||||
|
||||
output = flash_attn_varlen_func(
|
||||
q=query,
|
||||
k=key_cache,
|
||||
v=value_cache,
|
||||
cu_seqlens_q=cu_query_lens,
|
||||
cu_seqlens_k=cu_kv_lens,
|
||||
max_seqlen_q=max_query_len,
|
||||
max_seqlen_k=max_kv_len,
|
||||
softmax_scale=scale,
|
||||
causal=True,
|
||||
window_size=window_size,
|
||||
block_table=block_tables,
|
||||
softcap=soft_cap if soft_cap is not None else 0,
|
||||
)
|
||||
|
||||
ref_output = ref_paged_attn(
|
||||
query=query,
|
||||
key_cache=key_cache,
|
||||
value_cache=value_cache,
|
||||
query_lens=query_lens,
|
||||
kv_lens=kv_lens,
|
||||
block_tables=block_tables,
|
||||
scale=scale,
|
||||
sliding_window=sliding_window,
|
||||
soft_cap=soft_cap,
|
||||
)
|
||||
torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \
|
||||
f"{torch.max(torch.abs(output - ref_output))}"
|
||||
470
vllm-v0.6.2/tests/kernels/test_flashinfer.py
Normal file
470
vllm-v0.6.2/tests/kernels/test_flashinfer.py
Normal file
@@ -0,0 +1,470 @@
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import flashinfer
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
NUM_HEADS = [(16, 16), (32, 8), (64, 8), (6, 1)]
|
||||
HEAD_SIZES = [128, 256]
|
||||
BLOCK_SIZES = [16, 32]
|
||||
DTYPES = [torch.float16, torch.bfloat16]
|
||||
NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation.
|
||||
|
||||
|
||||
def ref_paged_attn(
|
||||
query: torch.Tensor,
|
||||
key_cache: torch.Tensor,
|
||||
value_cache: torch.Tensor,
|
||||
query_lens: List[int],
|
||||
kv_lens: List[int],
|
||||
block_tables: torch.Tensor,
|
||||
scale: float,
|
||||
sliding_window: Optional[int] = None,
|
||||
soft_cap: Optional[float] = None,
|
||||
) -> torch.Tensor:
|
||||
num_seqs = len(query_lens)
|
||||
block_tables = block_tables.cpu().numpy()
|
||||
_, block_size, num_kv_heads, head_size = key_cache.shape
|
||||
|
||||
outputs: List[torch.Tensor] = []
|
||||
start_idx = 0
|
||||
for i in range(num_seqs):
|
||||
query_len = query_lens[i]
|
||||
kv_len = kv_lens[i]
|
||||
q = query[start_idx:start_idx + query_len]
|
||||
q *= scale
|
||||
|
||||
num_kv_blocks = (kv_len + block_size - 1) // block_size
|
||||
block_indices = block_tables[i, :num_kv_blocks]
|
||||
|
||||
k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
|
||||
k = k[:kv_len]
|
||||
v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
|
||||
v = v[:kv_len]
|
||||
|
||||
if q.shape[1] != k.shape[1]:
|
||||
k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
|
||||
v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
|
||||
attn = torch.einsum("qhd,khd->hqk", q, k).float()
|
||||
empty_mask = torch.ones(query_len, kv_len)
|
||||
mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
|
||||
if sliding_window is not None:
|
||||
sliding_window_mask = torch.triu(empty_mask,
|
||||
diagonal=kv_len -
|
||||
(query_len + sliding_window) +
|
||||
1).bool().logical_not()
|
||||
mask |= sliding_window_mask
|
||||
if soft_cap is not None:
|
||||
attn = soft_cap * torch.tanh(attn / soft_cap)
|
||||
attn.masked_fill_(mask, float("-inf"))
|
||||
attn = torch.softmax(attn, dim=-1).to(v.dtype)
|
||||
out = torch.einsum("hqk,khd->qhd", attn, v)
|
||||
|
||||
outputs.append(out)
|
||||
start_idx += query_len
|
||||
|
||||
return torch.cat(outputs, dim=0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
|
||||
@torch.inference_mode
|
||||
def test_flashinfer_decode_with_paged_kv(
|
||||
kv_lens: List[int],
|
||||
num_heads: Tuple[int, int],
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
num_seqs = len(kv_lens)
|
||||
num_query_heads = num_heads[0]
|
||||
num_kv_heads = num_heads[1]
|
||||
assert num_query_heads % num_kv_heads == 0
|
||||
max_kv_len = max(kv_lens)
|
||||
scale = head_size**-0.5
|
||||
|
||||
query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
|
||||
|
||||
key_value_cache = torch.randn(NUM_BLOCKS,
|
||||
2,
|
||||
block_size,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
key_cache = key_value_cache[:, 0, :, :, :].squeeze(1)
|
||||
value_cache = key_value_cache[:, 1, :, :, :].squeeze(1)
|
||||
|
||||
max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
|
||||
block_tables = torch.randint(0,
|
||||
NUM_BLOCKS,
|
||||
(num_seqs, max_num_blocks_per_seq),
|
||||
dtype=torch.int32)
|
||||
|
||||
kv_indptr = [0]
|
||||
kv_indices = []
|
||||
kv_last_page_lens = []
|
||||
for i in range(num_seqs):
|
||||
seq_len = kv_lens[i]
|
||||
assert seq_len > 0
|
||||
num_blocks = (seq_len + block_size - 1) // block_size
|
||||
kv_indices.extend(block_tables[i, :num_blocks])
|
||||
kv_indptr.append(kv_indptr[-1] + num_blocks)
|
||||
kv_last_page_len = seq_len % block_size
|
||||
if kv_last_page_len == 0:
|
||||
kv_last_page_len = block_size
|
||||
kv_last_page_lens.append(kv_last_page_len)
|
||||
|
||||
kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
|
||||
kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
|
||||
kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
|
||||
|
||||
workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
|
||||
wrapper = flashinfer.\
|
||||
BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD",
|
||||
use_tensor_cores=(
|
||||
(num_query_heads//num_kv_heads) > 4)
|
||||
)
|
||||
wrapper.begin_forward(kv_indptr,
|
||||
kv_indices,
|
||||
kv_last_page_lens,
|
||||
num_query_heads,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
block_size,
|
||||
"NONE",
|
||||
data_type=dtype)
|
||||
|
||||
output = wrapper.forward(query, key_value_cache, logits_soft_cap=soft_cap)
|
||||
|
||||
ref_output = ref_paged_attn(query=query,
|
||||
key_cache=key_cache,
|
||||
value_cache=value_cache,
|
||||
query_lens=[1] * num_seqs,
|
||||
kv_lens=kv_lens,
|
||||
block_tables=block_tables,
|
||||
scale=scale,
|
||||
soft_cap=soft_cap)
|
||||
torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \
|
||||
f"{torch.max(torch.abs(output - ref_output))}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_lens", [[(1, 1328), (5, 18), (129, 463)]])
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
|
||||
@torch.inference_mode
|
||||
def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
|
||||
num_heads: Tuple[int, int],
|
||||
head_size: int, dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float]) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
num_seqs = len(seq_lens)
|
||||
query_lens = [x[0] for x in seq_lens]
|
||||
kv_lens = [x[1] for x in seq_lens]
|
||||
num_query_heads = num_heads[0]
|
||||
num_kv_heads = num_heads[1]
|
||||
assert num_query_heads % num_kv_heads == 0
|
||||
max_kv_len = max(kv_lens)
|
||||
scale = head_size**-0.5
|
||||
|
||||
query = torch.randn(sum(query_lens),
|
||||
num_query_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
key_value_cache = torch.randn(NUM_BLOCKS,
|
||||
2,
|
||||
block_size,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
key_cache = key_value_cache[:, 0, :, :, :].squeeze(1)
|
||||
value_cache = key_value_cache[:, 1, :, :, :].squeeze(1)
|
||||
|
||||
# Normalize the scale of the key and value caches to mitigate
|
||||
# numerical instability.
|
||||
key_cache /= head_size**0.5
|
||||
value_cache /= head_size**0.5
|
||||
|
||||
max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
|
||||
block_tables = torch.randint(0,
|
||||
NUM_BLOCKS,
|
||||
(num_seqs, max_num_blocks_per_seq),
|
||||
dtype=torch.int32)
|
||||
|
||||
qo_indptr = [0]
|
||||
kv_indptr = [0]
|
||||
kv_indices = []
|
||||
kv_last_page_lens = []
|
||||
for i in range(num_seqs):
|
||||
seq_len = kv_lens[i]
|
||||
assert seq_len > 0
|
||||
num_blocks = (seq_len + block_size - 1) // block_size
|
||||
kv_indices.extend(block_tables[i, :num_blocks])
|
||||
kv_indptr.append(kv_indptr[-1] + num_blocks)
|
||||
kv_last_page_len = seq_len % block_size
|
||||
if kv_last_page_len == 0:
|
||||
kv_last_page_len = block_size
|
||||
kv_last_page_lens.append(kv_last_page_len)
|
||||
qo_indptr.append(qo_indptr[-1] + query_lens[i])
|
||||
|
||||
qo_indptr = torch.tensor(qo_indptr, dtype=torch.int32)
|
||||
kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
|
||||
kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
|
||||
kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
|
||||
|
||||
workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
|
||||
wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
|
||||
workspace_buffer, "NHD")
|
||||
wrapper.begin_forward(
|
||||
qo_indptr,
|
||||
kv_indptr,
|
||||
kv_indices,
|
||||
kv_last_page_lens,
|
||||
num_query_heads,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
block_size,
|
||||
)
|
||||
|
||||
output = wrapper.forward(
|
||||
query,
|
||||
key_value_cache,
|
||||
logits_soft_cap=soft_cap,
|
||||
)
|
||||
|
||||
ref_output = ref_paged_attn(query=query,
|
||||
key_cache=key_cache,
|
||||
value_cache=value_cache,
|
||||
query_lens=query_lens,
|
||||
kv_lens=kv_lens,
|
||||
block_tables=block_tables,
|
||||
scale=scale,
|
||||
soft_cap=soft_cap)
|
||||
torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \
|
||||
f"{torch.max(torch.abs(output - ref_output))}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_lens", [[(1, 132), (5, 18)]])
|
||||
@pytest.mark.parametrize("num_heads", [(32, 8), (6, 1)])
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
|
||||
def test_flashinfer_prefill_with_paged_fp8_kv(
|
||||
seq_lens: List[Tuple[int, int]], num_heads: Tuple[int, int],
|
||||
head_size: int, dtype: torch.dtype, block_size: int,
|
||||
soft_cap: Optional[float]) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
num_seqs = len(seq_lens)
|
||||
query_lens = [x[0] for x in seq_lens]
|
||||
kv_lens = [x[1] for x in seq_lens]
|
||||
num_query_heads = num_heads[0]
|
||||
num_kv_heads = num_heads[1]
|
||||
assert num_query_heads % num_kv_heads == 0
|
||||
max_kv_len = max(kv_lens)
|
||||
scale = head_size**-0.5
|
||||
|
||||
kv_cache_dtype = torch.float8_e4m3fn
|
||||
|
||||
query = torch.randn(sum(query_lens),
|
||||
num_query_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
NUM_BLOCKS_FP8 = 2048
|
||||
key_value_cache = torch.randn(NUM_BLOCKS_FP8,
|
||||
2,
|
||||
block_size,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
key_cache, value_cache = torch.chunk(key_value_cache, 2, dim=1)
|
||||
key_cache /= head_size**0.5
|
||||
value_cache /= head_size**0.5
|
||||
|
||||
k_scale = key_cache.amax().item() / 448.0
|
||||
v_scale = value_cache.amax().item() / 448.0
|
||||
|
||||
kv_cache_fp8 = torch.cat([key_cache / k_scale, value_cache / v_scale],
|
||||
dim=1).to(kv_cache_dtype)
|
||||
|
||||
assert (kv_cache_fp8.shape == key_value_cache.shape)
|
||||
max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
|
||||
block_tables = torch.randint(0,
|
||||
NUM_BLOCKS_FP8,
|
||||
(num_seqs, max_num_blocks_per_seq),
|
||||
dtype=torch.int32)
|
||||
|
||||
qo_indptr = [0]
|
||||
kv_indptr = [0]
|
||||
kv_indices = []
|
||||
kv_last_page_lens = []
|
||||
for i in range(num_seqs):
|
||||
seq_len = kv_lens[i]
|
||||
assert seq_len > 0
|
||||
num_blocks = (seq_len + block_size - 1) // block_size
|
||||
kv_indices.extend(block_tables[i, :num_blocks])
|
||||
kv_indptr.append(kv_indptr[-1] + num_blocks)
|
||||
kv_last_page_len = seq_len % block_size
|
||||
if kv_last_page_len == 0:
|
||||
kv_last_page_len = block_size
|
||||
kv_last_page_lens.append(kv_last_page_len)
|
||||
qo_indptr.append(qo_indptr[-1] + query_lens[i])
|
||||
|
||||
qo_indptr = torch.tensor(qo_indptr, dtype=torch.int32)
|
||||
kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
|
||||
kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
|
||||
kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
|
||||
|
||||
workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
|
||||
wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
|
||||
workspace_buffer, "NHD")
|
||||
wrapper.begin_forward(
|
||||
qo_indptr,
|
||||
kv_indptr,
|
||||
kv_indices,
|
||||
kv_last_page_lens,
|
||||
num_query_heads,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
block_size,
|
||||
)
|
||||
|
||||
output = wrapper.forward(query,
|
||||
kv_cache_fp8,
|
||||
logits_soft_cap=soft_cap,
|
||||
k_scale=k_scale,
|
||||
v_scale=v_scale)
|
||||
|
||||
ref_output = ref_paged_attn(query=query,
|
||||
key_cache=key_cache.squeeze(1),
|
||||
value_cache=value_cache.squeeze(1),
|
||||
query_lens=query_lens,
|
||||
kv_lens=kv_lens,
|
||||
block_tables=block_tables,
|
||||
scale=scale,
|
||||
soft_cap=soft_cap)
|
||||
del query
|
||||
del block_tables
|
||||
# verify prefill fp8
|
||||
torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \
|
||||
f"{torch.max(torch.abs(output - ref_output))}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
|
||||
@pytest.mark.parametrize("num_heads", [(32, 8), (64, 8), (6, 1)])
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
|
||||
@torch.inference_mode
|
||||
def test_flashinfer_decode_with_paged_fp8_kv(
|
||||
kv_lens: List[int],
|
||||
num_heads: Tuple[int, int],
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: Optional[float],
|
||||
) -> None:
|
||||
# test doesn't work for num_heads = (16,16)
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
num_seqs = len(kv_lens)
|
||||
num_query_heads = num_heads[0]
|
||||
num_kv_heads = num_heads[1]
|
||||
assert num_query_heads % num_kv_heads == 0
|
||||
max_kv_len = max(kv_lens)
|
||||
scale = head_size**-0.5
|
||||
use_tensor_cores = (num_query_heads // num_kv_heads) > 4
|
||||
kv_cache_dtype = torch.float8_e4m3fn
|
||||
|
||||
query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
|
||||
NUM_BLOCKS_FP8 = 2048
|
||||
key_value_cache = torch.randn(NUM_BLOCKS_FP8,
|
||||
2,
|
||||
block_size,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
key_cache, value_cache = torch.chunk(key_value_cache, 2, dim=1)
|
||||
key_cache /= head_size**0.5
|
||||
value_cache /= head_size**0.5
|
||||
|
||||
k_scale = key_cache.amax().item() / 448.0
|
||||
v_scale = value_cache.amax().item() / 448.0
|
||||
|
||||
key_cache_fp8 = (key_cache / k_scale).to(kv_cache_dtype)
|
||||
value_cache_fp8 = (value_cache / v_scale).to(kv_cache_dtype)
|
||||
assert (key_cache_fp8.shape[1] == 1 and value_cache_fp8.shape[1] == 1)
|
||||
kv_cache_fp8 = torch.cat([key_cache_fp8, value_cache_fp8], dim=1)
|
||||
|
||||
max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
|
||||
block_tables = torch.randint(0,
|
||||
NUM_BLOCKS_FP8,
|
||||
(num_seqs, max_num_blocks_per_seq),
|
||||
dtype=torch.int32)
|
||||
|
||||
kv_indptr = [0]
|
||||
kv_indices = []
|
||||
kv_last_page_lens = []
|
||||
for i in range(num_seqs):
|
||||
seq_len = kv_lens[i]
|
||||
assert seq_len > 0
|
||||
num_blocks = (seq_len + block_size - 1) // block_size
|
||||
kv_indices.extend(block_tables[i, :num_blocks])
|
||||
kv_indptr.append(kv_indptr[-1] + num_blocks)
|
||||
kv_last_page_len = seq_len % block_size
|
||||
if kv_last_page_len == 0:
|
||||
kv_last_page_len = block_size
|
||||
kv_last_page_lens.append(kv_last_page_len)
|
||||
|
||||
kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
|
||||
kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
|
||||
kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
|
||||
|
||||
workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
|
||||
wrapper = flashinfer.\
|
||||
BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD",
|
||||
use_tensor_cores=use_tensor_cores)
|
||||
wrapper.begin_forward(kv_indptr,
|
||||
kv_indices,
|
||||
kv_last_page_lens,
|
||||
num_query_heads,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
block_size,
|
||||
"NONE",
|
||||
data_type=dtype,
|
||||
q_data_type=dtype)
|
||||
output = wrapper.forward(query,
|
||||
kv_cache_fp8,
|
||||
logits_soft_cap=soft_cap,
|
||||
k_scale=k_scale,
|
||||
v_scale=v_scale)
|
||||
key_cache = key_value_cache[:, 0, :, :, :].squeeze(1)
|
||||
value_cache = key_value_cache[:, 1, :, :, :].squeeze(1)
|
||||
|
||||
ref_output = ref_paged_attn(query=query,
|
||||
key_cache=key_cache,
|
||||
value_cache=value_cache,
|
||||
query_lens=[1] * num_seqs,
|
||||
kv_lens=kv_lens,
|
||||
block_tables=block_tables,
|
||||
scale=scale,
|
||||
soft_cap=soft_cap)
|
||||
# Temporary fix: Increasing the tolerance. Seems like a flashinfer issue
|
||||
torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \
|
||||
f"{torch.max(torch.abs(output - ref_output))}"
|
||||
114
vllm-v0.6.2/tests/kernels/test_fp8_quant.py
Normal file
114
vllm-v0.6.2/tests/kernels/test_fp8_quant.py
Normal file
@@ -0,0 +1,114 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from tests.kernels.quant_utils import (FP8_DTYPE,
|
||||
ref_dynamic_per_tensor_fp8_quant,
|
||||
ref_dynamic_per_token_quant)
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||
HIDDEN_SIZES = [1, 2, 3, 4, 16, 67, 768, 2048, 5120, 5137, 8192,
|
||||
8193] # Arbitrary values for testing
|
||||
HIDDEN_SIZES += list(range(1024, 1033)) # vectorized conversion edge cases
|
||||
NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing
|
||||
SCALE_UBS = [True, False]
|
||||
SEEDS = [0]
|
||||
|
||||
|
||||
def opcheck_fp8_quant(output,
|
||||
input,
|
||||
scale=None,
|
||||
scale_ub=None,
|
||||
use_per_token_if_dynamic=False):
|
||||
if scale is not None:
|
||||
opcheck(torch.ops._C.static_scaled_fp8_quant, (output, input, scale))
|
||||
elif use_per_token_if_dynamic:
|
||||
scale = torch.empty((input.shape[0], 1),
|
||||
device=input.device,
|
||||
dtype=torch.float32)
|
||||
opcheck(torch.ops._C.dynamic_per_token_scaled_fp8_quant,
|
||||
(output, input, scale, scale_ub))
|
||||
else:
|
||||
scale = torch.empty((input.numel() // input.shape[-1], 1),
|
||||
device=input.device,
|
||||
dtype=torch.float32)
|
||||
opcheck(torch.ops._C.dynamic_scaled_fp8_quant, (output, input, scale))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("scale_ub", SCALE_UBS)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@torch.inference_mode()
|
||||
def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
|
||||
dtype: torch.dtype, scale_ub: bool,
|
||||
seed: int) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
x = torch.rand(num_tokens, hidden_size, dtype=dtype,
|
||||
device="cuda") + 1e-6 # avoid nans
|
||||
|
||||
scale_ub = torch.mean(x).to(dtype=torch.float32, device='cuda') \
|
||||
if scale_ub else None
|
||||
ref_out, ref_scales = ref_dynamic_per_token_quant(x, FP8_DTYPE, scale_ub)
|
||||
ops_out, ops_scales = ops.scaled_fp8_quant(x,
|
||||
scale_ub=scale_ub,
|
||||
use_per_token_if_dynamic=True)
|
||||
|
||||
torch.testing.assert_close(ref_scales, ops_scales)
|
||||
torch.testing.assert_close(ref_out.to(dtype=torch.float32),
|
||||
ops_out.to(dtype=torch.float32))
|
||||
|
||||
opcheck_fp8_quant(ops_out,
|
||||
x,
|
||||
None,
|
||||
scale_ub,
|
||||
use_per_token_if_dynamic=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@torch.inference_mode()
|
||||
def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
|
||||
dtype: torch.dtype, seed: int) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
|
||||
|
||||
ref_out, ref_scale = ref_dynamic_per_tensor_fp8_quant(x)
|
||||
ops_out, ops_scale = ops.scaled_fp8_quant(x)
|
||||
|
||||
torch.testing.assert_close(ref_scale, ops_scale)
|
||||
torch.testing.assert_close(ref_out.to(dtype=torch.float32),
|
||||
ops_out.to(dtype=torch.float32))
|
||||
|
||||
opcheck_fp8_quant(ops_out, x)
|
||||
|
||||
|
||||
# Regression test for a case with large activations where an int32 index cannot
|
||||
# represent the number of elements.
|
||||
@torch.inference_mode()
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
def test_fp8_quant_large(seed: int) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings
|
||||
hidden_size = 1152 # Smallest hidden_size to reproduce the error
|
||||
dtype = torch.bfloat16
|
||||
|
||||
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
|
||||
ref_out, scale = ref_dynamic_per_tensor_fp8_quant(x)
|
||||
ops_out, _ = ops.scaled_fp8_quant(x, scale)
|
||||
|
||||
# Minimize memory footprint in this test by freeing x and upconverting
|
||||
# the outputs in place. (torch.allclose does not support fp8)
|
||||
del x
|
||||
ref_out = ref_out.to(dtype=dtype)
|
||||
ops_out = ops_out.to(dtype=dtype)
|
||||
|
||||
torch.testing.assert_close(ref_out, ops_out)
|
||||
22
vllm-v0.6.2/tests/kernels/test_ggml.py
Normal file
22
vllm-v0.6.2/tests/kernels/test_ggml.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import gguf
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm import _custom_ops as ops # noqa: F401
|
||||
|
||||
|
||||
@pytest.mark.parametrize("quant_type", [12])
|
||||
def test_ggml_opcheck(quant_type):
|
||||
block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type]
|
||||
shape = [256, 1152]
|
||||
qweight = torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8)
|
||||
m = qweight.shape[0]
|
||||
n = qweight.shape[1] // type_size * block_size
|
||||
opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n))
|
||||
|
||||
x = torch.rand((m, 512), device='cuda', dtype=torch.float16)
|
||||
opcheck(torch.ops._C.ggml_mul_mat_a8,
|
||||
(qweight, x, quant_type, qweight.shape[0]))
|
||||
opcheck(torch.ops._C.ggml_mul_mat_vec_a8,
|
||||
(qweight, x, quant_type, qweight.shape[0]))
|
||||
127
vllm-v0.6.2/tests/kernels/test_gguf.py
Normal file
127
vllm-v0.6.2/tests/kernels/test_gguf.py
Normal file
@@ -0,0 +1,127 @@
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
|
||||
|
||||
|
||||
def get_gguf_sample_tensors(
|
||||
hidden_size: int,
|
||||
quant_type: GGMLQuantizationType) -> List[ReaderTensor]:
|
||||
sample_dir = GGUF_SAMPLE
|
||||
filename = f"Quant_{quant_type.name}_{hidden_size}.gguf"
|
||||
sample_file = Path(sample_dir) / filename
|
||||
return GGUFReader(sample_file).tensors
|
||||
|
||||
|
||||
DTYPES = [torch.half]
|
||||
# Hidden_size for testing, must match the sample file in HF repo,
|
||||
# we have `hidden_size = 256, 1024` for test in HF repo currently.
|
||||
HIDDEN_SIZES = [256, 1024]
|
||||
NUM_TOKENS = [7, 83, 128, 2048] # Arbitrary values for testing
|
||||
SEEDS = [0]
|
||||
QUANT_TYPES = [
|
||||
# i-matrix
|
||||
GGMLQuantizationType.IQ1_M,
|
||||
GGMLQuantizationType.IQ1_S,
|
||||
GGMLQuantizationType.IQ2_S,
|
||||
GGMLQuantizationType.IQ2_XS,
|
||||
GGMLQuantizationType.IQ3_S,
|
||||
GGMLQuantizationType.IQ3_XXS,
|
||||
GGMLQuantizationType.IQ4_NL,
|
||||
GGMLQuantizationType.IQ4_XS,
|
||||
# k-quants
|
||||
GGMLQuantizationType.Q2_K,
|
||||
GGMLQuantizationType.Q3_K,
|
||||
GGMLQuantizationType.Q4_K,
|
||||
GGMLQuantizationType.Q5_K,
|
||||
GGMLQuantizationType.Q6_K,
|
||||
# standard quantization
|
||||
GGMLQuantizationType.Q4_0,
|
||||
GGMLQuantizationType.Q5_0,
|
||||
GGMLQuantizationType.Q8_0,
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("quant_type", QUANT_TYPES)
|
||||
@torch.inference_mode()
|
||||
def test_dequantize(hidden_size: int, dtype: torch.dtype,
|
||||
quant_type: GGMLQuantizationType):
|
||||
tensors = get_gguf_sample_tensors(hidden_size, quant_type)
|
||||
for tensor in tensors:
|
||||
shape_str = tensor.name.split("_")[-1]
|
||||
shape = map(int, shape_str.split("x"))
|
||||
|
||||
ref_output = torch.tensor(dequantize(tensor.data, quant_type),
|
||||
device="cuda").to(dtype)
|
||||
output = ops.ggml_dequantize(torch.tensor(tensor.data, device="cuda"),
|
||||
quant_type, *list(shape)).to(dtype)
|
||||
|
||||
torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=4e-2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("quant_type", QUANT_TYPES)
|
||||
@torch.inference_mode()
|
||||
def test_mmvq(hidden_size: int, dtype: torch.dtype,
|
||||
quant_type: GGMLQuantizationType):
|
||||
current_platform.seed_everything(0)
|
||||
|
||||
tensors = get_gguf_sample_tensors(hidden_size, quant_type)
|
||||
x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")
|
||||
for tensor in tensors:
|
||||
weight = torch.tensor(dequantize(tensor.data, quant_type),
|
||||
device="cuda").to(dtype)
|
||||
ref_output = x @ weight.T
|
||||
|
||||
qweight = torch.tensor(tensor.data, device="cuda")
|
||||
output = ops.ggml_mul_mat_vec_a8(qweight, x, quant_type,
|
||||
qweight.shape[0]).to(dtype)
|
||||
|
||||
torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize(
|
||||
"quant_type",
|
||||
[
|
||||
# k-quants
|
||||
GGMLQuantizationType.Q2_K,
|
||||
GGMLQuantizationType.Q3_K,
|
||||
GGMLQuantizationType.Q4_K,
|
||||
GGMLQuantizationType.Q5_K,
|
||||
GGMLQuantizationType.Q6_K,
|
||||
# standard quants
|
||||
GGMLQuantizationType.Q4_0,
|
||||
GGMLQuantizationType.Q5_0,
|
||||
GGMLQuantizationType.Q8_0,
|
||||
])
|
||||
@torch.inference_mode()
|
||||
def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype,
|
||||
quant_type: GGMLQuantizationType):
|
||||
current_platform.seed_everything(0)
|
||||
|
||||
tensors = get_gguf_sample_tensors(hidden_size, quant_type)
|
||||
x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")
|
||||
for tensor in tensors:
|
||||
weight = torch.tensor(dequantize(tensor.data, quant_type),
|
||||
device="cuda").to(dtype)
|
||||
ref_output = x @ weight.T
|
||||
|
||||
qweight = torch.tensor(tensor.data, device="cuda")
|
||||
output = ops.ggml_mul_mat_a8(qweight, x, quant_type,
|
||||
qweight.shape[0]).to(dtype)
|
||||
|
||||
torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1)
|
||||
29
vllm-v0.6.2/tests/kernels/test_gptq.py
Normal file
29
vllm-v0.6.2/tests/kernels/test_gptq.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm import _custom_ops as ops # noqa: F401
|
||||
|
||||
|
||||
def test_gptq_shuffle_opcheck():
|
||||
weight = torch.randint(-2000000,
|
||||
2000000, (1792, 4096),
|
||||
device='cuda',
|
||||
dtype=torch.int32)
|
||||
perm = torch.empty((0, ), device='cuda', dtype=torch.int32)
|
||||
bit = 4
|
||||
opcheck(torch.ops._C.gptq_shuffle, (weight, perm, bit))
|
||||
|
||||
|
||||
def test_gptq_gemm_opcheck():
|
||||
a = torch.rand((240, 4096), device='cuda', dtype=torch.float16)
|
||||
weight = torch.randint(-2000000,
|
||||
2000000, (512, 6144),
|
||||
device='cuda',
|
||||
dtype=torch.int32)
|
||||
zeros = torch.zeros((32, 768), device='cuda', dtype=torch.int32)
|
||||
scales = torch.rand((32, 6144), device='cuda', dtype=torch.float16)
|
||||
idx = torch.empty((0, ), device='cuda', dtype=torch.int32)
|
||||
use_exllama = True
|
||||
bit = 4
|
||||
opcheck(torch.ops._C.gptq_gemm,
|
||||
(a, weight, zeros, scales, idx, use_exllama, bit))
|
||||
190
vllm-v0.6.2/tests/kernels/test_int8_quant.py
Normal file
190
vllm-v0.6.2/tests/kernels/test_int8_quant.py
Normal file
@@ -0,0 +1,190 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.quant_utils import ref_dynamic_per_token_quant
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm._custom_ops import scaled_int8_quant
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||
HIDDEN_SIZES = [16, 67, 768, 5137, 8193] # Arbitrary values for testing
|
||||
NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing
|
||||
SEEDS = [0]
|
||||
SCALE = [0.1, 2.1]
|
||||
|
||||
|
||||
def opcheck_int8_quant_static(output, input, scale, azp=None):
|
||||
if azp is None:
|
||||
opcheck(torch.ops._C.static_scaled_int8_quant,
|
||||
(output, input, scale, None))
|
||||
else:
|
||||
opcheck(torch.ops._C.static_scaled_int8_quant,
|
||||
(output, input, scale, azp))
|
||||
|
||||
|
||||
def opcheck_int8_quant_dynamic(output, input, symmetric=True):
|
||||
scale = torch.empty((input.numel() // input.shape[-1], 1),
|
||||
device=input.device,
|
||||
dtype=torch.float32)
|
||||
if symmetric:
|
||||
opcheck(torch.ops._C.dynamic_scaled_int8_quant,
|
||||
(output, input, scale, None))
|
||||
else:
|
||||
azp = torch.empty((input.numel() // input.shape[-1], 1),
|
||||
device=input.device,
|
||||
dtype=torch.int32)
|
||||
opcheck(torch.ops._C.dynamic_scaled_int8_quant,
|
||||
(output, input, scale, azp))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@torch.inference_mode()
|
||||
def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
|
||||
dtype: torch.dtype, seed: int) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
|
||||
|
||||
# reference
|
||||
ref_out, ref_scales = ref_dynamic_per_token_quant(x, torch.int8)
|
||||
# kernel
|
||||
ops_out, ops_scales, _ = scaled_int8_quant(x)
|
||||
|
||||
torch.testing.assert_close(ops_scales, ref_scales)
|
||||
# big atol to account for rounding errors
|
||||
torch.testing.assert_close(ops_out, ref_out, atol=1, rtol=0.0)
|
||||
|
||||
opcheck_int8_quant_dynamic(ops_out, x)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@torch.inference_mode()
|
||||
def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
|
||||
dtype: torch.dtype, seed: int) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
int8_traits = torch.iinfo(torch.int8)
|
||||
|
||||
x = torch.rand(num_tokens, hidden_size, dtype=dtype,
|
||||
device="cuda") * 1000 - 300
|
||||
|
||||
x_token_max, _ = x.to(dtype=torch.float32).max(dim=1, keepdim=True)
|
||||
x_token_min, _ = x.to(dtype=torch.float32).min(dim=1, keepdim=True)
|
||||
|
||||
# calculate scale and azp, and adjust the range
|
||||
scales = (x_token_max - x_token_min) / torch.tensor(255.0)
|
||||
azps = torch.round(torch.tensor(-128.0) - x_token_min / scales).to(
|
||||
torch.int32)
|
||||
|
||||
torch_out = ((x / scales).round() + azps).clamp(
|
||||
int8_traits.min, int8_traits.max).to(torch.int8)
|
||||
assert torch_out.min() >= int8_traits.min and torch_out.max(
|
||||
) <= int8_traits.max
|
||||
|
||||
ops_out, scales_out, azp_out = scaled_int8_quant(x, symmetric=False)
|
||||
|
||||
if (not torch.allclose(scales_out, scales)):
|
||||
print(torch.argmax(torch.abs(scales_out - scales)))
|
||||
torch.testing.assert_close(scales_out, scales)
|
||||
# big atol to account for rounding errors
|
||||
torch.testing.assert_close(azp_out, azps, atol=1, rtol=0.0)
|
||||
# if AZP is off by 1, after rounding-to-even, the output may be off by 2
|
||||
torch.testing.assert_close(ops_out, torch_out, atol=2, rtol=0.0)
|
||||
|
||||
opcheck_int8_quant_dynamic(ops_out, x, False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("scale", SCALE)
|
||||
@torch.inference_mode()
|
||||
def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
|
||||
dtype: torch.dtype, seed: int,
|
||||
scale: float) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
int8_traits = torch.iinfo(torch.int8)
|
||||
|
||||
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
|
||||
scale_arg = torch.tensor([scale], dtype=torch.float32, device="cuda")
|
||||
|
||||
out1 = (x / scale_arg).round().clamp(int8_traits.min,
|
||||
int8_traits.max).to(torch.int8)
|
||||
out2, scale2, _ = scaled_int8_quant(x, scale_arg)
|
||||
assert scale2 is scale_arg
|
||||
|
||||
# big atol to account for rounding errors
|
||||
torch.testing.assert_close(out1, out2, atol=1, rtol=0.0)
|
||||
|
||||
opcheck_int8_quant_static(out2, x, scale_arg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("scale", SCALE)
|
||||
@pytest.mark.parametrize("azp", [-255, 54])
|
||||
@torch.inference_mode()
|
||||
def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int,
|
||||
dtype: torch.dtype, seed: int,
|
||||
scale: float, azp: int) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
int8_traits = torch.iinfo(torch.int8)
|
||||
|
||||
x = torch.rand(num_tokens, hidden_size, dtype=dtype,
|
||||
device="cuda") * 1000 - 300
|
||||
|
||||
out1 = ((x / scale).round() + azp).clamp(int8_traits.min,
|
||||
int8_traits.max).to(torch.int8)
|
||||
scale_arg = torch.tensor([scale], dtype=torch.float32, device="cuda")
|
||||
azp_arg = torch.tensor([azp], dtype=torch.int32, device="cuda")
|
||||
|
||||
out2, scale2, azp2 = scaled_int8_quant(x,
|
||||
scale_arg,
|
||||
azp_arg,
|
||||
symmetric=False)
|
||||
assert scale2 is scale_arg
|
||||
assert azp2 is azp_arg
|
||||
|
||||
# big atol to account for rounding errors
|
||||
torch.testing.assert_close(out1, out2, atol=1, rtol=0.0)
|
||||
|
||||
opcheck_int8_quant_static(out2, x, scale_arg, azp_arg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_max", [True, False])
|
||||
@torch.inference_mode()
|
||||
def test_static_scaled_int8_azp_quant_saturating_cast(is_max: bool) -> None:
|
||||
# Test that the saturating cast works correctly for values near i32 max/min
|
||||
|
||||
from numpy import inf, nextafter
|
||||
|
||||
int32_traits = torch.iinfo(torch.int32)
|
||||
val = float(int32_traits.max if is_max else int32_traits.min)
|
||||
|
||||
x_vals = [[
|
||||
nextafter(val, inf), val + 1, val, val - 1,
|
||||
nextafter(val, -inf)
|
||||
]]
|
||||
x = torch.tensor(x_vals, dtype=torch.float32, device="cuda")
|
||||
|
||||
# The calculation in the kernel is: cast<int8>(cast<int32>(x / scale) + azp)
|
||||
# where cast<T> is a saturating cast to type T.
|
||||
# Scale is set to 1.0 so that the input values are the ones that are cast.
|
||||
# AZP is set to 0 to make sure the int8 saturating cast is tested as well.
|
||||
scale = torch.scalar_tensor(1.0, dtype=torch.float32, device="cuda")
|
||||
azp = torch.scalar_tensor(0, dtype=torch.int32, device="cuda")
|
||||
|
||||
int8_traits = torch.iinfo(torch.int8)
|
||||
val_i8 = int8_traits.max if is_max else int8_traits.min
|
||||
expected = torch.full((1, 5), val_i8, dtype=torch.int8, device="cuda")
|
||||
|
||||
out, _, _ = scaled_int8_quant(x, scale, azp, symmetric=False)
|
||||
torch.testing.assert_close(expected, out, atol=0, rtol=0)
|
||||
134
vllm-v0.6.2/tests/kernels/test_layernorm.py
Normal file
134
vllm-v0.6.2/tests/kernels/test_layernorm.py
Normal file
@@ -0,0 +1,134 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.quant_utils import FP8_DTYPE
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
DTYPES = [torch.half, torch.float]
|
||||
NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing
|
||||
HIDDEN_SIZES = [8, 768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192,
|
||||
8199] # Arbitrary values for testing
|
||||
ADD_RESIDUAL = [False, True]
|
||||
SEEDS = [0]
|
||||
CUDA_DEVICES = [
|
||||
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@torch.inference_mode()
|
||||
def test_rms_norm(
|
||||
num_tokens: int,
|
||||
hidden_size: int,
|
||||
add_residual: bool,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
layer = RMSNorm(hidden_size).to(dtype=dtype, device=device)
|
||||
layer.weight.data.normal_(mean=1.0, std=0.1)
|
||||
scale = 1 / (2 * hidden_size)
|
||||
x = torch.randn(num_tokens, hidden_size, dtype=dtype).to(device)
|
||||
x *= scale
|
||||
residual = torch.randn_like(x) * scale if add_residual else None
|
||||
|
||||
# NOTE(woosuk): The reference implementation should be executed first
|
||||
# because the custom kernel is in-place.
|
||||
ref_out = layer.forward_native(x, residual)
|
||||
out = layer(x, residual)
|
||||
# NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
|
||||
# numerical errors than other operators because they involve reductions.
|
||||
# Therefore, we use a larger tolerance.
|
||||
if add_residual:
|
||||
torch.testing.assert_close(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
|
||||
torch.testing.assert_close(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
|
||||
else:
|
||||
torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
|
||||
|
||||
if residual is not None:
|
||||
opcheck(torch.ops._C.fused_add_rms_norm,
|
||||
(x, residual, layer.weight.data, layer.variance_epsilon))
|
||||
else:
|
||||
opcheck(torch.ops._C.rms_norm,
|
||||
(out, x, layer.weight.data, layer.variance_epsilon))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("quant_scale", [1.0, 0.01, 10.0])
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
def test_fused_rms_norm_quant(
|
||||
num_tokens: int,
|
||||
hidden_size: int,
|
||||
add_residual: bool,
|
||||
dtype: torch.dtype,
|
||||
quant_scale: float,
|
||||
seed: int,
|
||||
device: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
|
||||
weight = torch.empty(hidden_size, dtype=dtype).normal_(mean=1.0, std=0.1)
|
||||
scale = 1 / (2 * hidden_size)
|
||||
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
x *= scale
|
||||
if add_residual:
|
||||
residual = torch.randn_like(x) * scale
|
||||
residual_fused = residual.clone()
|
||||
else:
|
||||
residual = residual_fused = None
|
||||
|
||||
out_norm = torch.empty_like(x)
|
||||
out_quant = torch.empty_like(x, dtype=FP8_DTYPE)
|
||||
out_quant_fused = torch.empty_like(out_quant)
|
||||
|
||||
quant_scale_t = torch.tensor(quant_scale, dtype=torch.float32)
|
||||
|
||||
if add_residual:
|
||||
torch.ops._C.fused_add_rms_norm_static_fp8_quant(
|
||||
out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6)
|
||||
|
||||
# Unfused kernel is in-place so it goes second
|
||||
# Also use a separate clone of x to avoid modifying the input
|
||||
x_unfused = x.clone()
|
||||
torch.ops._C.fused_add_rms_norm(x_unfused, residual, weight, 1e-6)
|
||||
torch.ops._C.static_scaled_fp8_quant(out_quant, x_unfused,
|
||||
quant_scale_t)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
torch.testing.assert_close(residual_fused,
|
||||
residual,
|
||||
atol=1e-2,
|
||||
rtol=1e-2)
|
||||
|
||||
opcheck(
|
||||
torch.ops._C.fused_add_rms_norm_static_fp8_quant,
|
||||
(out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6))
|
||||
else:
|
||||
torch.ops._C.rms_norm_static_fp8_quant(out_quant_fused, x, weight,
|
||||
quant_scale_t, 1e-6)
|
||||
|
||||
torch.ops._C.rms_norm(out_norm, x, weight, 1e-6)
|
||||
torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm,
|
||||
quant_scale_t)
|
||||
|
||||
opcheck(torch.ops._C.rms_norm_static_fp8_quant,
|
||||
(out_quant_fused, x, weight, quant_scale_t, 1e-6))
|
||||
|
||||
torch.testing.assert_close(out_quant_fused.to(dtype=torch.float32),
|
||||
out_quant.to(dtype=torch.float32),
|
||||
atol=1e-3,
|
||||
rtol=1e-3)
|
||||
284
vllm-v0.6.2/tests/kernels/test_machete_gemm.py
Normal file
284
vllm-v0.6.2/tests/kernels/test_machete_gemm.py
Normal file
@@ -0,0 +1,284 @@
|
||||
"""Tests for the machete kernel.
|
||||
|
||||
Run `pytest tests/kernels/test_machete_gemm.py`.
|
||||
"""
|
||||
|
||||
import math
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
pack_rows, quantize_weights)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.scalar_type import ScalarType, scalar_types
|
||||
|
||||
CUDA_DEVICES = [
|
||||
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||
]
|
||||
|
||||
MNK_SHAPES = [
|
||||
(1, 128, 128),
|
||||
(1, 512, 1024),
|
||||
(1, 4096, 4096),
|
||||
(1, 8192, 28672),
|
||||
(13, 8192, 4096),
|
||||
(26, 4096, 8192),
|
||||
(64, 4096, 4096),
|
||||
(64, 8192, 28672),
|
||||
(257, 128, 4096),
|
||||
(257, 4224, 4160),
|
||||
(257, 4096, 4096),
|
||||
(1024, 4096, 8192),
|
||||
(1024, 8192, 4096),
|
||||
]
|
||||
|
||||
ACT_TYPES = [torch.float16, torch.bfloat16]
|
||||
WTYPE_ZEROPOINTS = [
|
||||
# GPTQ style
|
||||
(scalar_types.uint4b8, False),
|
||||
(scalar_types.uint8b128, False),
|
||||
# AWQ style
|
||||
(scalar_types.uint4, True),
|
||||
(scalar_types.uint8, True),
|
||||
]
|
||||
|
||||
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
|
||||
# unit tests to a common utility function. Currently the use of
|
||||
# `is_quant_method_supported` conflates kernels with quantization methods
|
||||
# an assumption which is breaking down as quantizations methods can have
|
||||
# have kernels and some kernels support multiple quantization methods.
|
||||
IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90)
|
||||
|
||||
|
||||
def rand_data(shape, dtype=torch.float16):
|
||||
return 10 * (torch.rand(shape, dtype=dtype, device="cuda") - 0.3)
|
||||
|
||||
|
||||
def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor):
|
||||
return zps if zps is None else -1 * s * (zps.to(s.dtype))
|
||||
|
||||
|
||||
def machete_quantize_and_pack(w: torch.Tensor,
|
||||
wtype: ScalarType,
|
||||
group_size: int,
|
||||
zero_points: bool = False):
|
||||
assert wtype.is_integer(), "TODO: support floating point weights"
|
||||
|
||||
w_ref, w_q, w_s, w_zp = quantize_weights(
|
||||
w,
|
||||
wtype,
|
||||
group_size,
|
||||
zero_points=zero_points,
|
||||
# to match how the kernel applies zps
|
||||
ref_zero_points_after_scales=True)
|
||||
|
||||
w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
|
||||
w_q = w_q.t().contiguous().t() # convert to col major
|
||||
w_q_machete = ops.machete_prepack_B(w_q, wtype)
|
||||
|
||||
opcheck(torch.ops._C.machete_prepack_B, (w_q, wtype.id))
|
||||
|
||||
return w_ref, w_q_machete, w_s, w_zp
|
||||
|
||||
|
||||
def machete_gemm_test_helper(a: torch.Tensor, b: torch.Tensor,
|
||||
wtype: ScalarType, group_size: int,
|
||||
zero_points: bool):
|
||||
w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack(
|
||||
b, wtype, group_size, zero_points)
|
||||
|
||||
output_ref = torch.matmul(a, w_ref)
|
||||
|
||||
output = ops.machete_gemm(
|
||||
a=a,
|
||||
b_q=w_q_packed,
|
||||
b_type=wtype,
|
||||
b_scales=w_s,
|
||||
b_zeros=maybe_convert_zeropoints(w_zp, w_s),
|
||||
b_group_size=group_size,
|
||||
)
|
||||
|
||||
# Relax atol as our reduction dim becomes larger (more rounding error)
|
||||
# Relax atol when we have zeropoints since the way machete applies
|
||||
# zeropoints (after scales) causes noise around 0
|
||||
atol = 1 if zero_points else min(5e-2 * math.sqrt(a.shape[1]), 1)
|
||||
torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
|
||||
reason="Machete is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("shape",
|
||||
MNK_SHAPES,
|
||||
ids=lambda x: "x".join(str(v) for v in x))
|
||||
@pytest.mark.parametrize("atype", ACT_TYPES, ids=lambda x: str(x))
|
||||
@pytest.mark.parametrize("wtype_zeropoints", WTYPE_ZEROPOINTS)
|
||||
@pytest.mark.parametrize("group_size", [128, None])
|
||||
def test_machete_all_schedules(shape, atype: torch.dtype,
|
||||
wtype_zeropoints: Tuple[ScalarType, bool],
|
||||
group_size: Optional[int]):
|
||||
m, n, k = shape
|
||||
wtype, zero_points = wtype_zeropoints
|
||||
|
||||
if group_size is not None and k % group_size != 0:
|
||||
return
|
||||
|
||||
print(f"MNK = {m} {n} {k}")
|
||||
|
||||
# Normalize group_size
|
||||
if group_size is None:
|
||||
group_size = k
|
||||
assert group_size <= k
|
||||
|
||||
a = rand_data((m, k), atype)
|
||||
w = rand_data((k, n), atype)
|
||||
|
||||
w_ref, w_q_machete, w_s, w_zp = machete_quantize_and_pack(
|
||||
w, wtype, group_size, zero_points)
|
||||
|
||||
output_ref = torch.matmul(a, w_ref)
|
||||
|
||||
for schedule in ops.machete_supported_schedules(wtype):
|
||||
print(f"Testing schedule {schedule}")
|
||||
output = ops.machete_gemm(
|
||||
a,
|
||||
b_q=w_q_machete,
|
||||
b_type=wtype,
|
||||
b_scales=w_s,
|
||||
b_zeros=maybe_convert_zeropoints(w_zp, w_s),
|
||||
b_group_size=group_size,
|
||||
schedule=schedule,
|
||||
)
|
||||
|
||||
opcheck(
|
||||
torch.ops._C.machete_gemm,
|
||||
(a, w_q_machete, wtype.id, w_s, maybe_convert_zeropoints(
|
||||
w_zp, w_s), group_size, None, None, None, schedule))
|
||||
|
||||
# Relax atol as our reduction dim becomes larger (more rounding error)
|
||||
# Relax atol when we have zeropoints since the way machete applies
|
||||
# zeropoints (after scales) causes noise around 0
|
||||
atol = 1 if zero_points else min(5e-2 * math.sqrt(k), 1)
|
||||
torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol),\
|
||||
f"Schedule failed {schedule}"
|
||||
|
||||
|
||||
@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
|
||||
reason="Machete is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("shape",
|
||||
MNK_SHAPES,
|
||||
ids=lambda x: "x".join(str(v) for v in x))
|
||||
@pytest.mark.parametrize("atype", ACT_TYPES, ids=lambda x: str(x))
|
||||
@pytest.mark.parametrize("wtype_zeropoints", WTYPE_ZEROPOINTS)
|
||||
@pytest.mark.parametrize("group_size", [128, None])
|
||||
def test_machete_heuristic(shape, atype: torch.dtype,
|
||||
wtype_zeropoints: Tuple[ScalarType, bool],
|
||||
group_size: Optional[int]):
|
||||
m, n, k = shape
|
||||
wtype, zero_points = wtype_zeropoints
|
||||
|
||||
if group_size is not None and k % group_size != 0:
|
||||
return
|
||||
|
||||
# Normalize group_size
|
||||
if group_size is None:
|
||||
group_size = k
|
||||
assert group_size <= k
|
||||
|
||||
a = rand_data((m, k), atype)
|
||||
b = rand_data((k, n), atype)
|
||||
|
||||
machete_gemm_test_helper(a, b, wtype, group_size, zero_points)
|
||||
|
||||
|
||||
# Test working on other devices
|
||||
@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
|
||||
reason="Machete is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
def test_machete_devices(device: str):
|
||||
m, n, k = 512, 4096, 4096
|
||||
wtype = scalar_types.uint4b8
|
||||
group_size = 128
|
||||
zero_points = False
|
||||
|
||||
print(f"MNK = {m} {n} {k}, device = {device}")
|
||||
|
||||
a = rand_data((m, k), torch.float16).to(device)
|
||||
b = rand_data((k, n), torch.float16).to(device)
|
||||
|
||||
machete_gemm_test_helper(a, b, wtype, group_size, zero_points)
|
||||
|
||||
|
||||
# Test working with a subset of A and B
|
||||
@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
|
||||
reason="Machete is not supported on this GPU type.")
|
||||
def test_machete_subset():
|
||||
big_m, big_n, big_k = 1024, 1024, 1024
|
||||
m, n, k = 512, 512, 512
|
||||
wtype = scalar_types.uint4b8
|
||||
group_size = 128
|
||||
zero_points = False
|
||||
|
||||
whole_a = rand_data((big_m, big_k), torch.float16)
|
||||
whole_b = rand_data((big_k, big_n), torch.float16)
|
||||
|
||||
a = whole_a[0:m, 0:k]
|
||||
b = whole_b[0:k, 0:n]
|
||||
|
||||
machete_gemm_test_helper(a, b, wtype, group_size, zero_points)
|
||||
|
||||
|
||||
# Test to make sure cuda graphs work
|
||||
class MacheteLayer(torch.nn.Module):
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__()
|
||||
self.kwargs = kwargs
|
||||
|
||||
def forward(self, a):
|
||||
return ops.machete_gemm(**self.kwargs)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
|
||||
reason="Machete is not supported on this GPU type.")
|
||||
def test_machete_cuda_graph():
|
||||
m, n, k = 512, 4096, 4096
|
||||
|
||||
a = rand_data((m, k), torch.float16)
|
||||
b = rand_data((k, n), torch.float16)
|
||||
wtype = scalar_types.uint4b8
|
||||
group_size = 128
|
||||
zero_points = False
|
||||
|
||||
w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack(
|
||||
b, wtype, group_size, zero_points)
|
||||
|
||||
# Construct a trivial model with a single layer that calls a machete kernel
|
||||
model = MacheteLayer(
|
||||
a=a,
|
||||
b_q=w_q_packed,
|
||||
b_type=wtype,
|
||||
b_scales=w_s,
|
||||
b_zeros=maybe_convert_zeropoints(w_zp, w_s),
|
||||
b_group_size=group_size,
|
||||
)
|
||||
|
||||
output_ref = torch.matmul(a, w_ref)
|
||||
|
||||
# Run the model with a cuda graph
|
||||
stream = torch.cuda.Stream()
|
||||
with torch.cuda.stream(stream):
|
||||
g = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(g):
|
||||
output = model(a)
|
||||
output.zero_()
|
||||
g.replay()
|
||||
|
||||
# Relax atol as our reduction dim becomes larger (more rounding error)
|
||||
# Relax atol when we have zeropoints since the way machete applies
|
||||
# zeropoints (after scales) causes noise around 0
|
||||
atol = 1 if zero_points else min(5e-2 * math.sqrt(k), 1)
|
||||
torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol)
|
||||
720
vllm-v0.6.2/tests/kernels/test_mamba_ssm.py
Normal file
720
vllm-v0.6.2/tests/kernels/test_mamba_ssm.py
Normal file
@@ -0,0 +1,720 @@
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from einops import rearrange, repeat
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm import _custom_ops as ops # noqa: F401
|
||||
from vllm.attention.backends.utils import PAD_SLOT_ID
|
||||
from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
|
||||
selective_scan_fn, selective_state_update)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def selective_state_update_ref(state,
|
||||
x,
|
||||
dt,
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
D=None,
|
||||
z=None,
|
||||
dt_bias=None,
|
||||
dt_softplus=False):
|
||||
"""
|
||||
Argument:
|
||||
state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
|
||||
x: (batch, dim) or (batch, nheads, dim)
|
||||
dt: (batch, dim) or (batch, nheads, dim)
|
||||
A: (dim, dstate) or (nheads, dim, dstate)
|
||||
B: (batch, dstate) or (batch, ngroups, dstate)
|
||||
C: (batch, dstate) or (batch, ngroups, dstate)
|
||||
D: (dim,) or (nheads, dim)
|
||||
z: (batch, dim) or (batch, nheads, dim)
|
||||
dt_bias: (dim,) or (nheads, dim)
|
||||
Return:
|
||||
out: (batch, dim) or (batch, nheads, dim)
|
||||
"""
|
||||
has_heads = state.dim() > 3
|
||||
if state.dim() == 3:
|
||||
state = state.unsqueeze(1)
|
||||
if x.dim() == 2:
|
||||
x = x.unsqueeze(1)
|
||||
if dt.dim() == 2:
|
||||
dt = dt.unsqueeze(1)
|
||||
if A.dim() == 2:
|
||||
A = A.unsqueeze(0)
|
||||
if B.dim() == 2:
|
||||
B = B.unsqueeze(1)
|
||||
if C.dim() == 2:
|
||||
C = C.unsqueeze(1)
|
||||
if D is not None and D.dim() == 1:
|
||||
D = D.unsqueeze(0)
|
||||
if z is not None and z.dim() == 2:
|
||||
z = z.unsqueeze(1)
|
||||
if dt_bias is not None and dt_bias.dim() == 1:
|
||||
dt_bias = dt_bias.unsqueeze(0)
|
||||
batch, nheads, dim, dstate = state.shape
|
||||
assert x.shape == (batch, nheads, dim)
|
||||
assert dt.shape == x.shape
|
||||
assert A.shape == (nheads, dim, dstate)
|
||||
ngroups = B.shape[1]
|
||||
assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
|
||||
assert B.shape == (batch, ngroups, dstate)
|
||||
assert C.shape == B.shape
|
||||
if D is not None:
|
||||
assert D.shape == (nheads, dim)
|
||||
if z is not None:
|
||||
assert z.shape == x.shape
|
||||
if dt_bias is not None:
|
||||
assert dt_bias.shape == (nheads, dim)
|
||||
dt = dt + dt_bias
|
||||
dt = F.softplus(dt) if dt_softplus else dt
|
||||
dA = torch.exp(rearrange(dt, "b h d -> b h d 1") *
|
||||
A) # (batch, nheads, dim, dstate)
|
||||
B = repeat(B, "b g n -> b (g h) n",
|
||||
h=nheads // ngroups) # (batch, nheads, dstate)
|
||||
C = repeat(C, "b g n -> b (g h) n",
|
||||
h=nheads // ngroups) # (batch, nheads, dstate)
|
||||
dB = rearrange(dt, "b h d -> b h d 1") * rearrange(
|
||||
B, "b h n -> b h 1 n") # (batch, nheads, dim, dstate)
|
||||
state.copy_(state * dA +
|
||||
dB * rearrange(x, "b h d -> b h d 1")) # (batch, dim, dstate
|
||||
out = torch.einsum("bhdn,bhn->bhd", state.to(C.dtype), C)
|
||||
if D is not None:
|
||||
out += (x * D).to(out.dtype)
|
||||
out = (out if z is None else out * F.silu(z)).to(x.dtype)
|
||||
if not has_heads:
|
||||
out = out.squeeze(1)
|
||||
return out
|
||||
|
||||
|
||||
def selective_scan_ref(u,
|
||||
delta,
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
D=None,
|
||||
z=None,
|
||||
delta_bias=None,
|
||||
delta_softplus=False,
|
||||
return_last_state=False,
|
||||
prev_state=None,
|
||||
final_state_out=None):
|
||||
"""
|
||||
u: r(B D L)
|
||||
delta: r(B D L)
|
||||
A: c(D N) or r(D N)
|
||||
B: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
|
||||
C: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
|
||||
D: r(D)
|
||||
z: r(B D L)
|
||||
delta_bias: r(D), fp32
|
||||
prev_state: r(B D N), fp32
|
||||
|
||||
out: r(B D L)
|
||||
last_state (optional): r(B D dstate) or c(B D dstate)
|
||||
"""
|
||||
dtype_in = u.dtype
|
||||
u = u.float()
|
||||
delta = delta.float()
|
||||
if delta_bias is not None:
|
||||
delta = delta + delta_bias[..., None].float()
|
||||
if delta_softplus:
|
||||
delta = F.softplus(delta)
|
||||
batch, dim, dstate = u.shape[0], A.shape[0], A.shape[1]
|
||||
is_variable_B = B.dim() >= 3
|
||||
is_variable_C = C.dim() >= 3
|
||||
B = B.float()
|
||||
C = C.float()
|
||||
x = A.new_zeros((batch, dim, dstate)) if prev_state is None else prev_state
|
||||
ys = []
|
||||
deltaA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A))
|
||||
if not is_variable_B:
|
||||
deltaB_u = torch.einsum('bdl,dn,bdl->bdln', delta, B, u)
|
||||
else:
|
||||
if B.dim() == 3:
|
||||
deltaB_u = torch.einsum('bdl,bnl,bdl->bdln', delta, B, u)
|
||||
else:
|
||||
B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1])
|
||||
deltaB_u = torch.einsum('bdl,bdnl,bdl->bdln', delta, B, u)
|
||||
if is_variable_C and C.dim() == 4:
|
||||
C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1])
|
||||
for i in range(u.shape[2]):
|
||||
x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
|
||||
if not is_variable_C:
|
||||
y = torch.einsum('bdn,dn->bd', x, C)
|
||||
else:
|
||||
if C.dim() == 3:
|
||||
y = torch.einsum('bdn,bn->bd', x, C[:, :, i])
|
||||
else:
|
||||
y = torch.einsum('bdn,bdn->bd', x, C[:, :, :, i])
|
||||
if i == u.shape[2] - 1:
|
||||
if final_state_out is None:
|
||||
final_state_out = x
|
||||
else:
|
||||
final_state_out.copy_(x)
|
||||
ys.append(y)
|
||||
y = torch.stack(ys, dim=2) # (batch dim L)
|
||||
out = y if D is None else y + u * rearrange(D, "d -> d 1")
|
||||
if z is not None:
|
||||
out = out * F.silu(z)
|
||||
out = out.to(dtype=dtype_in)
|
||||
return out if not return_last_state else (out, final_state_out)
|
||||
|
||||
|
||||
def selective_scan_opcheck_fn(u,
|
||||
delta,
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
D=None,
|
||||
z=None,
|
||||
delta_bias=None,
|
||||
delta_softplus=False,
|
||||
cu_seq_len=None,
|
||||
cache_indices=None,
|
||||
has_initial_state=None,
|
||||
ssm_states=None,
|
||||
pad_slot_id=PAD_SLOT_ID):
|
||||
"""if return_last_state is True, returns (out, last_state)
|
||||
last_state has shape (batch, dim, dstate).
|
||||
"""
|
||||
if u.stride(-1) != 1:
|
||||
u = u.contiguous()
|
||||
if delta.stride(-1) != 1:
|
||||
delta = delta.contiguous()
|
||||
if D is not None:
|
||||
D = D.contiguous()
|
||||
if B.stride(-1) != 1:
|
||||
B = B.contiguous()
|
||||
if C.stride(-1) != 1:
|
||||
C = C.contiguous()
|
||||
if z is not None and z.stride(-1) != 1:
|
||||
z = z.contiguous()
|
||||
if B.dim() == 3 and cu_seq_len is None:
|
||||
B = B.unsqueeze(1)
|
||||
if B.dim() == 2 and cu_seq_len is not None:
|
||||
B = B.unsqueeze(0)
|
||||
if C.dim() == 3 and cu_seq_len is None:
|
||||
C = C.unsqueeze(1)
|
||||
if C.dim() == 2 and cu_seq_len is not None:
|
||||
C = C.unsqueeze(0)
|
||||
|
||||
# Disable test_autograd_registration for now as it seems to trigger
|
||||
# a bogus error.
|
||||
opcheck(torch.ops._C.selective_scan_fwd,
|
||||
(u, delta, A, B, C, D, z, delta_bias, delta_softplus, cu_seq_len,
|
||||
cache_indices, has_initial_state, ssm_states, pad_slot_id),
|
||||
test_utils=["test_schema", "test_faketensor"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize('wtype', [torch.float32])
|
||||
@pytest.mark.parametrize('itype',
|
||||
[torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize('seqlen', [128, 256, 512, 1024, 2048, 4096])
|
||||
@pytest.mark.parametrize('has_delta_bias', [True])
|
||||
@pytest.mark.parametrize('delta_softplus', [True])
|
||||
@pytest.mark.parametrize('has_z', [True])
|
||||
@pytest.mark.parametrize('has_D', [True])
|
||||
@pytest.mark.parametrize("varBC_groups", [1, 2])
|
||||
@pytest.mark.parametrize("is_variable_C", [True])
|
||||
@pytest.mark.parametrize("is_variable_B", [True])
|
||||
@pytest.mark.parametrize("scan_chunks", [1, 2, 3])
|
||||
def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D,
|
||||
has_z, has_delta_bias, delta_softplus, seqlen, itype,
|
||||
wtype, scan_chunks):
|
||||
if varBC_groups > 1 and (not is_variable_B or not is_variable_C):
|
||||
pytest.skip() # This config is not applicable
|
||||
device = 'cuda'
|
||||
rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3)
|
||||
if itype == torch.bfloat16:
|
||||
rtol, atol = 3e-2, 5e-2
|
||||
rtolw, atolw = (1e-3, 1e-3)
|
||||
if has_z: # If we have z, the errors on the weights seem higher
|
||||
rtolw = max(rtolw, rtol)
|
||||
atolw = max(atolw, atol)
|
||||
# set seed
|
||||
current_platform.seed_everything(0)
|
||||
batch_size = 1
|
||||
dim = 4
|
||||
dstate = 8
|
||||
A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype))
|
||||
A_ref = A.clone()
|
||||
if not is_variable_B:
|
||||
B_shape = [dim, dstate]
|
||||
elif varBC_groups == 1:
|
||||
B_shape = [batch_size, dstate, seqlen]
|
||||
else:
|
||||
B_shape = [batch_size, varBC_groups, dstate, seqlen]
|
||||
B = torch.randn(B_shape,
|
||||
device=device,
|
||||
dtype=wtype if not is_variable_B else itype)
|
||||
B_ref = B.clone()
|
||||
if not is_variable_C:
|
||||
C_shape = [dim, dstate]
|
||||
elif varBC_groups == 1:
|
||||
C_shape = [batch_size, dstate, seqlen]
|
||||
else:
|
||||
C_shape = [batch_size, varBC_groups, dstate, seqlen]
|
||||
C = torch.randn(C_shape,
|
||||
device=device,
|
||||
dtype=wtype if not is_variable_C else itype)
|
||||
C_ref = C.clone()
|
||||
D = torch.randn(dim, device=device, dtype=torch.float32) if has_D else None
|
||||
D_ref = D.clone()
|
||||
z = torch.randn(batch_size, dim, seqlen, device=device,
|
||||
dtype=itype) if has_z else None
|
||||
z_ref = z.clone() if has_z else None
|
||||
delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)
|
||||
) if has_delta_bias else None
|
||||
u = torch.randn(batch_size, dim, seqlen, device=device, dtype=itype)
|
||||
u_ref = u.clone()
|
||||
delta = (0.5 *
|
||||
torch.rand(batch_size, dim, seqlen, device=device, dtype=itype))
|
||||
delta_ref = delta.clone()
|
||||
state_shape = (batch_size, u.shape[1], int(A.shape[1]))
|
||||
state = torch.randn(state_shape,
|
||||
device=u.device,
|
||||
dtype=itype,
|
||||
requires_grad=False)
|
||||
state_ref = state.clone()
|
||||
out = None
|
||||
out_ref = None
|
||||
outs = []
|
||||
for c in range(scan_chunks):
|
||||
chunked_prompt_len = seqlen // scan_chunks
|
||||
chunk_start = chunked_prompt_len * c
|
||||
chunk_end = chunked_prompt_len * (c + 1)
|
||||
if c == scan_chunks - 1:
|
||||
chunk_end = seqlen
|
||||
_B = B
|
||||
if is_variable_B:
|
||||
_B = B[..., chunk_start:chunk_end]
|
||||
_C = C
|
||||
if is_variable_B:
|
||||
_C = C[..., chunk_start:chunk_end]
|
||||
_z = z
|
||||
if has_z:
|
||||
assert z is not None
|
||||
_z = z[..., chunk_start:chunk_end]
|
||||
out = selective_scan_fn(
|
||||
u[..., chunk_start:chunk_end],
|
||||
state,
|
||||
delta[..., chunk_start:chunk_end],
|
||||
A,
|
||||
_B,
|
||||
_C,
|
||||
D,
|
||||
z=_z,
|
||||
delta_bias=delta_bias,
|
||||
delta_softplus=delta_softplus,
|
||||
has_initial_state=torch.ones(batch_size,
|
||||
device=u.device,
|
||||
dtype=torch.bool) if c > 0 else None)
|
||||
outs.append(out)
|
||||
if len(outs) > 1:
|
||||
out = torch.cat(outs, dim=-1)
|
||||
|
||||
out_ref, state_ref, *rest = selective_scan_ref(
|
||||
u_ref,
|
||||
delta_ref,
|
||||
A_ref,
|
||||
B_ref,
|
||||
C_ref,
|
||||
D_ref,
|
||||
z=z_ref,
|
||||
delta_bias=delta_bias,
|
||||
delta_softplus=delta_softplus,
|
||||
return_last_state=True)
|
||||
|
||||
assert out is not None and out_ref is not None
|
||||
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
|
||||
assert state is not None and state_ref is not None
|
||||
assert torch.allclose(state, state_ref.to(itype), rtol=rtol, atol=atol)
|
||||
|
||||
selective_scan_opcheck_fn(u,
|
||||
delta,
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
D,
|
||||
z,
|
||||
delta_bias=delta_bias,
|
||||
delta_softplus=delta_softplus,
|
||||
ssm_states=state)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype",
|
||||
[torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("has_z", [False, True])
|
||||
@pytest.mark.parametrize("dstate", [16, 32, 64])
|
||||
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
|
||||
def test_selective_state_update(dim, dstate, has_z, itype):
|
||||
device = "cuda"
|
||||
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2)
|
||||
if itype == torch.bfloat16:
|
||||
rtol, atol = 1e-2, 5e-2
|
||||
if torch.version.hip:
|
||||
atol *= 2
|
||||
# set seed
|
||||
current_platform.seed_everything(0)
|
||||
batch_size = 1
|
||||
state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device)
|
||||
x = torch.randn(batch_size, dim, device=device, dtype=itype)
|
||||
dt = torch.randn(batch_size, dim, device=device, dtype=itype)
|
||||
dt_bias = torch.rand(dim, device=device) - 4.0
|
||||
A = -torch.rand(dim, dstate, device=device) - 1.0
|
||||
B = torch.randn(batch_size, dstate, device=device)
|
||||
C = torch.randn(batch_size, dstate, device=device)
|
||||
D = torch.randn(dim, device=device)
|
||||
z = torch.randn_like(x) if has_z else None
|
||||
state_ref = state.detach().clone()
|
||||
out = selective_state_update(state,
|
||||
x,
|
||||
dt,
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
D=D,
|
||||
z=z,
|
||||
dt_bias=dt_bias,
|
||||
dt_softplus=True)
|
||||
out_ref = selective_state_update_ref(state_ref,
|
||||
x,
|
||||
dt,
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
D=D,
|
||||
z=z,
|
||||
dt_bias=dt_bias,
|
||||
dt_softplus=True)
|
||||
|
||||
assert torch.allclose(state, state_ref, rtol=rtol, atol=atol)
|
||||
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('wtype', [torch.float32])
|
||||
@pytest.mark.parametrize('itype', [torch.float32])
|
||||
@pytest.mark.parametrize('seqlen', [1, 128, 129, 256, 512, 1024, 2048, 4096])
|
||||
@pytest.mark.parametrize("return_last_state", [True])
|
||||
@pytest.mark.parametrize('has_delta_bias', [True])
|
||||
@pytest.mark.parametrize('delta_softplus', [True])
|
||||
@pytest.mark.parametrize('has_z', [True])
|
||||
@pytest.mark.parametrize('has_D', [True])
|
||||
@pytest.mark.parametrize("varBC_groups", [1, 2])
|
||||
@pytest.mark.parametrize("is_variable_C", [True])
|
||||
@pytest.mark.parametrize("is_variable_B", [True])
|
||||
# tests correctness in case subset of the sequences are padded
|
||||
@pytest.mark.parametrize("with_padding", [False, True])
|
||||
def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
|
||||
varBC_groups, has_D, has_z, has_delta_bias,
|
||||
delta_softplus, return_last_state, seqlen,
|
||||
itype, wtype):
|
||||
if varBC_groups > 1 and (not is_variable_B or not is_variable_C):
|
||||
pytest.skip() # This config is not applicable
|
||||
device = 'cuda'
|
||||
rtol, atol = (6e-4, 2e-3) if itype == torch.float32 else (3e-3, 5e-3)
|
||||
if itype == torch.bfloat16:
|
||||
rtol, atol = 3e-2, 5e-2
|
||||
rtolw, atolw = (1e-3, 1e-3)
|
||||
if has_z: # If we have z, the errors on the weights seem higher
|
||||
rtolw = max(rtolw, rtol)
|
||||
atolw = max(atolw, atol)
|
||||
# set seed
|
||||
torch.random.manual_seed(0)
|
||||
seqlens = []
|
||||
batch_size = 4
|
||||
if seqlen < 10:
|
||||
batch_size = 1
|
||||
padding = 3 if with_padding else 0
|
||||
padded_batch_size = batch_size + padding
|
||||
|
||||
if with_padding and seqlen < padded_batch_size:
|
||||
pytest.skip()
|
||||
|
||||
nsplits = padded_batch_size - 1
|
||||
eos_pos = torch.randperm(seqlen - 1)[:nsplits].sort().values
|
||||
seqlens.append(
|
||||
torch.diff(
|
||||
torch.cat(
|
||||
[torch.tensor([-1]), eos_pos,
|
||||
torch.tensor([seqlen - 1])])).tolist())
|
||||
|
||||
assert sum(seqlens[-1]) == seqlen
|
||||
assert all(s > 0 for s in seqlens[-1])
|
||||
|
||||
total_entries = batch_size * 10
|
||||
cumsum = torch.cumsum(torch.tensor(seqlens[0]), dim=0).to(torch.int32)
|
||||
cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum],
|
||||
dim=0).cuda()
|
||||
|
||||
dim = 4
|
||||
dstate = 8
|
||||
A = (-0.5 * torch.rand(dim, dstate, device=device, dtype=wtype))
|
||||
A_ref = A.clone()
|
||||
B_shape = [varBC_groups, dstate, seqlen]
|
||||
B = torch.randn(B_shape,
|
||||
device=device,
|
||||
dtype=wtype if not is_variable_B else itype)
|
||||
B_ref = B.clone()
|
||||
C_shape = [varBC_groups, dstate, seqlen]
|
||||
C = torch.randn(C_shape,
|
||||
device=device,
|
||||
dtype=wtype if not is_variable_C else itype)
|
||||
C_ref = C.clone()
|
||||
D = torch.randn(dim, device=device, dtype=torch.float32) if has_D else None
|
||||
D_ref = D.clone()
|
||||
z = torch.randn(dim, seqlen, device=device, dtype=itype)
|
||||
z_ref = z.clone()
|
||||
delta_bias = (0.5 * torch.rand(dim, device=device, dtype=torch.float32)
|
||||
) if has_delta_bias else None
|
||||
u = torch.randn(dim, seqlen, device=device, dtype=itype)
|
||||
u_ref = u.clone()
|
||||
delta = (0.5 * torch.rand(dim, seqlen, device=device, dtype=itype))
|
||||
delta_ref = delta.clone()
|
||||
out = None
|
||||
out_ref = None
|
||||
|
||||
prev_state_shape = (total_entries, u.shape[0], int(A.shape[1]))
|
||||
prev_state = torch.randn(prev_state_shape,
|
||||
device=u.device,
|
||||
dtype=itype,
|
||||
requires_grad=False)
|
||||
prev_state_ref = prev_state.clone()
|
||||
state_indices = torch.randperm(total_entries,
|
||||
dtype=torch.int32,
|
||||
device=u.device)[:batch_size]
|
||||
unused_states_bool = torch.ones(total_entries,
|
||||
dtype=torch.bool,
|
||||
device=device)
|
||||
unused_states_bool[state_indices] = False
|
||||
padded_state_indices = torch.concat([
|
||||
state_indices,
|
||||
torch.as_tensor(
|
||||
[PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
|
||||
],
|
||||
dim=-1)
|
||||
|
||||
has_initial_state = torch.randint(0,
|
||||
2, (cumsum.shape[0] - 1, ),
|
||||
dtype=torch.bool,
|
||||
device=u.device)
|
||||
out = selective_scan_fn(u, prev_state, delta, A, B, C, D, z, delta_bias,
|
||||
delta_softplus, cumsum, padded_state_indices,
|
||||
has_initial_state)
|
||||
outs_ref = []
|
||||
splits = [
|
||||
torch.split(var, seqlens[0], dim=-1)
|
||||
for var in (u_ref, delta_ref, B_ref, C_ref, z_ref)
|
||||
]
|
||||
for i in range(len(seqlens[0])):
|
||||
u_s, delta_s, B_s, C_s, z_s = (v[i].unsqueeze(0) for v in splits)
|
||||
if padded_state_indices[i] == PAD_SLOT_ID:
|
||||
continue
|
||||
out_ref_s, _ = selective_scan_ref(
|
||||
u_s,
|
||||
delta_s,
|
||||
A_ref,
|
||||
B_s,
|
||||
C_s,
|
||||
D_ref,
|
||||
z=z_s,
|
||||
delta_bias=delta_bias,
|
||||
delta_softplus=delta_softplus,
|
||||
return_last_state=return_last_state,
|
||||
prev_state=prev_state_ref[padded_state_indices[i]].unsqueeze(0)
|
||||
if has_initial_state[i] else None,
|
||||
final_state_out=prev_state_ref[padded_state_indices[i]].unsqueeze(
|
||||
0))
|
||||
outs_ref.append(out_ref_s)
|
||||
out_ref = torch.cat(outs_ref, dim=-1)[0]
|
||||
|
||||
unpadded_out = out[:, :out_ref[0].shape[-1]]
|
||||
print("Output diff max", (unpadded_out - out_ref).max())
|
||||
print("Output diff mean", (unpadded_out - out_ref).mean())
|
||||
print("Output state diff max", (prev_state - prev_state_ref).max())
|
||||
print("Output state diff mean", (prev_state - prev_state_ref).mean())
|
||||
assert torch.allclose(prev_state, prev_state_ref, rtol=rtol, atol=atol)
|
||||
assert torch.allclose(unpadded_out, out_ref, rtol=rtol, atol=atol)
|
||||
selective_scan_opcheck_fn(u, delta, A, B, C, D, z, delta_bias,
|
||||
delta_softplus, cumsum, padded_state_indices,
|
||||
has_initial_state, prev_state)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype",
|
||||
[torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("has_z", [True])
|
||||
@pytest.mark.parametrize("dstate", [16, 32, 64])
|
||||
@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
|
||||
# tests correctness in case subset of the sequences are padded
|
||||
@pytest.mark.parametrize("with_padding", [True, False])
|
||||
def test_selective_state_update_with_batch_indices(with_padding, dim, dstate,
|
||||
has_z, itype):
|
||||
device = "cuda"
|
||||
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2)
|
||||
if itype == torch.bfloat16:
|
||||
rtol, atol = 1e-1, 1e-1
|
||||
if torch.version.hip:
|
||||
atol *= 2
|
||||
# set seed
|
||||
torch.random.manual_seed(0)
|
||||
batch_size = 3
|
||||
padding = 5 if with_padding else 0
|
||||
padded_batch_size = batch_size + padding
|
||||
total_entries = 10 * batch_size
|
||||
state = torch.randn(total_entries, dim, dstate, dtype=itype, device=device)
|
||||
state_indices = torch.randperm(total_entries)[:batch_size].to(
|
||||
dtype=torch.int32, device=device)
|
||||
unused_states_bool = torch.ones(total_entries,
|
||||
dtype=torch.bool,
|
||||
device=device)
|
||||
unused_states_bool[state_indices] = False
|
||||
padded_state_indices = torch.concat([
|
||||
state_indices,
|
||||
torch.as_tensor(
|
||||
[PAD_SLOT_ID] * padding, dtype=torch.int32, device=device)
|
||||
],
|
||||
dim=0)
|
||||
x = torch.randn(padded_batch_size, dim, device=device, dtype=itype)
|
||||
dt = torch.randn(padded_batch_size, dim, device=device, dtype=itype)
|
||||
dt_bias = torch.rand(dim, device=device) - 4.0
|
||||
A = -torch.rand(dim, dstate, device=device) - 1.0
|
||||
B = torch.randn(padded_batch_size, dstate, device=device)
|
||||
C = torch.randn(padded_batch_size, dstate, device=device)
|
||||
D = torch.randn(dim, device=device)
|
||||
z = torch.randn_like(x) if has_z else None
|
||||
state_ref = state[state_indices, :].clone()
|
||||
state_before = state.clone()
|
||||
out = selective_state_update(state,
|
||||
x,
|
||||
dt,
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
D=D,
|
||||
z=z,
|
||||
dt_bias=dt_bias,
|
||||
dt_softplus=True,
|
||||
state_batch_indices=padded_state_indices,
|
||||
pad_slot_id=PAD_SLOT_ID)
|
||||
out_ref = selective_state_update_ref(state_ref,
|
||||
x[:batch_size],
|
||||
dt[:batch_size],
|
||||
A,
|
||||
B[:batch_size],
|
||||
C[:batch_size],
|
||||
D=D,
|
||||
z=z[:batch_size],
|
||||
dt_bias=dt_bias,
|
||||
dt_softplus=True)
|
||||
|
||||
print("Output diff max", (out[:batch_size] - out_ref).max())
|
||||
print("Output diff mean", (out[:batch_size] - out_ref).mean())
|
||||
print("Output state diff max", (state[state_indices, :] - state_ref).max())
|
||||
print("Output state diff mean",
|
||||
(state[state_indices, :] - state_ref).mean())
|
||||
# test padded entries stay the same
|
||||
if with_padding:
|
||||
assert torch.equal(state_before[unused_states_bool],
|
||||
state[unused_states_bool])
|
||||
assert torch.equal(x[batch_size + 1:], x[batch_size + 1:])
|
||||
assert torch.equal(dt[batch_size + 1:], dt[batch_size + 1:])
|
||||
assert torch.equal(B[batch_size + 1:], B[batch_size + 1:])
|
||||
assert torch.equal(C[batch_size + 1:], C[batch_size + 1:])
|
||||
|
||||
# test "real" entries
|
||||
assert torch.allclose(state[state_indices, :],
|
||||
state_ref,
|
||||
rtol=rtol,
|
||||
atol=atol)
|
||||
assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("itype",
|
||||
[torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("has_z", [False, True])
|
||||
@pytest.mark.parametrize("tie_hdim", [False, True])
|
||||
@pytest.mark.parametrize("ngroups", [1, 2, 4])
|
||||
@pytest.mark.parametrize("dstate", [16, 32, 64])
|
||||
@pytest.mark.parametrize("dim", [2048, 4096])
|
||||
def test_selective_state_update_with_heads_with_batch_indices(
|
||||
dim, dstate, ngroups, has_z, tie_hdim, itype):
|
||||
device = "cuda"
|
||||
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 3e-2)
|
||||
if itype == torch.bfloat16:
|
||||
rtol, atol = 1e-1, 1e-1
|
||||
# set seed
|
||||
torch.random.manual_seed(0)
|
||||
batch_size = 3
|
||||
headdim = 64
|
||||
nheads = dim // headdim
|
||||
|
||||
total_entries = 10 * batch_size
|
||||
state = torch.randn(total_entries,
|
||||
nheads,
|
||||
headdim,
|
||||
dstate,
|
||||
dtype=itype,
|
||||
device=device)
|
||||
state_indices = torch.randperm(total_entries)[:batch_size].to(
|
||||
dtype=torch.int32, device=device)
|
||||
|
||||
x = torch.randn(batch_size, nheads, headdim, device=device, dtype=itype)
|
||||
if not tie_hdim:
|
||||
dt = torch.randn(batch_size,
|
||||
nheads,
|
||||
headdim,
|
||||
device=device,
|
||||
dtype=itype)
|
||||
dt_bias = torch.rand(nheads, headdim, device=device) - 4.0
|
||||
A = -torch.rand(nheads, headdim, dstate, device=device) - 1.0
|
||||
D = torch.randn(nheads, headdim, device=device)
|
||||
else:
|
||||
dt = repeat(torch.randn(batch_size, nheads, device=device,
|
||||
dtype=itype),
|
||||
"b h -> b h p",
|
||||
p=headdim)
|
||||
dt_bias = repeat(torch.rand(nheads, device=device) - 4.0,
|
||||
"h -> h p",
|
||||
p=headdim)
|
||||
A = repeat(-torch.rand(nheads, device=device) - 1.0,
|
||||
"h -> h p n",
|
||||
p=headdim,
|
||||
n=dstate)
|
||||
D = repeat(torch.randn(nheads, device=device), "h -> h p", p=headdim)
|
||||
B = torch.randn(batch_size, ngroups, dstate, device=device)
|
||||
C = torch.randn(batch_size, ngroups, dstate, device=device)
|
||||
z = torch.randn_like(x) if has_z else None
|
||||
state_ref = state[state_indices, :].detach().clone()
|
||||
out = selective_state_update(state,
|
||||
x,
|
||||
dt,
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
D=D,
|
||||
z=z,
|
||||
dt_bias=dt_bias,
|
||||
dt_softplus=True,
|
||||
state_batch_indices=state_indices,
|
||||
pad_slot_id=PAD_SLOT_ID)
|
||||
out_ref = selective_state_update_ref(state_ref,
|
||||
x,
|
||||
dt,
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
D=D,
|
||||
z=z,
|
||||
dt_bias=dt_bias,
|
||||
dt_softplus=True)
|
||||
|
||||
print(f"Output max diff: {(out - out_ref).abs().max().item()}")
|
||||
print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
|
||||
assert torch.allclose(state[state_indices, :],
|
||||
state_ref,
|
||||
rtol=rtol,
|
||||
atol=atol)
|
||||
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
|
||||
528
vllm-v0.6.2/tests/kernels/test_marlin_gemm.py
Normal file
528
vllm-v0.6.2/tests/kernels/test_marlin_gemm.py
Normal file
@@ -0,0 +1,528 @@
|
||||
"""Tests for the marlin kernel.
|
||||
|
||||
Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
|
||||
"""
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
|
||||
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
|
||||
from vllm.model_executor.layers.quantization.qqq import (
|
||||
MARLIN_QQQ_MAX_PARALLEL, MARLIN_QQQ_MIN_THREAD_N,
|
||||
MARLIN_QQQ_SUPPORTED_GROUP_SIZES, MARLIN_QQQ_SUPPORTED_NUM_BITS)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
||||
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
|
||||
MARLIN_SUPPORTED_GROUP_SIZES, marlin_make_empty_g_idx,
|
||||
marlin_permute_scales, query_marlin_supported_quant_types)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
|
||||
pack_fp8_to_int32)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
|
||||
MarlinWorkspace, awq_marlin_quantize, get_weight_perm, marlin_quantize,
|
||||
marlin_weights)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
|
||||
marlin_24_quantize)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_qqq import ( # noqa: E501
|
||||
marlin_qqq_quantize)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
awq_pack, gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights)
|
||||
|
||||
ACT_ORDER_OPTS = [False, True]
|
||||
K_FULL_OPTS = [False, True]
|
||||
USE_FP32_REDUCE_OPTS = [False, True]
|
||||
|
||||
MARLIN_K_CHUNKS = [128]
|
||||
MARLIN_N_CHUNKS = [64, 256]
|
||||
|
||||
MARLIN_24_K_CHUNKS = [128]
|
||||
MARLIN_24_N_CHUNKS = [512]
|
||||
|
||||
MNK_FACTORS = [
|
||||
(1, 1, 1),
|
||||
(1, 4, 8),
|
||||
(1, 7, 5),
|
||||
(13, 17, 67),
|
||||
(26, 37, 13),
|
||||
(67, 13, 11),
|
||||
]
|
||||
|
||||
DTYPES = [torch.float16, torch.bfloat16]
|
||||
|
||||
|
||||
def compute_max_diff(output, output_ref):
|
||||
return torch.mean(torch.abs(output - output_ref)) / torch.mean(
|
||||
torch.abs(output_ref))
|
||||
|
||||
|
||||
def rand_data(shape, dtype=torch.float16):
|
||||
return torch.randn(shape, dtype=dtype, device="cuda")
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
|
||||
reason="Marlin is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
|
||||
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
|
||||
@pytest.mark.parametrize("quant_type",
|
||||
query_marlin_supported_quant_types(False))
|
||||
@pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES)
|
||||
@pytest.mark.parametrize("act_order", ACT_ORDER_OPTS)
|
||||
@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
|
||||
def test_gptq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
|
||||
act_order, mnk_factors):
|
||||
m_factor, n_factor, k_factor = mnk_factors
|
||||
|
||||
size_k = k_chunk * k_factor
|
||||
size_n = n_chunk * n_factor
|
||||
|
||||
# Filter act_order
|
||||
if act_order:
|
||||
if group_size == -1:
|
||||
return
|
||||
if group_size == size_k:
|
||||
return
|
||||
|
||||
# Normalize group_size
|
||||
if group_size == -1:
|
||||
group_size = size_k
|
||||
assert group_size <= size_k
|
||||
|
||||
# Create input
|
||||
b_weight = rand_data((size_k, size_n))
|
||||
|
||||
# Quantize (and apply act_order if provided)
|
||||
w_ref, q_w, s, g_idx, rand_perm = gptq_quantize_weights(
|
||||
b_weight, quant_type, group_size, act_order)
|
||||
|
||||
# Pack to GPTQ format
|
||||
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
|
||||
|
||||
# For act_order, sort the "weights" and "g_idx" so that group ids are
|
||||
# increasing
|
||||
sort_indices = torch.empty(0, dtype=torch.int, device=b_weight.device)
|
||||
if act_order:
|
||||
q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)
|
||||
|
||||
# Pack to Marlin format
|
||||
weight_perm = get_weight_perm(quant_type.size_bits)
|
||||
marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
|
||||
weight_perm)
|
||||
|
||||
opcheck(torch.ops._C.gptq_marlin_repack,
|
||||
(q_w_gptq, sort_indices, size_k, size_n, quant_type.size_bits))
|
||||
|
||||
# Run Marlin repack GPU kernel
|
||||
marlin_q_w_2 = ops.gptq_marlin_repack(
|
||||
q_w_gptq,
|
||||
sort_indices,
|
||||
size_k,
|
||||
size_n,
|
||||
quant_type.size_bits,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
|
||||
reason="Marlin is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
|
||||
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
|
||||
@pytest.mark.parametrize("quant_type",
|
||||
query_marlin_supported_quant_types(False))
|
||||
@pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES)
|
||||
@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
|
||||
def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, group_size,
|
||||
mnk_factors):
|
||||
m_factor, n_factor, k_factor = mnk_factors
|
||||
|
||||
size_k = k_chunk * k_factor
|
||||
size_n = n_chunk * n_factor
|
||||
|
||||
# Normalize group_size
|
||||
if group_size == -1:
|
||||
group_size = size_k
|
||||
assert group_size <= size_k
|
||||
|
||||
# Create input
|
||||
b_weight = rand_data((size_k, size_n))
|
||||
|
||||
# Quantize
|
||||
w_ref, q_w, s, zp = quantize_weights(b_weight,
|
||||
quant_type,
|
||||
group_size,
|
||||
zero_points=True)
|
||||
|
||||
# Pack to AWQ format
|
||||
q_w_awq = awq_pack(q_w, quant_type.size_bits, size_k, size_n)
|
||||
|
||||
# Pack to Marlin format
|
||||
weight_perm = get_weight_perm(quant_type.size_bits)
|
||||
marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, quant_type.size_bits,
|
||||
weight_perm)
|
||||
|
||||
opcheck(torch.ops._C.awq_marlin_repack,
|
||||
(q_w_awq, size_k, size_n, quant_type.size_bits))
|
||||
|
||||
# Run Marlin repack GPU kernel
|
||||
marlin_q_w_2 = ops.awq_marlin_repack(
|
||||
q_w_awq,
|
||||
size_k,
|
||||
size_n,
|
||||
quant_type.size_bits,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
|
||||
reason="Marlin is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
|
||||
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
|
||||
@pytest.mark.parametrize("quant_type",
|
||||
query_marlin_supported_quant_types(False))
|
||||
@pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES)
|
||||
@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
|
||||
@pytest.mark.parametrize("act_order", ACT_ORDER_OPTS)
|
||||
@pytest.mark.parametrize("is_k_full", K_FULL_OPTS)
|
||||
@pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS)
|
||||
def test_gptq_marlin_gemm(
|
||||
k_chunk,
|
||||
n_chunk,
|
||||
quant_type,
|
||||
group_size,
|
||||
mnk_factors,
|
||||
act_order,
|
||||
is_k_full,
|
||||
use_fp32_reduce,
|
||||
):
|
||||
m_factor, n_factor, k_factor = mnk_factors
|
||||
|
||||
size_m = m_factor
|
||||
size_k = k_chunk * k_factor
|
||||
size_n = n_chunk * n_factor
|
||||
|
||||
if act_order:
|
||||
if group_size == -1:
|
||||
return
|
||||
if group_size == size_k:
|
||||
return
|
||||
|
||||
a_input = rand_data((size_m, size_k))
|
||||
b_weight = rand_data((size_k, size_n))
|
||||
|
||||
w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
|
||||
b_weight, quant_type, group_size, act_order)
|
||||
|
||||
marlin_zp = marlin_make_empty_g_idx(marlin_s.device)
|
||||
|
||||
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_MAX_PARALLEL)
|
||||
|
||||
opcheck(
|
||||
torch.ops._C.gptq_marlin_gemm,
|
||||
(a_input, marlin_q_w, marlin_s, marlin_zp, g_idx, sort_indices,
|
||||
workspace.scratch, quant_type.id, a_input.shape[0], b_weight.shape[1],
|
||||
a_input.shape[1], is_k_full, False, use_fp32_reduce),
|
||||
test_utils=DEFAULT_OPCHECK_TEST_UTILS)
|
||||
|
||||
output = ops.gptq_marlin_gemm(
|
||||
a_input,
|
||||
marlin_q_w,
|
||||
marlin_s,
|
||||
marlin_zp,
|
||||
g_idx,
|
||||
sort_indices,
|
||||
workspace.scratch,
|
||||
quant_type,
|
||||
a_input.shape[0],
|
||||
b_weight.shape[1],
|
||||
a_input.shape[1],
|
||||
is_k_full=is_k_full,
|
||||
has_zp=False,
|
||||
use_fp32_reduce=use_fp32_reduce,
|
||||
)
|
||||
output_ref = torch.matmul(a_input, w_ref)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
|
||||
max_diff = compute_max_diff(output, output_ref)
|
||||
|
||||
assert max_diff < 0.04
|
||||
|
||||
|
||||
# TODO: find better way to test this?
|
||||
@torch.compile(fullgraph=True)
|
||||
def marlin_24_gemm_tester(a_input, marlin_24_q_w_comp, marlin_24_meta,
|
||||
marlin_24_s, scratch, quant_type, size_m, size_n,
|
||||
size_k):
|
||||
return ops.gptq_marlin_24_gemm(a_input, marlin_24_q_w_comp, marlin_24_meta,
|
||||
marlin_24_s, scratch, quant_type, size_m,
|
||||
size_n, size_k)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
|
||||
reason="Marlin is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("k_chunk", MARLIN_24_K_CHUNKS)
|
||||
@pytest.mark.parametrize("n_chunk", MARLIN_24_N_CHUNKS)
|
||||
@pytest.mark.parametrize("quant_type", GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
|
||||
@pytest.mark.parametrize("group_size", GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES)
|
||||
@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
|
||||
def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size,
|
||||
mnk_factors):
|
||||
m_factor, n_factor, k_factor = mnk_factors
|
||||
|
||||
size_m = m_factor
|
||||
size_k = k_chunk * k_factor
|
||||
size_n = n_chunk * n_factor
|
||||
|
||||
a_input = rand_data((size_m, size_k))
|
||||
b_weight = rand_data((size_k, size_n))
|
||||
|
||||
(w_24_ref, marlin_24_q_w_comp, marlin_24_meta,
|
||||
marlin_24_s) = marlin_24_quantize(b_weight, quant_type, group_size)
|
||||
|
||||
workspace_24 = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_24_MAX_PARALLEL)
|
||||
|
||||
output_ref = torch.matmul(a_input, w_24_ref)
|
||||
|
||||
opcheck(torch.ops._C.gptq_marlin_24_gemm,
|
||||
(a_input, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s,
|
||||
workspace_24.scratch, quant_type.id, a_input.shape[0],
|
||||
b_weight.shape[1], a_input.shape[1]),
|
||||
test_utils=DEFAULT_OPCHECK_TEST_UTILS)
|
||||
|
||||
output = marlin_24_gemm_tester(
|
||||
a_input,
|
||||
marlin_24_q_w_comp,
|
||||
marlin_24_meta,
|
||||
marlin_24_s,
|
||||
workspace_24.scratch,
|
||||
quant_type,
|
||||
a_input.shape[0],
|
||||
b_weight.shape[1],
|
||||
a_input.shape[1],
|
||||
)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
|
||||
max_diff = compute_max_diff(output, output_ref)
|
||||
|
||||
assert max_diff < 0.04
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
|
||||
reason="Marlin is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
|
||||
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
|
||||
@pytest.mark.parametrize("num_bits", [8])
|
||||
@pytest.mark.parametrize("group_size", [-1])
|
||||
@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
def test_fp8_marlin_gemm(
|
||||
k_chunk,
|
||||
n_chunk,
|
||||
num_bits,
|
||||
group_size,
|
||||
mnk_factors,
|
||||
dtype,
|
||||
):
|
||||
m_factor, n_factor, k_factor = mnk_factors
|
||||
|
||||
size_m = m_factor
|
||||
size_k = k_chunk * k_factor
|
||||
size_n = n_chunk * n_factor
|
||||
|
||||
a_input = rand_data((size_m, size_k), dtype=dtype)
|
||||
b_weight = rand_data((size_k, size_n), dtype=dtype)
|
||||
|
||||
# WEIGHTS
|
||||
fp8_weight, weight_scale = ops.scaled_fp8_quant(b_weight, scale=None)
|
||||
# Repack weights to gptq format (packed int32 elements)
|
||||
packed_gptq_qweight = pack_fp8_to_int32(fp8_weight)
|
||||
# Repack weights to marlin format
|
||||
marlin_qweight = ops.gptq_marlin_repack(
|
||||
b_q_weight=packed_gptq_qweight,
|
||||
perm=torch.empty(0, dtype=torch.int, device="cuda"),
|
||||
size_k=size_k,
|
||||
size_n=size_n,
|
||||
num_bits=8,
|
||||
)
|
||||
|
||||
# WEIGHT SCALES
|
||||
# Currently Marlin doesn't support per-tensor scales, so we
|
||||
# expand it to channelwise
|
||||
scales = weight_scale.repeat(1, size_n).to(a_input.dtype).to("cuda")
|
||||
# Permute scales
|
||||
marlin_scales = marlin_permute_scales(s=scales,
|
||||
size_k=size_k,
|
||||
size_n=size_n,
|
||||
group_size=-1)
|
||||
|
||||
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_MAX_PARALLEL)
|
||||
|
||||
opcheck(torch.ops._C.fp8_marlin_gemm,
|
||||
(a_input, marlin_qweight, marlin_scales, workspace.scratch,
|
||||
num_bits, a_input.shape[0], b_weight.shape[1], a_input.shape[1]))
|
||||
|
||||
output = ops.fp8_marlin_gemm(
|
||||
a=a_input,
|
||||
b_q_weight=marlin_qweight,
|
||||
b_scales=marlin_scales,
|
||||
workspace=workspace.scratch,
|
||||
num_bits=num_bits,
|
||||
size_m=a_input.shape[0],
|
||||
size_n=b_weight.shape[1],
|
||||
size_k=a_input.shape[1],
|
||||
)
|
||||
output_ref = torch.matmul(a_input, b_weight)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
|
||||
max_diff = compute_max_diff(output, output_ref)
|
||||
|
||||
assert max_diff < 0.04
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
|
||||
reason="Marlin is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
|
||||
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
|
||||
@pytest.mark.parametrize("quant_type",
|
||||
query_marlin_supported_quant_types(True))
|
||||
@pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES)
|
||||
@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
|
||||
@pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS)
|
||||
def test_awq_marlin_gemm(
|
||||
k_chunk,
|
||||
n_chunk,
|
||||
quant_type,
|
||||
group_size,
|
||||
mnk_factors,
|
||||
use_fp32_reduce,
|
||||
):
|
||||
m_factor, n_factor, k_factor = mnk_factors
|
||||
|
||||
size_m = m_factor
|
||||
size_k = k_chunk * k_factor
|
||||
size_n = n_chunk * n_factor
|
||||
|
||||
a_input = rand_data((size_m, size_k))
|
||||
b_weight = rand_data((size_k, size_n))
|
||||
|
||||
w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize(
|
||||
b_weight, quant_type, group_size)
|
||||
|
||||
g_idx = torch.empty(0, dtype=torch.int, device=marlin_q_w.device)
|
||||
sort_indices = torch.empty(0, dtype=torch.int, device=marlin_q_w.device)
|
||||
is_k_full = True
|
||||
has_zp = True
|
||||
|
||||
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_MAX_PARALLEL)
|
||||
|
||||
output = ops.gptq_marlin_gemm(
|
||||
a_input,
|
||||
marlin_q_w,
|
||||
marlin_s,
|
||||
marlin_zp,
|
||||
g_idx,
|
||||
sort_indices,
|
||||
workspace.scratch,
|
||||
quant_type,
|
||||
a_input.shape[0],
|
||||
b_weight.shape[1],
|
||||
a_input.shape[1],
|
||||
is_k_full=is_k_full,
|
||||
has_zp=has_zp,
|
||||
use_fp32_reduce=use_fp32_reduce,
|
||||
)
|
||||
output_ref = torch.matmul(a_input, w_ref)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
|
||||
max_diff = compute_max_diff(output, output_ref)
|
||||
|
||||
assert max_diff < 0.04
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("qqq"),
|
||||
reason="Marlin is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
|
||||
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
|
||||
@pytest.mark.parametrize("num_bits", MARLIN_QQQ_SUPPORTED_NUM_BITS)
|
||||
@pytest.mark.parametrize("group_size", MARLIN_QQQ_SUPPORTED_GROUP_SIZES)
|
||||
@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
|
||||
def test_marlin_qqq_gemm(
|
||||
k_chunk,
|
||||
n_chunk,
|
||||
num_bits,
|
||||
group_size,
|
||||
mnk_factors,
|
||||
):
|
||||
int8_traits = torch.iinfo(torch.int8)
|
||||
m_factor, n_factor, k_factor = mnk_factors
|
||||
|
||||
size_m = m_factor
|
||||
size_k = k_chunk * k_factor
|
||||
size_n = n_chunk * n_factor
|
||||
|
||||
a_input = rand_data((size_m, size_k))
|
||||
b_weight = rand_data((size_k, size_n))
|
||||
|
||||
# Quantize activations
|
||||
s_a = a_input.abs().max(dim=-1, keepdim=True)[0].div(int8_traits.max).to(
|
||||
torch.float)
|
||||
q_a = (a_input / s_a).round().clamp(int8_traits.min,
|
||||
int8_traits.max).to(torch.int8)
|
||||
|
||||
# Quantize weights
|
||||
w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel = \
|
||||
marlin_qqq_quantize(b_weight, num_bits, group_size)
|
||||
|
||||
workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N,
|
||||
MARLIN_QQQ_MAX_PARALLEL)
|
||||
|
||||
opcheck(torch.ops._C.marlin_qqq_gemm,
|
||||
(q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel,
|
||||
marlin_qqq_s_group, workspace.scratch, a_input.shape[0],
|
||||
b_weight.shape[1], a_input.shape[1]))
|
||||
|
||||
output = ops.marlin_qqq_gemm(
|
||||
q_a,
|
||||
marlin_qqq_q_w,
|
||||
s_a,
|
||||
marlin_qqq_s_channel,
|
||||
marlin_qqq_s_group,
|
||||
workspace.scratch,
|
||||
a_input.shape[0],
|
||||
b_weight.shape[1],
|
||||
a_input.shape[1],
|
||||
)
|
||||
output_ref = torch.matmul(q_a.half() * s_a.half(), w_ref)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
|
||||
max_diff = compute_max_diff(output, output_ref)
|
||||
|
||||
assert max_diff < 0.04
|
||||
|
||||
|
||||
def test_marlin_gemm_opcheck():
|
||||
size_m = 2048
|
||||
size_n = 4096
|
||||
size_k = 4096
|
||||
a = torch.rand((size_m, size_n), device='cuda', dtype=torch.float16)
|
||||
w = torch.randint(-5, 5, (256, 8192), device='cuda', dtype=torch.int32)
|
||||
s = torch.full((32, size_k), 0.125, device='cuda', dtype=torch.float16)
|
||||
wk = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_MAX_PARALLEL).scratch
|
||||
x = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k)
|
||||
y = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k)
|
||||
torch.testing.assert_close(x, y)
|
||||
opcheck(torch.ops._C.marlin_gemm, (a, w, s, wk, size_m, size_n, size_k))
|
||||
354
vllm-v0.6.2/tests/kernels/test_moe.py
Normal file
354
vllm-v0.6.2/tests/kernels/test_moe.py
Normal file
@@ -0,0 +1,354 @@
|
||||
"""Tests for the MOE layers.
|
||||
|
||||
Run `pytest tests/kernels/test_moe.py`.
|
||||
"""
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import MixtralConfig
|
||||
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
|
||||
|
||||
import vllm.model_executor.layers.fused_moe # noqa
|
||||
from tests.kernels.utils import (compute_max_diff, opcheck, stack_and_dev,
|
||||
torch_moe, torch_moe_single)
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.fused_moe import fused_moe
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import (
|
||||
fused_topk, moe_align_block_size)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
|
||||
marlin_quantize)
|
||||
from vllm.model_executor.models.mixtral import MixtralMoE
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.scalar_type import scalar_types
|
||||
|
||||
NUM_EXPERTS = [8, 64]
|
||||
TOP_KS = [2, 6]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", [1, 33, 64, 222, 1024 * 128])
|
||||
@pytest.mark.parametrize("n", [128, 1024, 2048])
|
||||
@pytest.mark.parametrize("k", [128, 511, 1024])
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
|
||||
def test_fused_moe(
|
||||
m: int,
|
||||
n: int,
|
||||
k: int,
|
||||
e: int,
|
||||
topk: int,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
|
||||
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
|
||||
|
||||
score = torch.randn((m, e), device="cuda", dtype=dtype)
|
||||
triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
|
||||
torch_output = torch_moe(a, w1, w2, score, topk)
|
||||
torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype",
|
||||
[torch.float32, torch.float16, torch.bfloat16])
|
||||
@torch.inference_mode()
|
||||
def test_mixtral_moe(dtype: torch.dtype):
|
||||
"""Make sure our Mixtral MoE implementation agrees with the one from
|
||||
huggingface."""
|
||||
|
||||
# Instantiate our and huggingface's MoE blocks
|
||||
config = MixtralConfig()
|
||||
hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda")
|
||||
vllm_moe = MixtralMoE(
|
||||
num_experts=config.num_local_experts,
|
||||
top_k=config.num_experts_per_tok,
|
||||
hidden_size=config.hidden_size,
|
||||
intermediate_size=config.intermediate_size,
|
||||
params_dtype=dtype,
|
||||
tp_size=1,
|
||||
).cuda()
|
||||
|
||||
# Load the weights
|
||||
vllm_moe.gate.weight.data[:] = hf_moe.gate.weight.data
|
||||
for i in range(config.num_local_experts):
|
||||
weights = (hf_moe.experts[i].w1.weight.data,
|
||||
hf_moe.experts[i].w3.weight.data)
|
||||
vllm_moe.experts.w13_weight[i][:] = torch.cat(weights, dim=0)
|
||||
vllm_moe.experts.w2_weight[i][:] = hf_moe.experts[i].w2.weight.data
|
||||
|
||||
# Generate input batch of dimensions [batch_size, seq_len, hidden_dim]
|
||||
hf_inputs = torch.randn((1, 64, config.hidden_size)).to(dtype).to("cuda")
|
||||
# vLLM uses 1D query [num_tokens, hidden_dim]
|
||||
vllm_inputs = hf_inputs.flatten(0, 1)
|
||||
|
||||
# Run forward passes for both MoE blocks
|
||||
hf_states, _ = hf_moe.forward(hf_inputs)
|
||||
vllm_states = vllm_moe.forward(vllm_inputs)
|
||||
|
||||
mixtral_moe_tol = {
|
||||
torch.float32: 1e-3,
|
||||
torch.float16: 1e-3,
|
||||
torch.bfloat16: 1e-2,
|
||||
}
|
||||
|
||||
torch.testing.assert_close(hf_states.flatten(0, 1),
|
||||
vllm_states,
|
||||
rtol=mixtral_moe_tol[dtype],
|
||||
atol=mixtral_moe_tol[dtype])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", [1, 33, 64, 222])
|
||||
@pytest.mark.parametrize("n", [128, 2048])
|
||||
@pytest.mark.parametrize("k", [128, 1024])
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("group_size", [-1, 32, 128])
|
||||
@pytest.mark.parametrize("act_order", [True, False])
|
||||
@pytest.mark.parametrize("num_bits", [4, 8])
|
||||
@pytest.mark.parametrize("is_k_full", [True, False])
|
||||
@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
|
||||
def test_fused_marlin_moe(
|
||||
m: int,
|
||||
n: int,
|
||||
k: int,
|
||||
e: int,
|
||||
topk: int,
|
||||
group_size: int,
|
||||
act_order: bool,
|
||||
num_bits: int,
|
||||
is_k_full: bool,
|
||||
):
|
||||
current_platform.seed_everything(7)
|
||||
|
||||
# Filter act_order
|
||||
if act_order:
|
||||
if group_size == -1:
|
||||
return
|
||||
if group_size in (k, n):
|
||||
return
|
||||
else:
|
||||
if not is_k_full:
|
||||
return
|
||||
|
||||
quant_type = (scalar_types.uint4b8
|
||||
if num_bits == 4 else scalar_types.uint8b128)
|
||||
dtype = torch.float16
|
||||
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
|
||||
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
|
||||
|
||||
w_ref1_l = []
|
||||
qweight1_l = []
|
||||
scales1_l = []
|
||||
g_idx1_l = []
|
||||
sort_indices1_l = []
|
||||
|
||||
for i in range(w1.shape[0]):
|
||||
test_perm = torch.randperm(k)
|
||||
w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize(
|
||||
w1[i].transpose(1, 0), quant_type, group_size, act_order,
|
||||
test_perm)
|
||||
w_ref1_l.append(w_ref1)
|
||||
qweight1_l.append(qweight1)
|
||||
scales1_l.append(scales1)
|
||||
g_idx1_l.append(g_idx1)
|
||||
sort_indices1_l.append(sort_indices1)
|
||||
|
||||
w_ref1 = stack_and_dev(w_ref1_l)
|
||||
qweight1 = stack_and_dev(qweight1_l).contiguous()
|
||||
scales1 = stack_and_dev(scales1_l)
|
||||
g_idx1 = stack_and_dev(g_idx1_l)
|
||||
sort_indices1 = stack_and_dev(sort_indices1_l)
|
||||
|
||||
w_ref2_l = []
|
||||
qweight2_l = []
|
||||
scales2_l = []
|
||||
g_idx2_l = []
|
||||
sort_indices2_l = []
|
||||
|
||||
for i in range(w2.shape[0]):
|
||||
test_perm = torch.randperm(n)
|
||||
w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize(
|
||||
w2[i].transpose(1, 0), quant_type, group_size, act_order,
|
||||
test_perm)
|
||||
w_ref2_l.append(w_ref2)
|
||||
qweight2_l.append(qweight2)
|
||||
scales2_l.append(scales2)
|
||||
g_idx2_l.append(g_idx2)
|
||||
sort_indices2_l.append(sort_indices2)
|
||||
|
||||
w_ref2 = stack_and_dev(w_ref2_l)
|
||||
qweight2 = stack_and_dev(qweight2_l).contiguous()
|
||||
scales2 = stack_and_dev(scales2_l)
|
||||
g_idx2 = stack_and_dev(g_idx2_l)
|
||||
sort_indices2 = stack_and_dev(sort_indices2_l)
|
||||
|
||||
score = torch.randn((m, e), device="cuda", dtype=dtype)
|
||||
|
||||
topk_weights, topk_ids = fused_topk(a, score, topk, False)
|
||||
|
||||
triton_output = fused_moe(
|
||||
a,
|
||||
w_ref1.transpose(1, 2).contiguous(),
|
||||
w_ref2.transpose(1, 2).contiguous(),
|
||||
score,
|
||||
topk,
|
||||
renormalize=False,
|
||||
)
|
||||
marlin_output = torch.ops.vllm.fused_marlin_moe(
|
||||
a,
|
||||
qweight1,
|
||||
qweight2,
|
||||
scales1,
|
||||
scales2,
|
||||
score,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
g_idx1=g_idx1,
|
||||
g_idx2=g_idx2,
|
||||
sort_indices1=sort_indices1,
|
||||
sort_indices2=sort_indices2,
|
||||
num_bits=num_bits,
|
||||
is_k_full=is_k_full,
|
||||
)
|
||||
|
||||
assert compute_max_diff(marlin_output, triton_output) < 4e-2
|
||||
|
||||
if ops.supports_moe_ops:
|
||||
token_expert_indicies = torch.empty(m,
|
||||
topk,
|
||||
dtype=torch.int32,
|
||||
device=a.device)
|
||||
|
||||
opcheck(torch.ops._moe_C.topk_softmax, (
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
token_expert_indicies,
|
||||
score.float(),
|
||||
))
|
||||
|
||||
block_size_m = 4
|
||||
|
||||
sorted_token_ids, _, _ = moe_align_block_size(topk_ids, block_size_m,
|
||||
e)
|
||||
|
||||
max_workspace_size = ((m + 255) // 256) * (max(2 * n, k) // 64) * 16
|
||||
workspace = torch.zeros(max_workspace_size,
|
||||
dtype=torch.int,
|
||||
device="cuda",
|
||||
requires_grad=False)
|
||||
|
||||
zp = torch.empty((0, 0),
|
||||
dtype=dtype,
|
||||
device="cuda",
|
||||
requires_grad=False)
|
||||
opcheck(torch.ops._moe_C.marlin_gemm_moe,
|
||||
(a, qweight1, sorted_token_ids, topk_weights, topk_ids,
|
||||
scales1, zp, g_idx1, sort_indices1, workspace, quant_type.id,
|
||||
m, 2 * n, k, True, e, topk, block_size_m, True, False))
|
||||
|
||||
|
||||
@pytest.mark.skip("This test is here for the sake of debugging, "
|
||||
"don't run it in automated tests.")
|
||||
@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
|
||||
@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
|
||||
@pytest.mark.parametrize("k", [128, 1024, 512])
|
||||
@pytest.mark.parametrize("e", [8, 64])
|
||||
@pytest.mark.parametrize("topk", [2, 6])
|
||||
@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
|
||||
@pytest.mark.parametrize("act_order", [True, False])
|
||||
@pytest.mark.parametrize("num_bits", [4, 8])
|
||||
@pytest.mark.parametrize("is_k_full", [True, False])
|
||||
@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
|
||||
def test_single_marlin_moe_multiply(
|
||||
m: int,
|
||||
n: int,
|
||||
k: int,
|
||||
e: int,
|
||||
topk: int,
|
||||
group_size: int,
|
||||
act_order: bool,
|
||||
num_bits: int,
|
||||
is_k_full: bool,
|
||||
):
|
||||
|
||||
# Filter act_order
|
||||
if act_order:
|
||||
if group_size == -1:
|
||||
return
|
||||
if group_size == k:
|
||||
return
|
||||
else:
|
||||
if not is_k_full:
|
||||
return
|
||||
|
||||
quant_type = (scalar_types.uint4b8
|
||||
if num_bits == 4 else scalar_types.uint8b128)
|
||||
dtype = torch.float16
|
||||
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||
w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
|
||||
|
||||
w_ref_l = []
|
||||
qweights_l = []
|
||||
scales_l = []
|
||||
g_idx_l = []
|
||||
sort_indices_l = []
|
||||
|
||||
for i in range(w.shape[0]):
|
||||
test_perm = torch.randperm(k)
|
||||
w_ref, qweight, scales, g_idx, sort_indices, _ = marlin_quantize(
|
||||
w[i].transpose(1, 0), quant_type, group_size, act_order, test_perm)
|
||||
w_ref_l.append(w_ref)
|
||||
qweights_l.append(qweight)
|
||||
scales_l.append(scales)
|
||||
g_idx_l.append(g_idx)
|
||||
sort_indices_l.append(sort_indices)
|
||||
|
||||
w_ref = stack_and_dev(w_ref_l)
|
||||
qweight = stack_and_dev(qweights_l).contiguous()
|
||||
scales = stack_and_dev(scales_l)
|
||||
g_idx = stack_and_dev(g_idx_l)
|
||||
sort_indices = stack_and_dev(sort_indices_l)
|
||||
|
||||
score = torch.randn((m, e), device="cuda", dtype=dtype)
|
||||
marlin_output = torch.ops.vllm.single_marlin_moe(
|
||||
a,
|
||||
qweight,
|
||||
scales,
|
||||
score,
|
||||
topk,
|
||||
renormalize=False,
|
||||
g_idx=g_idx,
|
||||
sort_indices=sort_indices,
|
||||
num_bits=num_bits,
|
||||
is_k_full=is_k_full,
|
||||
)
|
||||
|
||||
torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
|
||||
|
||||
assert compute_max_diff(marlin_output, torch_output) < 1e-2
|
||||
|
||||
|
||||
def test_moe_align_block_size_opcheck():
|
||||
num_experts = 4
|
||||
block_size = 4
|
||||
topk_ids = torch.randint(0,
|
||||
num_experts, (3, 4),
|
||||
dtype=torch.int32,
|
||||
device='cuda')
|
||||
|
||||
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
|
||||
sorted_ids = torch.empty((max_num_tokens_padded, ),
|
||||
dtype=torch.int32,
|
||||
device=topk_ids.device)
|
||||
sorted_ids.fill_(topk_ids.numel())
|
||||
max_num_m_blocks = max_num_tokens_padded // block_size
|
||||
expert_ids = torch.empty((max_num_m_blocks, ),
|
||||
dtype=torch.int32,
|
||||
device=topk_ids.device)
|
||||
num_tokens_post_pad = torch.empty((1),
|
||||
dtype=torch.int32,
|
||||
device=topk_ids.device)
|
||||
|
||||
opcheck(torch.ops._moe_C.moe_align_block_size,
|
||||
(topk_ids, num_experts, block_size, sorted_ids, expert_ids,
|
||||
num_tokens_post_pad))
|
||||
15
vllm-v0.6.2/tests/kernels/test_permute_cols.py
Normal file
15
vllm-v0.6.2/tests/kernels/test_permute_cols.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm._custom_ops import permute_cols
|
||||
|
||||
|
||||
@pytest.mark.parametrize('shape', [(1, 512), (544, 4096), (67, 8192)])
|
||||
@pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16])
|
||||
def test_permute_cols(shape, dtype):
|
||||
x = torch.randn(shape, dtype=dtype).cuda()
|
||||
perm = torch.randperm(x.shape[1]).to(torch.int).cuda()
|
||||
opcheck(torch.ops._C.permute_cols, (x, perm))
|
||||
y = permute_cols(x, perm)
|
||||
torch.testing.assert_close(y, x[:, perm])
|
||||
244
vllm-v0.6.2/tests/kernels/test_pos_encoding.py
Normal file
244
vllm-v0.6.2/tests/kernels/test_pos_encoding.py
Normal file
@@ -0,0 +1,244 @@
|
||||
from itertools import accumulate, product
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from .allclose_default import get_default_atol, get_default_rtol
|
||||
|
||||
IS_NEOX_STYLE = [True, False]
|
||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||
HEAD_SIZES = [64, 80, 112, 120, 256]
|
||||
ROTARY_DIMS = [None, 32] # None means rotary dim == head size
|
||||
NUM_HEADS = [17] # Arbitrary values for testing
|
||||
BATCH_SIZES = [5] # Arbitrary values for testing
|
||||
SEQ_LENS = [11, 8192] # Arbitrary values for testing
|
||||
SEEDS = [0]
|
||||
CUDA_DEVICES = [
|
||||
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
|
||||
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
|
||||
@pytest.mark.parametrize("seq_len", SEQ_LENS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@torch.inference_mode()
|
||||
def test_rotary_embedding(
|
||||
is_neox_style: bool,
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
rotary_dim: Optional[int],
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
max_position: int = 8192,
|
||||
base: int = 10000,
|
||||
) -> None:
|
||||
if rotary_dim is None:
|
||||
rotary_dim = head_size
|
||||
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
if rotary_dim is None:
|
||||
rotary_dim = head_size
|
||||
rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
|
||||
rope = rope.to(dtype=dtype)
|
||||
|
||||
positions = torch.randint(0, max_position, (batch_size, seq_len))
|
||||
query = torch.randn(batch_size,
|
||||
seq_len,
|
||||
num_heads * head_size,
|
||||
dtype=dtype)
|
||||
key = torch.randn_like(query)
|
||||
|
||||
# NOTE(woosuk): The reference implementation should be executed first
|
||||
# because the custom kernel is in-place.
|
||||
ref_query, ref_key = rope.forward_native(positions, query, key)
|
||||
out_query, out_key = rope.forward(positions, query, key)
|
||||
# Compare the results.
|
||||
torch.testing.assert_close(out_query,
|
||||
ref_query,
|
||||
atol=get_default_atol(out_query),
|
||||
rtol=get_default_rtol(out_query))
|
||||
torch.testing.assert_close(out_key,
|
||||
ref_key,
|
||||
atol=get_default_atol(out_key),
|
||||
rtol=get_default_rtol(out_key))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
|
||||
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
|
||||
@pytest.mark.parametrize("seq_len", SEQ_LENS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@torch.inference_mode()
|
||||
def test_batched_rotary_embedding(
|
||||
is_neox_style: bool,
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
rotary_dim: Optional[int],
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
max_position: int = 8192,
|
||||
base: int = 10000,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
if rotary_dim is None:
|
||||
rotary_dim = head_size
|
||||
rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, {
|
||||
"rope_type": "linear",
|
||||
"factor": (1, )
|
||||
})
|
||||
rope = rope.to(dtype=dtype)
|
||||
|
||||
positions = torch.randint(0, max_position, (batch_size, seq_len))
|
||||
query = torch.randn(batch_size,
|
||||
seq_len,
|
||||
num_heads * head_size,
|
||||
dtype=dtype)
|
||||
key = torch.randn_like(query)
|
||||
|
||||
# NOTE(woosuk): The reference implementation should be executed first
|
||||
# because the custom kernel is in-place.
|
||||
ref_query, ref_key = rope.forward_native(positions, query, key)
|
||||
out_query, out_key = rope.forward(positions,
|
||||
query,
|
||||
key,
|
||||
offsets=torch.zeros(batch_size * seq_len,
|
||||
dtype=torch.long,
|
||||
device=device))
|
||||
# Compare the results.
|
||||
torch.testing.assert_close(out_query,
|
||||
ref_query,
|
||||
atol=get_default_atol(out_query),
|
||||
rtol=get_default_rtol(out_query))
|
||||
torch.testing.assert_close(out_key,
|
||||
ref_key,
|
||||
atol=get_default_atol(out_key),
|
||||
rtol=get_default_rtol(out_key))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
|
||||
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
|
||||
@pytest.mark.parametrize("seq_len", SEQ_LENS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@torch.inference_mode()
|
||||
def test_batched_rotary_embedding_multi_lora(
|
||||
is_neox_style: bool,
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
rotary_dim: Optional[int],
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
max_position: int = 8192,
|
||||
base: int = 10000,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
if rotary_dim is None:
|
||||
rotary_dim = head_size
|
||||
scaling_factors: List[int] = [1, 2, 4]
|
||||
rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, {
|
||||
"rope_type": "linear",
|
||||
"factor": tuple(scaling_factors)
|
||||
})
|
||||
rope = rope.to(dtype=dtype)
|
||||
|
||||
positions = torch.randint(0, max_position, (batch_size, seq_len))
|
||||
query = torch.randn(batch_size,
|
||||
seq_len,
|
||||
num_heads * head_size,
|
||||
dtype=dtype)
|
||||
key = torch.randn_like(query)
|
||||
|
||||
offset_map = torch.tensor(
|
||||
list(
|
||||
accumulate([0] + [
|
||||
max_position * scaling_factor * 2
|
||||
for scaling_factor in scaling_factors[:-1]
|
||||
])))
|
||||
query_types = torch.randint(0,
|
||||
len(scaling_factors), (batch_size, seq_len),
|
||||
device=device)
|
||||
query_offsets = offset_map[query_types]
|
||||
|
||||
# NOTE(woosuk): The reference implementation should be executed first
|
||||
# because the custom kernel is in-place.
|
||||
ref_query, ref_key = rope.forward_native(positions, query, key,
|
||||
query_offsets)
|
||||
out_query, out_key = rope.forward(positions, query, key,
|
||||
query_offsets.flatten())
|
||||
# Compare the results.
|
||||
torch.testing.assert_close(out_query,
|
||||
ref_query,
|
||||
atol=get_default_atol(out_query),
|
||||
rtol=get_default_rtol(out_query))
|
||||
torch.testing.assert_close(out_key,
|
||||
ref_key,
|
||||
atol=get_default_atol(out_key),
|
||||
rtol=get_default_rtol(out_key))
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def test_rope_module_cache():
|
||||
MAX_POSITIONS = [123, 1234]
|
||||
BASES = [10000, 1000000]
|
||||
ROPE_SCALINGS = (None, {
|
||||
"rope_type": "linear",
|
||||
"factor": (1, )
|
||||
}, {
|
||||
"rope_type": "dynamic",
|
||||
"factor": 1
|
||||
})
|
||||
settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE,
|
||||
ROPE_SCALINGS, DTYPES)
|
||||
rope_setting_id_map: Dict[str, int] = {}
|
||||
for setting in product(*settings):
|
||||
head_size, rotary_dim, max_position, base, \
|
||||
is_neox_stype, rope_scaling, dtype = setting
|
||||
if rotary_dim is None:
|
||||
rotary_dim = head_size
|
||||
rope = get_rope(head_size, rotary_dim, max_position, base,
|
||||
is_neox_stype, rope_scaling, dtype)
|
||||
# different settings cannot share the same rope module
|
||||
assert id(rope) not in rope_setting_id_map.values()
|
||||
assert all(x.dtype == dtype for x in rope.buffers())
|
||||
assert all(x.dtype == dtype for x in rope.parameters())
|
||||
rope_setting_id_map[str(setting)] = id(rope)
|
||||
|
||||
for setting in product(*settings):
|
||||
head_size, rotary_dim, max_position, base, \
|
||||
is_neox_stype, rope_scaling, dtype = setting
|
||||
if rotary_dim is None:
|
||||
rotary_dim = head_size
|
||||
rope = get_rope(head_size, rotary_dim, max_position, base,
|
||||
is_neox_stype, rope_scaling, dtype)
|
||||
# check if cache take effect
|
||||
assert id(rope) == rope_setting_id_map[str(setting)]
|
||||
464
vllm-v0.6.2/tests/kernels/test_prefix_prefill.py
Normal file
464
vllm-v0.6.2/tests/kernels/test_prefix_prefill.py
Normal file
@@ -0,0 +1,464 @@
|
||||
import math
|
||||
import random
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from xformers import ops as xops
|
||||
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
|
||||
|
||||
from vllm.attention.backends.xformers import _make_alibi_bias
|
||||
from vllm.attention.ops.prefix_prefill import context_attention_fwd
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
NUM_HEADS = [64]
|
||||
NUM_QUERIES_PER_KV = [1, 8, 64]
|
||||
HEAD_SIZES = [128, 96, 24]
|
||||
DTYPES = [torch.float16]
|
||||
CUDA_DEVICES = [
|
||||
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||
]
|
||||
SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048]
|
||||
KV_CACHE_DTYPES = ["auto", "fp8", "fp8_e5m2"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.parametrize("sliding_window", SLIDING_WINDOW)
|
||||
@torch.inference_mode()
|
||||
def test_contexted_kv_attention(
|
||||
num_heads: int,
|
||||
num_queries_per_kv: int,
|
||||
head_size: int,
|
||||
sliding_window: int,
|
||||
dtype: torch.dtype,
|
||||
kv_cache_dtype: str,
|
||||
device: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(0)
|
||||
torch.set_default_device(device)
|
||||
|
||||
# Need this, otherwise when we capture the graph the process
|
||||
# for GPU 1 would run on both GPU0 and GPU1 and things would hang
|
||||
#
|
||||
# see also similar issue: https://github.com/Dao-AILab/flash-attention/issues/523
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
MAX_SEQ_LEN = 1024
|
||||
MAX_CTX_LEN = 1024
|
||||
BS = 10
|
||||
cache_size = 640
|
||||
block_size = 32
|
||||
max_block_per_request = 64
|
||||
query_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
|
||||
ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)]
|
||||
seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
|
||||
num_kv_heads = num_heads // num_queries_per_kv
|
||||
|
||||
num_tokens = sum(query_lens)
|
||||
query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
|
||||
query.uniform_(-1e-3, 1e-3)
|
||||
output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
|
||||
|
||||
kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
|
||||
kv.uniform_(-1e-3, 1e-3)
|
||||
key, value = kv.unbind(dim=1)
|
||||
|
||||
if kv_cache_dtype == "auto":
|
||||
cache_dtype = dtype
|
||||
else:
|
||||
cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]
|
||||
k_cache = torch.zeros(cache_size,
|
||||
block_size,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype=cache_dtype)
|
||||
v_cache = torch.zeros(cache_size,
|
||||
block_size,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype=cache_dtype)
|
||||
k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
|
||||
v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
|
||||
values = torch.arange(0, cache_size, dtype=torch.long)
|
||||
values = values[torch.randperm(cache_size)]
|
||||
block_table = values[:BS * max_block_per_request].view(
|
||||
BS, max_block_per_request)
|
||||
b_seq_len = torch.tensor(seq_lens, dtype=torch.long)
|
||||
b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
|
||||
b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1],
|
||||
dtype=torch.long),
|
||||
dim=0)
|
||||
max_input_len = MAX_SEQ_LEN
|
||||
# copy kv to cache
|
||||
b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
|
||||
dtype=torch.long),
|
||||
dim=0)
|
||||
for i in range(BS):
|
||||
for j in range(query_lens[i]):
|
||||
k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
|
||||
j])
|
||||
v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
|
||||
b_ctx_len[i] + j])
|
||||
cur_ctx = 0
|
||||
block_id = 0
|
||||
while cur_ctx < b_ctx_len[i]:
|
||||
start_loc = b_seq_start_loc[i] + cur_ctx
|
||||
if cur_ctx + block_size > b_ctx_len[i]:
|
||||
end_loc = b_seq_start_loc[i] + b_ctx_len[i]
|
||||
else:
|
||||
end_loc = start_loc + block_size
|
||||
start_slot = block_table[i, block_id] * block_size
|
||||
end_slot = start_slot + end_loc - start_loc
|
||||
k_cache.view(-1, num_kv_heads,
|
||||
head_size)[start_slot:end_slot].copy_(
|
||||
key[start_loc:end_loc])
|
||||
v_cache.view(-1, num_kv_heads,
|
||||
head_size)[start_slot:end_slot].copy_(
|
||||
value[start_loc:end_loc])
|
||||
cur_ctx += block_size
|
||||
block_id += 1
|
||||
# transpose K_cache[num_blocks, block_size, num_kv_heads, head_size]
|
||||
# to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8]
|
||||
k_cache = k_cache.view(-1, block_size, num_kv_heads, head_size // 8,
|
||||
8).permute(0, 2, 3, 1, 4).contiguous()
|
||||
# transpose V_cache[num_blocks, block_size, num_kv_heads, head_size]
|
||||
# to V_cache[num_blocks, num_kv_heads, head_size, block_size]
|
||||
v_cache = v_cache.view(-1, block_size, num_kv_heads,
|
||||
head_size).permute(0, 2, 3, 1).contiguous()
|
||||
|
||||
# Warm up the Triton kernel by calling it once before actually measuring
|
||||
# generation time
|
||||
context_attention_fwd(query,
|
||||
k,
|
||||
v,
|
||||
output,
|
||||
kv_cache_dtype,
|
||||
k_cache,
|
||||
v_cache,
|
||||
block_table,
|
||||
b_start_loc,
|
||||
b_seq_len,
|
||||
b_ctx_len,
|
||||
max_input_len,
|
||||
sliding_window=sliding_window)
|
||||
torch.cuda.synchronize()
|
||||
start_time = time.time()
|
||||
context_attention_fwd(query,
|
||||
k,
|
||||
v,
|
||||
output,
|
||||
kv_cache_dtype,
|
||||
k_cache,
|
||||
v_cache,
|
||||
block_table,
|
||||
b_start_loc,
|
||||
b_seq_len,
|
||||
b_ctx_len,
|
||||
max_input_len,
|
||||
sliding_window=sliding_window)
|
||||
torch.cuda.synchronize()
|
||||
end_time = time.time()
|
||||
print(f"triton Time: {(end_time - start_time)*1000:.2f} ms")
|
||||
|
||||
scale = float(1.0 / (head_size**0.5))
|
||||
|
||||
attn_op = xops.fmha.cutlass.FwOp()
|
||||
|
||||
if num_kv_heads != num_heads:
|
||||
# As of Nov 2023, xformers only supports MHA. For MQA/GQA,
|
||||
# project the key and value tensors to the desired number of
|
||||
# heads.
|
||||
#
|
||||
# see also: vllm/model_executor/layers/attention.py
|
||||
query = query.view(query.shape[0], num_kv_heads, num_queries_per_kv,
|
||||
query.shape[-1])
|
||||
key = key[:, :, None, :].expand(key.shape[0], num_kv_heads,
|
||||
num_queries_per_kv, key.shape[-1])
|
||||
value = value[:, :,
|
||||
None, :].expand(value.shape[0], num_kv_heads,
|
||||
num_queries_per_kv, value.shape[-1])
|
||||
query = query.unsqueeze(0)
|
||||
key = key.unsqueeze(0)
|
||||
value = value.unsqueeze(0)
|
||||
|
||||
attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
|
||||
query_lens, seq_lens)
|
||||
if sliding_window > 0:
|
||||
attn_bias = attn_bias.make_local_attention_from_bottomright(
|
||||
sliding_window)
|
||||
output_ref = xops.memory_efficient_attention_forward(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
attn_bias=attn_bias,
|
||||
p=0.0,
|
||||
scale=scale,
|
||||
op=attn_op,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
start_time = time.time()
|
||||
output_ref = xops.memory_efficient_attention_forward(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
attn_bias=attn_bias,
|
||||
p=0.0,
|
||||
scale=scale,
|
||||
op=attn_op,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
end_time = time.time()
|
||||
print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms")
|
||||
output_ref = output_ref.reshape(output.shape)
|
||||
atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6
|
||||
torch.testing.assert_close(output, output_ref, atol=atol, rtol=0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@torch.inference_mode()
|
||||
def test_contexted_kv_attention_alibi(
|
||||
num_heads: int,
|
||||
num_queries_per_kv: int,
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
kv_cache_dtype: str,
|
||||
device: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(0)
|
||||
torch.set_default_device(device)
|
||||
|
||||
# Need this, otherwise when we capture the graph the process
|
||||
# for GPU 1 would run on both GPU0 and GPU1 and things would hang
|
||||
#
|
||||
# see also similar issue: https://github.com/Dao-AILab/flash-attention/issues/523
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
|
||||
# Fork from: vllm/vllm/model_executor/models/bloom.py#L44
|
||||
closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
|
||||
base = torch.tensor(
|
||||
2**(-(2**-(math.log2(closest_power_of_2) - 3))),
|
||||
dtype=torch.float32,
|
||||
)
|
||||
powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
|
||||
slopes = torch.pow(base, powers)
|
||||
|
||||
if closest_power_of_2 != total_num_heads:
|
||||
extra_base = torch.tensor(
|
||||
2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
|
||||
dtype=torch.float32,
|
||||
)
|
||||
num_remaining_heads = min(closest_power_of_2,
|
||||
total_num_heads - closest_power_of_2)
|
||||
extra_powers = torch.arange(start=1,
|
||||
end=1 + 2 * num_remaining_heads,
|
||||
step=2,
|
||||
dtype=torch.int32)
|
||||
slopes = torch.cat(
|
||||
[slopes, torch.pow(extra_base, extra_powers)], dim=0)
|
||||
return slopes
|
||||
|
||||
alibi_slopes = _get_alibi_slopes(num_heads).to(device)
|
||||
|
||||
MAX_SEQ_LEN = 1024
|
||||
MAX_CTX_LEN = 1024
|
||||
BS = 10
|
||||
cache_size = 640
|
||||
block_size = 32
|
||||
max_block_per_request = 64
|
||||
query_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
|
||||
ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)]
|
||||
seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
|
||||
num_kv_heads = num_heads // num_queries_per_kv
|
||||
|
||||
num_tokens = sum(query_lens)
|
||||
query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
|
||||
query.uniform_(-1e-3, 1e-3)
|
||||
output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
|
||||
|
||||
kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
|
||||
kv.uniform_(-1e-3, 1e-3)
|
||||
key, value = kv.unbind(dim=1)
|
||||
if kv_cache_dtype == "auto":
|
||||
cache_dtype = dtype
|
||||
else:
|
||||
cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]
|
||||
k_cache = torch.zeros(cache_size,
|
||||
block_size,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype=cache_dtype)
|
||||
v_cache = torch.zeros(cache_size,
|
||||
block_size,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype=cache_dtype)
|
||||
k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
|
||||
v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
|
||||
values = torch.arange(0, cache_size, dtype=torch.long)
|
||||
values = values[torch.randperm(cache_size)]
|
||||
block_table = values[:BS * max_block_per_request].view(
|
||||
BS, max_block_per_request)
|
||||
b_seq_len = torch.tensor(seq_lens, dtype=torch.long)
|
||||
b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
|
||||
b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1],
|
||||
dtype=torch.long),
|
||||
dim=0)
|
||||
max_input_len = MAX_SEQ_LEN
|
||||
# copy kv to cache
|
||||
b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
|
||||
dtype=torch.long),
|
||||
dim=0)
|
||||
for i in range(BS):
|
||||
for j in range(query_lens[i]):
|
||||
k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
|
||||
j])
|
||||
v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
|
||||
b_ctx_len[i] + j])
|
||||
cur_ctx = 0
|
||||
block_id = 0
|
||||
while cur_ctx < b_ctx_len[i]:
|
||||
start_loc = b_seq_start_loc[i] + cur_ctx
|
||||
if cur_ctx + block_size > b_ctx_len[i]:
|
||||
end_loc = b_seq_start_loc[i] + b_ctx_len[i]
|
||||
else:
|
||||
end_loc = start_loc + block_size
|
||||
start_slot = block_table[i, block_id] * block_size
|
||||
end_slot = start_slot + end_loc - start_loc
|
||||
k_cache.view(-1, num_kv_heads,
|
||||
head_size)[start_slot:end_slot].copy_(
|
||||
key[start_loc:end_loc])
|
||||
v_cache.view(-1, num_kv_heads,
|
||||
head_size)[start_slot:end_slot].copy_(
|
||||
value[start_loc:end_loc])
|
||||
cur_ctx += block_size
|
||||
block_id += 1
|
||||
# transpose K_cache[num_blocks, block_size, num_kv_heads, head_size]
|
||||
# to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8]
|
||||
k_cache = k_cache.view(-1, block_size, num_kv_heads, head_size // 8,
|
||||
8).permute(0, 2, 3, 1, 4).contiguous()
|
||||
# transpose V_cache[num_blocks, block_size, num_kv_heads, head_size]
|
||||
# to V_cache[num_blocks, num_kv_heads, head_size, block_size]
|
||||
v_cache = v_cache.view(-1, block_size, num_kv_heads,
|
||||
head_size).permute(0, 2, 3, 1).contiguous()
|
||||
|
||||
# Warm up the Triton kernel by calling it once before actually measuring
|
||||
# generation time
|
||||
context_attention_fwd(query,
|
||||
k,
|
||||
v,
|
||||
output,
|
||||
kv_cache_dtype,
|
||||
k_cache,
|
||||
v_cache,
|
||||
block_table,
|
||||
b_start_loc,
|
||||
b_seq_len,
|
||||
b_ctx_len,
|
||||
max_input_len,
|
||||
alibi_slopes=alibi_slopes)
|
||||
torch.cuda.synchronize()
|
||||
start_time = time.time()
|
||||
context_attention_fwd(query,
|
||||
k,
|
||||
v,
|
||||
output,
|
||||
kv_cache_dtype,
|
||||
k_cache,
|
||||
v_cache,
|
||||
block_table,
|
||||
b_start_loc,
|
||||
b_seq_len,
|
||||
b_ctx_len,
|
||||
max_input_len,
|
||||
alibi_slopes=alibi_slopes)
|
||||
torch.cuda.synchronize()
|
||||
end_time = time.time()
|
||||
print(f"triton Time: {(end_time - start_time)*1000:.2f} ms")
|
||||
scale = float(1.0 / (head_size**0.5))
|
||||
|
||||
# NOTE(DefTruth): In order to reuse _make_alibi_bias function,
|
||||
# we have to pad query tensor before MQA/GQA expanding.
|
||||
if query.shape[0] != key.shape[0]:
|
||||
query_pad = torch.empty(sum(seq_lens),
|
||||
num_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
query_pad.uniform_(-1e-3, 1e-3)
|
||||
seq_start = 0
|
||||
query_start = 0
|
||||
for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)):
|
||||
seq_end = seq_start + seq_len
|
||||
query_end = query_start + query_len
|
||||
query_pad[seq_start:seq_end, ...] = torch.cat([
|
||||
torch.zeros(
|
||||
seq_len - query_len, num_heads, head_size, dtype=dtype),
|
||||
query[query_start:query_end, ...]
|
||||
],
|
||||
dim=0)
|
||||
seq_start += seq_len
|
||||
query_start += query_len
|
||||
query = query_pad
|
||||
|
||||
if num_kv_heads != num_heads:
|
||||
# As of Nov 2023, xformers only supports MHA. For MQA/GQA,
|
||||
# project the key and value tensors to the desired number of
|
||||
# heads.
|
||||
#
|
||||
# see also: vllm/model_executor/layers/attention.py
|
||||
query = query.view(query.shape[0], num_kv_heads, num_queries_per_kv,
|
||||
query.shape[-1])
|
||||
key = key[:, :, None, :].expand(key.shape[0], num_kv_heads,
|
||||
num_queries_per_kv, key.shape[-1])
|
||||
value = value[:, :,
|
||||
None, :].expand(value.shape[0], num_kv_heads,
|
||||
num_queries_per_kv, value.shape[-1])
|
||||
|
||||
query = query.unsqueeze(0)
|
||||
key = key.unsqueeze(0)
|
||||
value = value.unsqueeze(0)
|
||||
|
||||
attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
|
||||
output_ref = torch.empty_like(output)
|
||||
seq_start = 0
|
||||
query_start = 0
|
||||
start_time = time.time()
|
||||
# Attention with alibi slopes.
|
||||
# FIXME(DefTruth): Because xformers does not support dynamic sequence
|
||||
# lengths with custom attention bias, we process each prompt one by
|
||||
# one. This is inefficient, especially when we have many short prompts.
|
||||
# modified from: vllm/attention/backends/xformers.py#L343
|
||||
for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)):
|
||||
seq_end = seq_start + seq_len
|
||||
query_end = query_start + query_len
|
||||
out = xops.memory_efficient_attention_forward(query[:,
|
||||
seq_start:seq_end],
|
||||
key[:,
|
||||
seq_start:seq_end],
|
||||
value[:,
|
||||
seq_start:seq_end],
|
||||
attn_bias=attn_bias[i],
|
||||
p=0.0,
|
||||
scale=scale)
|
||||
out = out.view_as(query[:, seq_start:seq_end]).view(
|
||||
seq_len, num_heads, head_size)
|
||||
output_ref[query_start:query_end, ...].copy_(out[seq_len - query_len:,
|
||||
...])
|
||||
seq_start += seq_len
|
||||
query_start += query_len
|
||||
torch.cuda.synchronize()
|
||||
end_time = time.time()
|
||||
print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms")
|
||||
atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6
|
||||
torch.testing.assert_close(output, output_ref, atol=atol, rtol=0)
|
||||
62
vllm-v0.6.2/tests/kernels/test_rotary_embedding.py
Normal file
62
vllm-v0.6.2/tests/kernels/test_rotary_embedding.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""
|
||||
Tests for miscellaneous utilities
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
|
||||
|
||||
|
||||
def rotary_embedding_opcheck(rot,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
offsets: Optional[torch.Tensor] = None):
|
||||
cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)
|
||||
|
||||
# ops.rotary_embedding()/batched_rotary_embedding()
|
||||
# are in-place operations that update the query and key tensors.
|
||||
if offsets is not None:
|
||||
opcheck(torch.ops._C.batched_rotary_embedding,
|
||||
(positions, query, key, rot.head_size, cos_sin_cache,
|
||||
rot.is_neox_style, rot.rotary_dim, offsets))
|
||||
else:
|
||||
opcheck(torch.ops._C.rotary_embedding,
|
||||
(positions, query, key, rot.head_size, cos_sin_cache,
|
||||
rot.is_neox_style))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", ["cuda"])
|
||||
@pytest.mark.parametrize("max_position", [11, 4096, 32768])
|
||||
@pytest.mark.parametrize("is_neox_style", [True, False])
|
||||
@pytest.mark.parametrize("rotary_dim", [32])
|
||||
@pytest.mark.parametrize("head_size", [32, 108])
|
||||
@pytest.mark.parametrize("seq_len", [11, 1024])
|
||||
def test_rotary_embedding_opcheck(dist_init, device, max_position,
|
||||
is_neox_style, rotary_dim, head_size,
|
||||
seq_len):
|
||||
batch_size = 1
|
||||
base = 0
|
||||
num_heads = 7
|
||||
rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
|
||||
is_neox_style, torch.float32)
|
||||
|
||||
positions = torch.randint(0,
|
||||
max_position, (batch_size, seq_len),
|
||||
device=device)
|
||||
query = torch.randn(batch_size,
|
||||
seq_len,
|
||||
num_heads * head_size,
|
||||
dtype=torch.float32,
|
||||
device=device)
|
||||
key = torch.randn_like(query)
|
||||
|
||||
rotary_embedding_opcheck(rot, positions, query, key)
|
||||
offsets = torch.zeros(batch_size * seq_len,
|
||||
device=device,
|
||||
dtype=torch.long)
|
||||
rotary_embedding_opcheck(rot, positions, query, key, offsets)
|
||||
106
vllm-v0.6.2/tests/kernels/test_triton_scaled_mm.py
Normal file
106
vllm-v0.6.2/tests/kernels/test_triton_scaled_mm.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""Tests for the triton_scaled_mm kernel
|
||||
|
||||
Run `pytest tests/kernels/test_triton_scaled_mm.py`.
|
||||
"""
|
||||
import importlib
|
||||
from typing import Optional, Type
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
device = "cuda"
|
||||
|
||||
|
||||
def scaled_mm_torch(a: torch.Tensor,
|
||||
b: torch.Tensor,
|
||||
scale_a: torch.Tensor,
|
||||
scale_b: torch.Tensor,
|
||||
out_dtype: Type[torch.dtype],
|
||||
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
|
||||
out = torch.mm(a.to(torch.float32), b.to(torch.float32))
|
||||
out = scale_a * out
|
||||
out = scale_b.T * out
|
||||
out = out.to(out_dtype)
|
||||
if bias is not None:
|
||||
out = out + bias
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def get_8bit_types():
|
||||
types = [torch.int8]
|
||||
supports_fp8 = current_platform.has_device_capability(89)
|
||||
if current_platform.is_rocm() and supports_fp8:
|
||||
types.append(torch.float8_e4m3fnuz)
|
||||
elif current_platform.is_cuda() and supports_fp8:
|
||||
types.append(torch.float8_e4m3fn)
|
||||
return types
|
||||
|
||||
|
||||
@pytest.mark.parametrize("M", [1, 33, 64, 512])
|
||||
@pytest.mark.parametrize("N", [256, 971, 20486])
|
||||
@pytest.mark.parametrize("K", [128, 496, 1024])
|
||||
@pytest.mark.parametrize("out_dtype", [torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("in_dtype", get_8bit_types())
|
||||
@pytest.mark.parametrize("use_scalar_scale_a", [True, False])
|
||||
@pytest.mark.parametrize("use_scalar_scale_b", [True, False])
|
||||
@pytest.mark.parametrize("use_bias", [True, False])
|
||||
def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
|
||||
use_scalar_scale_b, use_bias):
|
||||
is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t
|
||||
).is_floating_point()
|
||||
|
||||
current_platform.seed_everything(0)
|
||||
|
||||
# NOTE: There are cases, where if the matrix is large enough, an output
|
||||
# like 65504.4 can be produced, and can easily turn into inf when
|
||||
# multiplied when using float16/bfloat16. This means one function, e.g.,
|
||||
# testing function, and another function, e.g. golden function, can
|
||||
# produce a non-inf value while the other produces an inf value, and
|
||||
# will cause assert_close/allclose to fail, even though if overflow
|
||||
# wouldn't have occurred, the values would have been "close."
|
||||
#
|
||||
# So, the values here are kept small enough to avoid this situation.
|
||||
if is_floating_point_type(in_dtype):
|
||||
a = (0.25 * torch.rand(
|
||||
(M, K), dtype=torch.float32, device=device)).to(in_dtype)
|
||||
b = (0.25 * torch.rand(
|
||||
(K, N), dtype=torch.float32, device=device)).to(in_dtype)
|
||||
else:
|
||||
a = torch.randint(-32, 32, (M, K), dtype=in_dtype, device=device)
|
||||
b = torch.randint(-32, 32, (K, N), dtype=in_dtype, device=device)
|
||||
|
||||
if use_scalar_scale_a:
|
||||
scale_a = torch.rand((1, 1), device=device)
|
||||
else:
|
||||
scale_a = 0.25 * torch.rand((M, 1), device=device)
|
||||
|
||||
if use_scalar_scale_b:
|
||||
scale_b = torch.rand((1, 1), device=device)
|
||||
else:
|
||||
scale_b = 0.25 * torch.rand((N, 1), device=device)
|
||||
|
||||
bias = None
|
||||
if use_bias:
|
||||
bias = torch.rand((N, ), device=device, dtype=out_dtype)
|
||||
|
||||
triton_scaled_mm_module = importlib.import_module(
|
||||
"vllm.model_executor.layers.quantization.compressed_tensors."
|
||||
"triton_scaled_mm")
|
||||
triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
|
||||
|
||||
c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
|
||||
|
||||
a_cpu = a.cpu()
|
||||
b_cpu = b.cpu()
|
||||
scale_a_cpu = scale_a.cpu()
|
||||
scale_b_cpu = scale_b.cpu()
|
||||
bias_cpu = None if bias is None else bias.cpu()
|
||||
|
||||
c_actual = scaled_mm_torch(a_cpu, b_cpu, scale_a_cpu, scale_b_cpu,
|
||||
out_dtype, bias_cpu)
|
||||
|
||||
c_check_cpu = c_check.cpu()
|
||||
torch.testing.assert_close(c_check_cpu, c_actual, rtol=1e-1, atol=1e-1)
|
||||
24
vllm-v0.6.2/tests/kernels/test_utils.py
Normal file
24
vllm-v0.6.2/tests/kernels/test_utils.py
Normal file
@@ -0,0 +1,24 @@
|
||||
"""
|
||||
Tests for miscellaneous utilities
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.utils import opcheck
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def test_convert_fp8_opcheck():
|
||||
data = torch.randn((256, 256), dtype=torch.float32, device="cuda")
|
||||
result = torch.empty_like(data, dtype=torch.float8_e4m3fn)
|
||||
opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8"))
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda(),
|
||||
reason="Only supported for CUDA")
|
||||
def test_cuda_utils_opcheck():
|
||||
opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0))
|
||||
opcheck(
|
||||
torch.ops._C_cuda_utils.
|
||||
get_max_shared_memory_per_block_device_attribute, (0, ))
|
||||
1098
vllm-v0.6.2/tests/kernels/utils.py
Normal file
1098
vllm-v0.6.2/tests/kernels/utils.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user