forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
302
vllm-v0.6.2/benchmarks/kernels/benchmark_aqlm.py
Normal file
302
vllm-v0.6.2/benchmarks/kernels/benchmark_aqlm.py
Normal file
@@ -0,0 +1,302 @@
|
||||
import os
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.aqlm import (
|
||||
dequantize_weight, generic_dequantize_gemm, get_int_dtype,
|
||||
optimized_dequantize_gemm)
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
||||
|
||||
|
||||
def torch_mult(
|
||||
input: torch.Tensor, # [..., in_features]
|
||||
weights: torch.Tensor,
|
||||
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
|
||||
) -> torch.Tensor:
|
||||
output = F.linear(input, weights)
|
||||
return output
|
||||
|
||||
|
||||
def dequant_out_scale(
|
||||
input: torch.Tensor, # [..., in_features]
|
||||
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
|
||||
codebooks: torch.
|
||||
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
|
||||
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
|
||||
output_partition_sizes: torch.IntTensor,
|
||||
bias: Optional[torch.Tensor],
|
||||
) -> torch.Tensor:
|
||||
|
||||
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
|
||||
|
||||
if bias is None:
|
||||
output = F.linear(input, weights, bias)
|
||||
orig_shape = output.shape
|
||||
flattened_output = output.view(-1, output.size(-1))
|
||||
f_scales = scales.view(-1, scales.shape[0])
|
||||
b_scales = f_scales.expand(flattened_output.shape[0], -1)
|
||||
flattened_output *= b_scales
|
||||
return flattened_output.view(orig_shape)
|
||||
else:
|
||||
b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
|
||||
-1, weights.shape[1])
|
||||
weights *= b_scales
|
||||
return F.linear(input, weights, bias)
|
||||
|
||||
|
||||
def dequant_weight_scale(
|
||||
input: torch.Tensor, # [..., in_features]
|
||||
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
|
||||
codebooks: torch.
|
||||
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
|
||||
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
|
||||
output_partition_sizes: torch.IntTensor,
|
||||
bias: Optional[torch.Tensor],
|
||||
) -> torch.Tensor:
|
||||
|
||||
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
|
||||
|
||||
b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
|
||||
-1, weights.shape[1])
|
||||
weights *= b_scales
|
||||
return F.linear(input, weights, bias)
|
||||
|
||||
|
||||
def dequant_no_scale(
|
||||
input: torch.Tensor, # [..., in_features]
|
||||
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
|
||||
codebooks: torch.
|
||||
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
|
||||
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
|
||||
output_partition_sizes: torch.IntTensor,
|
||||
bias: Optional[torch.Tensor],
|
||||
) -> torch.Tensor:
|
||||
|
||||
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
|
||||
|
||||
return F.linear(input, weights, bias)
|
||||
|
||||
|
||||
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
|
||||
# the generic pytorch version.
|
||||
# Just visual comparison.
|
||||
def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
|
||||
|
||||
n = int(parts.sum().item())
|
||||
|
||||
device = torch.device('cuda:0')
|
||||
|
||||
code_range = (1 << bits) // 2
|
||||
ingroups = 8
|
||||
|
||||
codes = torch.randint(-code_range,
|
||||
code_range,
|
||||
size=(n, k // ingroups, nbooks),
|
||||
dtype=get_int_dtype(bits),
|
||||
device=device)
|
||||
|
||||
codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
|
||||
dtype=torch.float16,
|
||||
device=device)
|
||||
|
||||
count = 0
|
||||
for index in range(16):
|
||||
for i in range(8):
|
||||
for book in range(nbooks):
|
||||
codebooks[book, index, 0, i] = count * (10**book)
|
||||
count += 1
|
||||
|
||||
print("codes shape", codes.shape)
|
||||
|
||||
for i in range(16):
|
||||
for book in range(nbooks):
|
||||
codes[0, i, book] = i
|
||||
codes[0, -i, book] = i
|
||||
|
||||
weights = dequantize_weight(codes, codebooks, None)
|
||||
weights2 = ops.aqlm_dequant(codes, codebooks, parts)
|
||||
|
||||
print("weights shape:", weights.shape)
|
||||
print("weights2 shape:", weights2.shape)
|
||||
|
||||
print("weights are:", weights)
|
||||
print("weights2 are:", weights2)
|
||||
|
||||
print("first 128 weights are", weights[0, 0:128].to(torch.int32))
|
||||
print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
|
||||
|
||||
print("last 128 weights are", weights[0, -128:])
|
||||
print("last 128 weights2 are:", weights2[0, -128:])
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
|
||||
|
||||
# Add arguments
|
||||
parser.add_argument("--nbooks",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of codebooks (default: 1)")
|
||||
parser.add_argument("--bits",
|
||||
type=int,
|
||||
default=16,
|
||||
help="Number of bits per code element (default: 16)")
|
||||
parser.add_argument(
|
||||
"--test",
|
||||
type=bool,
|
||||
default=False,
|
||||
help="Run the decompression/dequant tester rather than benchmarking "
|
||||
"(default: False)")
|
||||
|
||||
# Parse the arguments
|
||||
args = parser.parse_args()
|
||||
|
||||
# Extract values
|
||||
nbooks = args.nbooks
|
||||
bits = args.bits
|
||||
|
||||
if args.test:
|
||||
dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
|
||||
return
|
||||
|
||||
# Otherwise, benchmark.
|
||||
methods = [
|
||||
ops.aqlm_gemm,
|
||||
dequant_out_scale,
|
||||
generic_dequantize_gemm,
|
||||
optimized_dequantize_gemm,
|
||||
dequant_weight_scale,
|
||||
torch_mult,
|
||||
dequant_no_scale,
|
||||
]
|
||||
|
||||
filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
|
||||
print(f"writing benchmarks to file {filename}")
|
||||
with open(filename, "w") as f:
|
||||
sys.stdout = f
|
||||
|
||||
print('m | k | n | n parts', end='')
|
||||
for method in methods:
|
||||
print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
|
||||
print('')
|
||||
|
||||
# These are reasonable prefill sizes.
|
||||
ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
|
||||
(4096, (11008, 11008)), (11008, (4096, )))
|
||||
|
||||
# reasonable ranges for m.
|
||||
for m in [
|
||||
1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
|
||||
128, 256, 512, 1024, 1536, 2048, 3072, 4096
|
||||
]:
|
||||
print(f'{m}', file=sys.__stdout__)
|
||||
for ksp in ksandpartions:
|
||||
run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
|
||||
methods)
|
||||
|
||||
sys.stdout = sys.__stdout__
|
||||
|
||||
|
||||
def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
|
||||
methods):
|
||||
|
||||
# I didn't see visible improvements from increasing these, but feel free :)
|
||||
num_warmup_trials = 1
|
||||
num_trials = 1
|
||||
|
||||
num_calls = 100
|
||||
|
||||
# warmup.
|
||||
for method in methods:
|
||||
for _ in range(num_warmup_trials):
|
||||
run_timing(
|
||||
num_calls=num_calls,
|
||||
m=m,
|
||||
k=k,
|
||||
parts=parts,
|
||||
nbooks=nbooks,
|
||||
bits=bits,
|
||||
method=method,
|
||||
)
|
||||
|
||||
n = parts.sum().item()
|
||||
print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
|
||||
|
||||
for method in methods:
|
||||
best_time_us = 1e20
|
||||
for _ in range(num_trials):
|
||||
kernel_dur_ms = run_timing(
|
||||
num_calls=num_calls,
|
||||
m=m,
|
||||
k=k,
|
||||
parts=parts,
|
||||
nbooks=nbooks,
|
||||
bits=bits,
|
||||
method=method,
|
||||
)
|
||||
|
||||
kernel_dur_us = 1000 * kernel_dur_ms
|
||||
|
||||
if kernel_dur_us < best_time_us:
|
||||
best_time_us = kernel_dur_us
|
||||
|
||||
print(f' | {kernel_dur_us:.0f}', end='')
|
||||
|
||||
print('')
|
||||
|
||||
|
||||
def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
|
||||
nbooks: int, bits: int, method) -> float:
|
||||
|
||||
n = int(parts.sum().item())
|
||||
|
||||
device = torch.device('cuda:0')
|
||||
|
||||
input = torch.randn((1, m, k), dtype=torch.float16, device=device)
|
||||
|
||||
code_range = (1 << bits) // 2
|
||||
ingroups = 8
|
||||
|
||||
codes = torch.randint(-code_range,
|
||||
code_range,
|
||||
size=(n, k // ingroups, nbooks),
|
||||
dtype=get_int_dtype(bits),
|
||||
device=device)
|
||||
|
||||
codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
|
||||
dtype=torch.float16,
|
||||
device=device)
|
||||
|
||||
scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
|
||||
|
||||
# for comparison to just a pytorch mult.
|
||||
weights = torch.randn((n, k), dtype=torch.float16, device=device)
|
||||
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
start_event.record()
|
||||
|
||||
if method is torch_mult:
|
||||
for i in range(num_calls):
|
||||
torch_mult(input, weights, scales)
|
||||
else:
|
||||
for i in range(num_calls):
|
||||
method(input, codes, codebooks, scales, parts, None)
|
||||
|
||||
end_event.record()
|
||||
end_event.synchronize()
|
||||
|
||||
dur_ms = start_event.elapsed_time(end_event) / num_calls
|
||||
return dur_ms
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
86
vllm-v0.6.2/benchmarks/kernels/benchmark_layernorm.py
Normal file
86
vllm-v0.6.2/benchmarks/kernels/benchmark_layernorm.py
Normal file
@@ -0,0 +1,86 @@
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def main(num_tokens: int,
|
||||
hidden_size: int,
|
||||
add_residual: bool,
|
||||
dtype: torch.dtype,
|
||||
seed: int = 0,
|
||||
do_profile: bool = False,
|
||||
num_warmup_iters: int = 5,
|
||||
num_iters: int = 100) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
layer = RMSNorm(hidden_size).to(dtype=dtype)
|
||||
layer.weight.data.normal_(mean=1.0, std=0.1)
|
||||
scale = 1 / (2 * hidden_size)
|
||||
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
x *= scale
|
||||
residual = torch.randn_like(x) * scale if add_residual else None
|
||||
|
||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||
torch.cuda.synchronize()
|
||||
if profile:
|
||||
torch.cuda.cudart().cudaProfilerStart()
|
||||
start_time = time.perf_counter()
|
||||
|
||||
for _ in range(num_iters):
|
||||
layer(x, residual)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
end_time = time.perf_counter()
|
||||
if profile:
|
||||
torch.cuda.cudart().cudaProfilerStart()
|
||||
return (end_time - start_time) / num_iters
|
||||
|
||||
# Warmup.
|
||||
print("Warming up...")
|
||||
run_benchmark = run_cuda_benchmark
|
||||
run_benchmark(num_iters=num_warmup_iters, profile=False)
|
||||
|
||||
# Benchmark.
|
||||
if do_profile:
|
||||
latency = run_benchmark(num_iters=1, profile=True)
|
||||
else:
|
||||
latency = run_benchmark(num_iters=num_iters, profile=False)
|
||||
print(f"Kernel running time: {latency * 1000000:.3f} us")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the layernorm kernel.")
|
||||
parser.add_argument("--num-tokens", type=int, default=4096)
|
||||
parser.add_argument("--hidden-size", type=int, default=8192)
|
||||
parser.add_argument("--add-residual", action="store_true")
|
||||
parser.add_argument("--dtype",
|
||||
type=str,
|
||||
choices=["half", "bfloat16", "float"],
|
||||
default="half")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--profile", action="store_true")
|
||||
parser.add_argument("--num-warmup-iters", type=int, default=5)
|
||||
parser.add_argument("--num-iters",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Number of benchmark iterations. "
|
||||
"If --profile is set, this number is ignored")
|
||||
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
main(num_tokens=args.num_tokens,
|
||||
hidden_size=args.hidden_size,
|
||||
add_residual=args.add_residual,
|
||||
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
|
||||
seed=args.seed,
|
||||
do_profile=args.profile,
|
||||
num_warmup_iters=args.num_warmup_iters,
|
||||
num_iters=args.num_iters)
|
||||
420
vllm-v0.6.2/benchmarks/kernels/benchmark_machete.py
Normal file
420
vllm-v0.6.2/benchmarks/kernels/benchmark_machete.py
Normal file
@@ -0,0 +1,420 @@
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
import math
|
||||
import pickle as pkl
|
||||
import time
|
||||
from itertools import product
|
||||
from typing import Callable, Iterable, List, Optional, Tuple
|
||||
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
from torch.utils.benchmark import Measurement as TMeasurement
|
||||
from weight_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
||||
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
|
||||
MarlinWorkspace)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
gptq_pack, pack_rows, quantize_weights)
|
||||
from vllm.scalar_type import ScalarType, scalar_types
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
|
||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
|
||||
DEFAULT_TP_SIZES = [1]
|
||||
|
||||
|
||||
def machete_pack_weights(w_q: torch.tensor, wtype: ScalarType) -> torch.tensor:
|
||||
w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
|
||||
w_q = w_q.t().contiguous().t() # make col major
|
||||
return ops.machete_prepack_B(w_q, wtype)
|
||||
|
||||
|
||||
def make_bench_tensors(
|
||||
atype: torch.dtype, wtype: ScalarType, group_size: int, m: int, n: int,
|
||||
k: int
|
||||
) -> Tuple[torch.tensor, List[Tuple[torch.tensor, torch.tensor, torch.tensor,
|
||||
torch.tensor]]]:
|
||||
assert wtype.is_integer(), "TODO: support floating point weights"
|
||||
|
||||
# we want to make sure that weights don't fit into L2 cache between runs so
|
||||
# we construct enough weights to exceed L2 cache, which is 50mb on a H100
|
||||
# so we target total weight size > 2*50mb
|
||||
num_weights = math.ceil(2 * 50 * 1024**2 * 8 / (k * n * wtype.size_bits))
|
||||
|
||||
a = torch.randn((m, k), device="cuda", dtype=atype) * 5
|
||||
weights = [
|
||||
torch.randn((k, n), device="cuda", dtype=atype)
|
||||
for _ in range(num_weights)
|
||||
]
|
||||
quanitized_weights = [
|
||||
quantize_weights(w, wtype, group_size) for w in weights
|
||||
]
|
||||
|
||||
return a, quanitized_weights
|
||||
|
||||
|
||||
# impl
|
||||
|
||||
|
||||
# bench
|
||||
def bench_fn(label: str, sub_label: str, description: str,
|
||||
fn: Callable) -> TMeasurement:
|
||||
|
||||
min_run_time = 1
|
||||
return TBenchmark.Timer(
|
||||
stmt="fn()",
|
||||
globals={
|
||||
"fn": fn
|
||||
},
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description=description,
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
|
||||
|
||||
def loop_over_weights(
|
||||
a: torch.tensor, weights: List[Tuple[torch.tensor, torch.tensor,
|
||||
torch.tensor, torch.tensor]],
|
||||
fn: Callable[[torch.tensor, torch.tensor, torch.tensor, torch.tensor],
|
||||
None]):
|
||||
for w_ref, w_q, w_s, _ in weights:
|
||||
fn(a, w_ref, w_q, w_s)
|
||||
|
||||
|
||||
_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
|
||||
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
|
||||
|
||||
|
||||
def bench(atype: torch.dtype,
|
||||
wtype: ScalarType,
|
||||
group_size: int,
|
||||
m: int,
|
||||
k: int,
|
||||
n: int,
|
||||
label: str,
|
||||
sub_label: str,
|
||||
benchmark_marlinv1: bool = True,
|
||||
sweep_schedules: bool = True) -> Iterable[TMeasurement]:
|
||||
global _SWEEP_SCHEDULES_RESULTS
|
||||
|
||||
a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
|
||||
sub_label += f", L={len(weights)}"
|
||||
|
||||
weights_machete = [(w_ref, machete_pack_weights(w_q, wtype), w_s, w_zp)
|
||||
for w_ref, w_q, w_s, w_zp in weights]
|
||||
|
||||
timers = []
|
||||
# pytorch impl
|
||||
timers.append(
|
||||
bench_fn(
|
||||
label, sub_label, "torch.matmul", lambda: loop_over_weights(
|
||||
a,
|
||||
weights,
|
||||
lambda a, w_ref, w_q, w_s: torch.matmul(a, w_ref),
|
||||
)))
|
||||
|
||||
if benchmark_marlinv1:
|
||||
w_ref = weights[0][0]
|
||||
|
||||
w_zp_empty = torch.empty(0, dtype=torch.int, device=w_ref.device)
|
||||
sort_indices = torch.empty(0, dtype=torch.int, device=w_ref.device)
|
||||
g_idx = torch.empty(0, dtype=torch.int, device=w_ref.device)
|
||||
|
||||
def marlinv1_pack_weights(w_q: torch.tensor) -> torch.tensor:
|
||||
w_q_gptq = gptq_pack(w_q, wtype.size_bits, *w_ref.shape)
|
||||
return ops.gptq_marlin_repack(w_q_gptq, sort_indices, *w_ref.shape,
|
||||
wtype.size_bits)
|
||||
|
||||
def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor:
|
||||
return marlin_permute_scales(w_s, *w_ref.shape, group_size)
|
||||
|
||||
weights_marlinv1 = [(w_ref, marlinv1_pack_weights(w_q),
|
||||
marlinv1_permute_scales(w_s), w_zp)
|
||||
for w_ref, w_q, w_s, w_zp in weights]
|
||||
|
||||
workspace = MarlinWorkspace(w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_MAX_PARALLEL)
|
||||
|
||||
# marlinv1
|
||||
timers.append(
|
||||
bench_fn(
|
||||
label, sub_label, "marlin_orig", lambda: loop_over_weights(
|
||||
a, weights_marlinv1, lambda a, w_ref, w_q, w_s: ops.
|
||||
gptq_marlin_gemm(a,
|
||||
w_q,
|
||||
w_s,
|
||||
w_zp_empty,
|
||||
g_idx,
|
||||
sort_indices,
|
||||
workspace.scratch,
|
||||
wtype,
|
||||
size_m=a.shape[0],
|
||||
size_n=w_ref.shape[1],
|
||||
size_k=w_ref.shape[0],
|
||||
is_k_full=True))))
|
||||
|
||||
# machete
|
||||
timers.append(
|
||||
bench_fn(
|
||||
label, sub_label, "machete_heuristic", lambda: loop_over_weights(
|
||||
a, weights_machete, lambda a, _, w_q, w_s: ops.machete_gemm(
|
||||
a, w_q, wtype, b_scales=w_s, b_group_size=group_size))))
|
||||
|
||||
if sweep_schedules:
|
||||
print("Finding best schedule for machete")
|
||||
best = None
|
||||
best_schedule = None
|
||||
schedules = ops.machete_supported_schedules(wtype)
|
||||
for schedule in reversed(schedules):
|
||||
schedule_M = int(schedule.split("_")[0].split("x")[1])
|
||||
|
||||
# Prune known bad schedules
|
||||
if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4:
|
||||
continue
|
||||
|
||||
def run(a, _, w_q, w_s, schedule=schedule):
|
||||
ops.machete_gemm(a,
|
||||
w_q,
|
||||
wtype,
|
||||
w_s,
|
||||
b_group_size=group_size,
|
||||
schedule=schedule)
|
||||
|
||||
res = bench_fn(label, sub_label, "machete_best",
|
||||
lambda: loop_over_weights(a, weights_machete, run))
|
||||
|
||||
results_row = {
|
||||
"M": m,
|
||||
"K": k,
|
||||
"N": n,
|
||||
"group_size": group_size,
|
||||
"schedule": schedule,
|
||||
"median": res.median,
|
||||
}
|
||||
if _SWEEP_SCHEDULES_RESULTS is None:
|
||||
_SWEEP_SCHEDULES_RESULTS = pd.DataFrame(
|
||||
columns=results_row.keys())
|
||||
_SWEEP_SCHEDULES_RESULTS.\
|
||||
loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row
|
||||
|
||||
print(f" {res.median:5.5} ", schedule)
|
||||
if not best or res.median < best.median:
|
||||
best = res
|
||||
best_schedule = schedule
|
||||
print("Best schedule:", best_schedule)
|
||||
timers.append(best)
|
||||
|
||||
return timers
|
||||
|
||||
|
||||
# runner
|
||||
def print_timers(timers: Iterable[TMeasurement]):
|
||||
compare = TBenchmark.Compare(timers)
|
||||
compare.print()
|
||||
|
||||
|
||||
def run(dtype: torch.dtype, sweep_schedules: bool,
|
||||
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
|
||||
|
||||
results = []
|
||||
for m, k, n in MKNs:
|
||||
timers = bench(dtype,
|
||||
scalar_types.uint4b8,
|
||||
128,
|
||||
m,
|
||||
k,
|
||||
n,
|
||||
f"{dtype}-gemm",
|
||||
f"MKN=({m}x{k}x{n})",
|
||||
sweep_schedules=sweep_schedules)
|
||||
print_timers(timers)
|
||||
results.extend(timers)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
# output makers
|
||||
def make_output(
|
||||
data: Iterable[TMeasurement],
|
||||
MKNs: Iterable[Tuple[int, int, int]],
|
||||
base_description: str,
|
||||
timestamp=None,
|
||||
):
|
||||
|
||||
print(f"== All Results {base_description} ====")
|
||||
print_timers(data)
|
||||
|
||||
# pickle all the results
|
||||
timestamp = int(time.time()) if timestamp is None else timestamp
|
||||
with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
|
||||
pkl.dump(data, f)
|
||||
|
||||
|
||||
# argparse runners
|
||||
|
||||
|
||||
def run_square_bench(args):
|
||||
dim_sizes = list(
|
||||
range(args.dim_start, args.dim_end + 1, args.dim_increment))
|
||||
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
|
||||
|
||||
data = run(args.dtype, args.sweep_schedules, MKNs)
|
||||
|
||||
make_output(data, MKNs, f"square_bench-{args.dtype}")
|
||||
|
||||
|
||||
def run_range_bench(args):
|
||||
m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
|
||||
m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
|
||||
m_increment, k_increment, n_increment = \
|
||||
(int(x) for x in args.dim_increment.split(","))
|
||||
Ms = list(range(m_start, m_end + 1, m_increment))
|
||||
Ks = list(range(k_start, k_end + 1, k_increment))
|
||||
Ns = list(range(n_start, n_end + 1, n_increment))
|
||||
MKNs = list(product(Ms, Ks, Ns))
|
||||
|
||||
data = run(args.dtype, args.sweep_schedules, MKNs)
|
||||
|
||||
make_output(data, MKNs, f"range_bench-{args.dtype}")
|
||||
|
||||
|
||||
def run_model_bench(args):
|
||||
|
||||
print("Benchmarking models:")
|
||||
for i, model in enumerate(args.models):
|
||||
print(f"[{i}] {model}")
|
||||
|
||||
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
|
||||
KNs = []
|
||||
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
|
||||
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
|
||||
KNs.append(KN)
|
||||
return KNs
|
||||
|
||||
model_bench_data = []
|
||||
models_tps = list(itertools.product(args.models, args.tp_sizes))
|
||||
for model, tp_size in models_tps:
|
||||
Ms = args.batch_sizes
|
||||
KNs = model_shapes(model, tp_size)
|
||||
MKNs = []
|
||||
for m in Ms:
|
||||
for k, n in KNs:
|
||||
MKNs.append((m, k, n))
|
||||
|
||||
data = run(args.dtype, args.sweep_schedules, MKNs)
|
||||
model_bench_data.append(data)
|
||||
|
||||
# Print all results
|
||||
for data, model_tp in zip(model_bench_data, models_tps):
|
||||
model, tp_size = model_tp
|
||||
print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
|
||||
print_timers(data)
|
||||
|
||||
timestamp = int(time.time())
|
||||
|
||||
all_data = []
|
||||
for d in model_bench_data:
|
||||
all_data.extend(d)
|
||||
# pickle all data
|
||||
with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
|
||||
pkl.dump(all_data, f)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
def to_torch_dtype(dt):
|
||||
if dt == "bfloat16":
|
||||
return torch.bfloat16
|
||||
if dt == "float16":
|
||||
return torch.float16
|
||||
raise ValueError("unsupported dtype")
|
||||
|
||||
parser = FlexibleArgumentParser(
|
||||
description="""
|
||||
Benchmark Machete GEMM.
|
||||
|
||||
To run square GEMMs:
|
||||
python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
|
||||
|
||||
To run constant N and K and sweep M:
|
||||
python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
|
||||
|
||||
To run dimensions from a model:
|
||||
python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
|
||||
|
||||
Output:
|
||||
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
|
||||
""", # noqa: E501
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--dtype",
|
||||
type=to_torch_dtype,
|
||||
required=True,
|
||||
help="Available options are ['bfloat16', 'float16']",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sweep-schedules",
|
||||
action="store_true",
|
||||
help="Run a sweep over all supported schedules",
|
||||
)
|
||||
parser.add_argument("--sweep-csv-out",
|
||||
help="CSV to store sweep results",
|
||||
default="sch_sweep_results.csv")
|
||||
subparsers = parser.add_subparsers(dest="cmd", required=True)
|
||||
|
||||
square_parser = subparsers.add_parser("square_bench")
|
||||
square_parser.add_argument("--dim-start", type=int, required=True)
|
||||
square_parser.add_argument("--dim-end", type=int, required=True)
|
||||
square_parser.add_argument("--dim-increment", type=int, required=True)
|
||||
square_parser.set_defaults(func=run_square_bench)
|
||||
|
||||
range_parser = subparsers.add_parser("range_bench")
|
||||
range_parser.add_argument(
|
||||
"--dim-start",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Start value for M,K,N as common separated list")
|
||||
range_parser.add_argument(
|
||||
"--dim-end",
|
||||
type=str,
|
||||
required=True,
|
||||
help="End value (inclusive) for M,K,N as common separated list")
|
||||
range_parser.add_argument(
|
||||
"--dim-increment",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Increment value for M,K,N as common separated list")
|
||||
range_parser.set_defaults(func=run_range_bench)
|
||||
|
||||
model_parser = subparsers.add_parser("model_bench")
|
||||
model_parser.add_argument(
|
||||
"--models",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=DEFAULT_MODELS,
|
||||
choices=WEIGHT_SHAPES.keys(),
|
||||
)
|
||||
model_parser.add_argument("--tp-sizes",
|
||||
nargs="+",
|
||||
type=int,
|
||||
default=DEFAULT_TP_SIZES)
|
||||
model_parser.add_argument("--batch-sizes",
|
||||
nargs="+",
|
||||
type=int,
|
||||
default=DEFAULT_BATCH_SIZES)
|
||||
model_parser.set_defaults(func=run_model_bench)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
_SWEEP_SCHEDULES_RESULTS_CSV = args.sweep_csv_out
|
||||
args.func(args)
|
||||
|
||||
if _SWEEP_SCHEDULES_RESULTS is not None:
|
||||
_SWEEP_SCHEDULES_RESULTS.to_csv(_SWEEP_SCHEDULES_RESULTS_CSV)
|
||||
254
vllm-v0.6.2/benchmarks/kernels/benchmark_marlin.py
Normal file
254
vllm-v0.6.2/benchmarks/kernels/benchmark_marlin.py
Normal file
@@ -0,0 +1,254 @@
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as benchmark
|
||||
from benchmark_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
|
||||
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
||||
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
|
||||
MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
|
||||
MarlinWorkspace, marlin_quantize)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
|
||||
marlin_24_quantize)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
gptq_pack, gptq_quantize_weights, sort_weights)
|
||||
from vllm.scalar_type import ScalarType
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
|
||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
||||
|
||||
ACT_ORDER_OPTS = [False, True]
|
||||
K_FULL_OPTS = [False, True]
|
||||
|
||||
|
||||
def bench_run(results: List[benchmark.Measurement], model: str,
|
||||
act_order: bool, is_k_full: bool, quant_type: ScalarType,
|
||||
group_size: int, size_m: int, size_k: int, size_n: int):
|
||||
label = "Quant Matmul"
|
||||
|
||||
sub_label = ("{}, act={} k_full={}, q={}, g={}, "
|
||||
"MKN=({}x{}x{})".format(model, act_order, is_k_full,
|
||||
str(quant_type), group_size, size_m,
|
||||
size_k, size_n))
|
||||
|
||||
print(f"Testing: {sub_label}")
|
||||
|
||||
a = torch.randn(size_m, size_k).to(torch.half).cuda()
|
||||
b = torch.rand(size_k, size_n).to(torch.half).cuda()
|
||||
|
||||
a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda())
|
||||
|
||||
# Marlin quant
|
||||
(
|
||||
marlin_w_ref,
|
||||
marlin_q_w,
|
||||
marlin_s,
|
||||
marlin_g_idx,
|
||||
marlin_sort_indices,
|
||||
marlin_rand_perm,
|
||||
) = marlin_quantize(b, quant_type, group_size, act_order)
|
||||
|
||||
# Marlin_24 quant
|
||||
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
|
||||
marlin_24_s) = marlin_24_quantize(b, quant_type, group_size)
|
||||
|
||||
marlin_zp = torch.empty(0, dtype=torch.int, device=b.device)
|
||||
|
||||
# GPTQ quant
|
||||
(w_ref, q_w, s, g_idx,
|
||||
rand_perm) = gptq_quantize_weights(b, quant_type, group_size, act_order)
|
||||
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
|
||||
|
||||
# For act_order, sort the "weights" and "g_idx"
|
||||
# so that group ids are increasing
|
||||
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
|
||||
if act_order:
|
||||
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
|
||||
|
||||
# Prepare
|
||||
marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_MAX_PARALLEL)
|
||||
|
||||
marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_24_MAX_PARALLEL)
|
||||
marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)
|
||||
|
||||
globals = {
|
||||
# Gen params
|
||||
"quant_type": quant_type,
|
||||
"group_size": group_size,
|
||||
"size_m": size_m,
|
||||
"size_n": size_n,
|
||||
"size_k": size_k,
|
||||
"a": a,
|
||||
"a_tmp": a_tmp,
|
||||
# Marlin params
|
||||
"marlin_w_ref": marlin_w_ref,
|
||||
"marlin_q_w": marlin_q_w,
|
||||
"marlin_s": marlin_s,
|
||||
"marlin_zp": marlin_zp,
|
||||
"marlin_g_idx": marlin_g_idx,
|
||||
"marlin_sort_indices": marlin_sort_indices,
|
||||
"marlin_rand_perm": marlin_rand_perm,
|
||||
"marlin_workspace": marlin_workspace,
|
||||
"is_k_full": is_k_full,
|
||||
# Marlin_24 params
|
||||
"marlin_24_w_ref": marlin_24_w_ref,
|
||||
"marlin_24_q_w_comp": marlin_24_q_w_comp,
|
||||
"marlin_24_meta": marlin_24_meta,
|
||||
"marlin_24_s": marlin_24_s,
|
||||
"marlin_24_workspace": marlin_24_workspace,
|
||||
# GPTQ params
|
||||
"q_w_gptq": q_w_gptq,
|
||||
"repack_sort_indices": repack_sort_indices,
|
||||
# Kernels
|
||||
"gptq_marlin_gemm": ops.gptq_marlin_gemm,
|
||||
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
|
||||
"gptq_marlin_repack": ops.gptq_marlin_repack,
|
||||
}
|
||||
|
||||
min_run_time = 1
|
||||
|
||||
# Warmup pytorch
|
||||
for i in range(5):
|
||||
torch.matmul(a, marlin_w_ref)
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="torch.matmul(a, marlin_w_ref)",
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="pytorch_gemm",
|
||||
).blocked_autorange(min_run_time=min_run_time))
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt=
|
||||
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_gemm_fp16",
|
||||
).blocked_autorange(min_run_time=min_run_time))
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt=
|
||||
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_gemm_fp32",
|
||||
).blocked_autorange(min_run_time=min_run_time))
|
||||
|
||||
if (quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
|
||||
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt=
|
||||
"output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_24_gemm",
|
||||
).blocked_autorange(min_run_time=min_run_time))
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt=
|
||||
"q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_repack",
|
||||
).blocked_autorange(min_run_time=min_run_time))
|
||||
|
||||
|
||||
def main(args):
|
||||
print("Benchmarking models:")
|
||||
for i, model in enumerate(args.models):
|
||||
print(f"[{i}] {model}")
|
||||
|
||||
results: List[benchmark.Measurement] = []
|
||||
|
||||
for model in args.models:
|
||||
for layer in WEIGHT_SHAPES[model]:
|
||||
size_k = layer[0]
|
||||
size_n = layer[1]
|
||||
|
||||
if len(args.limit_k) > 0 and size_k not in args.limit_k:
|
||||
continue
|
||||
|
||||
if len(args.limit_n) > 0 and size_n not in args.limit_n:
|
||||
continue
|
||||
|
||||
for act_order in ACT_ORDER_OPTS:
|
||||
if len(args.limit_act_order
|
||||
) > 0 and act_order not in args.limit_act_order:
|
||||
continue
|
||||
|
||||
for is_k_full in K_FULL_OPTS:
|
||||
if len(args.limit_k_full
|
||||
) > 0 and is_k_full not in args.limit_k_full:
|
||||
continue
|
||||
|
||||
for quant_type in query_marlin_supported_quant_types(
|
||||
False):
|
||||
if len(args.limit_num_bits) > 0 and \
|
||||
quant_type.size_bits not in args.limit_num_bits:
|
||||
continue
|
||||
|
||||
for group_size in MARLIN_SUPPORTED_GROUP_SIZES:
|
||||
if len(
|
||||
args.limit_group_size
|
||||
) > 0 and group_size not in args.limit_group_size:
|
||||
continue
|
||||
|
||||
# For act_order, the group_size must be less than
|
||||
# size_k
|
||||
if act_order and (group_size == size_k
|
||||
or group_size == -1):
|
||||
continue
|
||||
|
||||
for size_m in args.batch_sizes:
|
||||
bench_run(results, model, act_order, is_k_full,
|
||||
quant_type, group_size, size_m,
|
||||
size_k, size_n)
|
||||
|
||||
compare = benchmark.Compare(results)
|
||||
compare.print()
|
||||
|
||||
|
||||
# For quick benchmarking use:
|
||||
# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
|
||||
#
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark Marlin across specified models/shapes/batches")
|
||||
parser.add_argument(
|
||||
"--models",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=DEFAULT_MODELS,
|
||||
choices=WEIGHT_SHAPES.keys(),
|
||||
)
|
||||
parser.add_argument("--batch-sizes",
|
||||
nargs="+",
|
||||
type=int,
|
||||
default=DEFAULT_BATCH_SIZES)
|
||||
parser.add_argument("--limit-k", nargs="+", type=int, default=[])
|
||||
parser.add_argument("--limit-n", nargs="+", type=int, default=[])
|
||||
parser.add_argument("--limit-group-size", nargs="+", type=int, default=[])
|
||||
parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[])
|
||||
parser.add_argument("--limit-act-order", nargs="+", type=int, default=[])
|
||||
parser.add_argument("--limit-k-full", nargs="+", type=int, default=[])
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
367
vllm-v0.6.2/benchmarks/kernels/benchmark_moe.py
Normal file
367
vllm-v0.6.2/benchmarks/kernels/benchmark_moe.py
Normal file
@@ -0,0 +1,367 @@
|
||||
import argparse
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Tuple, TypedDict
|
||||
|
||||
import ray
|
||||
import torch
|
||||
import triton
|
||||
from ray.experimental.tqdm_ray import tqdm
|
||||
from transformers import AutoConfig
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
class BenchmarkConfig(TypedDict):
|
||||
BLOCK_SIZE_M: int
|
||||
BLOCK_SIZE_N: int
|
||||
BLOCK_SIZE_K: int
|
||||
GROUP_SIZE_M: int
|
||||
num_warps: int
|
||||
num_stages: int
|
||||
|
||||
|
||||
def benchmark_config(
|
||||
config: BenchmarkConfig,
|
||||
num_tokens: int,
|
||||
num_experts: int,
|
||||
shard_intermediate_size: int,
|
||||
hidden_size: int,
|
||||
topk: int,
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
num_iters: int = 100,
|
||||
) -> float:
|
||||
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
|
||||
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
if use_int8_w8a16:
|
||||
w1 = torch.randint(-127,
|
||||
127, (
|
||||
num_experts,
|
||||
shard_intermediate_size,
|
||||
hidden_size,
|
||||
),
|
||||
dtype=torch.int8)
|
||||
w2 = torch.randint(-127,
|
||||
127, (
|
||||
num_experts,
|
||||
hidden_size,
|
||||
shard_intermediate_size // 2,
|
||||
),
|
||||
dtype=torch.int8)
|
||||
else:
|
||||
w1 = torch.randn(num_experts,
|
||||
shard_intermediate_size,
|
||||
hidden_size,
|
||||
dtype=init_dtype)
|
||||
w2 = torch.randn(num_experts,
|
||||
hidden_size,
|
||||
shard_intermediate_size // 2,
|
||||
dtype=init_dtype)
|
||||
gating_output = torch.randn(num_iters,
|
||||
num_tokens,
|
||||
num_experts,
|
||||
dtype=torch.float32)
|
||||
|
||||
w1_scale = None
|
||||
w2_scale = None
|
||||
a1_scale = None
|
||||
a2_scale = None
|
||||
if use_int8_w8a16:
|
||||
w1_scale = torch.randn((num_experts, 2 * shard_intermediate_size),
|
||||
dtype=torch.float32)
|
||||
w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
|
||||
if use_fp8_w8a8:
|
||||
w1_scale = torch.randn(num_experts, dtype=torch.float32)
|
||||
w2_scale = torch.randn(num_experts, dtype=torch.float32)
|
||||
a1_scale = torch.randn(1, dtype=torch.float32)
|
||||
a2_scale = torch.randn(1, dtype=torch.float32)
|
||||
|
||||
w1 = w1.to(torch.float8_e4m3fn)
|
||||
w2 = w2.to(torch.float8_e4m3fn)
|
||||
|
||||
input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
|
||||
|
||||
def prepare(i: int):
|
||||
input_gating.copy_(gating_output[i])
|
||||
|
||||
def run():
|
||||
from vllm.model_executor.layers.fused_moe import override_config
|
||||
with override_config(config):
|
||||
fused_moe(
|
||||
x,
|
||||
w1,
|
||||
w2,
|
||||
input_gating,
|
||||
topk,
|
||||
renormalize=True,
|
||||
inplace=True,
|
||||
use_fp8_w8a8=use_fp8_w8a8,
|
||||
use_int8_w8a16=use_int8_w8a16,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
run()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Capture 10 invocations with CUDA graph
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph):
|
||||
for _ in range(10):
|
||||
run()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Warmup
|
||||
for _ in range(5):
|
||||
graph.replay()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
latencies: List[float] = []
|
||||
for i in range(num_iters):
|
||||
prepare(i)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
start_event.record()
|
||||
graph.replay()
|
||||
end_event.record()
|
||||
end_event.synchronize()
|
||||
latencies.append(start_event.elapsed_time(end_event))
|
||||
avg = sum(latencies) / (num_iters * 10) * 1000 # us
|
||||
graph.reset()
|
||||
return avg
|
||||
|
||||
|
||||
def get_configs_compute_bound() -> List[Dict[str, int]]:
|
||||
# Reduced search space for faster tuning.
|
||||
# TODO(woosuk): Increase the search space and use a performance model to
|
||||
# prune the search space.
|
||||
configs: List[BenchmarkConfig] = []
|
||||
for num_stages in [2, 3, 4, 5]:
|
||||
for block_m in [16, 32, 64, 128, 256]:
|
||||
for block_k in [64, 128, 256]:
|
||||
for block_n in [32, 64, 128, 256]:
|
||||
for num_warps in [4, 8]:
|
||||
for group_size in [1, 16, 32, 64]:
|
||||
configs.append({
|
||||
"BLOCK_SIZE_M": block_m,
|
||||
"BLOCK_SIZE_N": block_n,
|
||||
"BLOCK_SIZE_K": block_k,
|
||||
"GROUP_SIZE_M": group_size,
|
||||
"num_warps": num_warps,
|
||||
"num_stages": num_stages,
|
||||
})
|
||||
return configs
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
class BenchmarkWorker:
|
||||
|
||||
def __init__(self, seed: int) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(seed)
|
||||
self.seed = seed
|
||||
|
||||
def benchmark(
|
||||
self,
|
||||
num_tokens: int,
|
||||
num_experts: int,
|
||||
shard_intermediate_size: int,
|
||||
hidden_size: int,
|
||||
topk: int,
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
) -> Tuple[Dict[str, int], float]:
|
||||
current_platform.seed_everything(self.seed)
|
||||
dtype_str = get_config_dtype_str(dtype,
|
||||
use_int8_w8a16=use_int8_w8a16,
|
||||
use_fp8_w8a8=use_fp8_w8a8)
|
||||
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
|
||||
# is the intermediate size after silu_and_mul.
|
||||
op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
|
||||
dtype_str)
|
||||
if op_config is None:
|
||||
config = get_default_config(num_tokens, num_experts,
|
||||
shard_intermediate_size, hidden_size,
|
||||
topk, dtype_str)
|
||||
else:
|
||||
config = op_config[min(op_config.keys(),
|
||||
key=lambda x: abs(x - num_tokens))]
|
||||
kernel_time = benchmark_config(config, num_tokens, num_experts,
|
||||
shard_intermediate_size, hidden_size,
|
||||
topk, dtype, use_fp8_w8a8,
|
||||
use_int8_w8a16)
|
||||
return config, kernel_time
|
||||
|
||||
def tune(
|
||||
self,
|
||||
num_tokens: int,
|
||||
num_experts: int,
|
||||
shard_intermediate_size: int,
|
||||
hidden_size: int,
|
||||
topk: int,
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
search_space: List[Dict[str, int]],
|
||||
) -> Dict[str, int]:
|
||||
best_config = None
|
||||
best_time = float("inf")
|
||||
for config in tqdm(search_space):
|
||||
try:
|
||||
kernel_time = benchmark_config(config,
|
||||
num_tokens,
|
||||
num_experts,
|
||||
shard_intermediate_size,
|
||||
hidden_size,
|
||||
topk,
|
||||
dtype,
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
num_iters=10)
|
||||
except triton.runtime.autotuner.OutOfResources:
|
||||
# Some configurations may be invalid and fail to compile.
|
||||
continue
|
||||
|
||||
if kernel_time < best_time:
|
||||
best_time = kernel_time
|
||||
best_config = config
|
||||
now = datetime.now()
|
||||
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
|
||||
assert best_config is not None
|
||||
return best_config
|
||||
|
||||
|
||||
def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
|
||||
return {
|
||||
"BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
|
||||
"BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
|
||||
"BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
|
||||
"GROUP_SIZE_M": config["GROUP_SIZE_M"],
|
||||
"num_warps": config["num_warps"],
|
||||
"num_stages": config["num_stages"],
|
||||
}
|
||||
|
||||
|
||||
def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
|
||||
shard_intermediate_size: int, hidden_size: int, topk: int,
|
||||
dtype: torch.dtype, use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool) -> None:
|
||||
dtype_str = get_config_dtype_str(dtype,
|
||||
use_int8_w8a16=use_int8_w8a16,
|
||||
use_fp8_w8a8=use_fp8_w8a8)
|
||||
|
||||
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
|
||||
# is the intermediate size after silu_and_mul.
|
||||
filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
|
||||
dtype_str)
|
||||
|
||||
print(f"Writing best config to {filename}...")
|
||||
with open(filename, "w") as f:
|
||||
json.dump(configs, f, indent=4)
|
||||
f.write("\n")
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
|
||||
config = AutoConfig.from_pretrained(args.model)
|
||||
if config.architectures[0] == "DbrxForCausalLM":
|
||||
E = config.ffn_config.moe_num_experts
|
||||
topk = config.ffn_config.moe_top_k
|
||||
intermediate_size = config.ffn_config.ffn_hidden_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
elif config.architectures[0] == "JambaForCausalLM":
|
||||
E = config.num_experts
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
else:
|
||||
# Default: Mixtral.
|
||||
E = config.num_local_experts
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
|
||||
hidden_size = config.hidden_size
|
||||
dtype = config.torch_dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
|
||||
if args.batch_size is None:
|
||||
batch_sizes = [
|
||||
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
|
||||
2048, 3072, 4096
|
||||
]
|
||||
else:
|
||||
batch_sizes = [args.batch_size]
|
||||
|
||||
ray.init()
|
||||
num_gpus = int(ray.available_resources()["GPU"])
|
||||
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
|
||||
|
||||
def _distribute(method: str, inputs: List[Any]) -> List[Any]:
|
||||
outputs = []
|
||||
worker_idx = 0
|
||||
for input_args in inputs:
|
||||
worker = workers[worker_idx]
|
||||
worker_method = getattr(worker, method)
|
||||
output = worker_method.remote(*input_args)
|
||||
outputs.append(output)
|
||||
worker_idx = (worker_idx + 1) % num_gpus
|
||||
return ray.get(outputs)
|
||||
|
||||
if args.tune:
|
||||
search_space = get_configs_compute_bound()
|
||||
print(f"Start tuning over {len(search_space)} configurations...")
|
||||
|
||||
start = time.time()
|
||||
configs = _distribute(
|
||||
"tune", [(batch_size, E, shard_intermediate_size, hidden_size,
|
||||
topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space)
|
||||
for batch_size in batch_sizes])
|
||||
best_configs = {
|
||||
M: sort_config(config)
|
||||
for M, config in zip(batch_sizes, configs)
|
||||
}
|
||||
save_configs(best_configs, E, shard_intermediate_size, hidden_size,
|
||||
topk, dtype, use_fp8_w8a8, use_int8_w8a16)
|
||||
end = time.time()
|
||||
print(f"Tuning took {end - start:.2f} seconds")
|
||||
else:
|
||||
outputs = _distribute(
|
||||
"benchmark", [(batch_size, E, shard_intermediate_size, hidden_size,
|
||||
topk, dtype, use_fp8_w8a8, use_int8_w8a16)
|
||||
for batch_size in batch_sizes])
|
||||
|
||||
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
|
||||
print(f"Batch size: {batch_size}, config: {config}")
|
||||
print(f"Kernel time: {kernel_time:.2f} us")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser()
|
||||
parser.add_argument("--model",
|
||||
type=str,
|
||||
default="mistralai/Mixtral-8x7B-Instruct-v0.1")
|
||||
parser.add_argument("--tp-size", "-tp", type=int, default=2)
|
||||
parser.add_argument("--dtype",
|
||||
type=str,
|
||||
choices=["auto", "fp8_w8a8", "int8_w8a16"],
|
||||
default="auto")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--batch-size", type=int, required=False)
|
||||
parser.add_argument("--tune", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args)
|
||||
234
vllm-v0.6.2/benchmarks/kernels/benchmark_paged_attention.py
Normal file
234
vllm-v0.6.2/benchmarks/kernels/benchmark_paged_attention.py
Normal file
@@ -0,0 +1,234 @@
|
||||
import random
|
||||
import time
|
||||
from typing import List, Optional
|
||||
|
||||
import torch
|
||||
from vllm import _mlu_ops as mlu_ops
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
|
||||
create_kv_caches_with_random)
|
||||
import torch_mlu.utils.gpu_migration
|
||||
|
||||
NUM_BLOCKS = 1024
|
||||
PARTITION_SIZE = 512
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def main(
|
||||
version: str,
|
||||
num_seqs: int,
|
||||
seq_len: int,
|
||||
num_query_heads: int,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
use_alibi: bool,
|
||||
block_size: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
do_profile: bool,
|
||||
device: str = "cuda",
|
||||
kv_cache_dtype: Optional[str] = None,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
scale = float(1.0 / (head_size**0.5))
|
||||
query = torch.empty(num_seqs,
|
||||
num_query_heads,
|
||||
head_size,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
query.uniform_(-scale, scale)
|
||||
|
||||
assert num_query_heads % num_kv_heads == 0
|
||||
alibi_slopes = None
|
||||
if use_alibi:
|
||||
alibi_slopes = torch.randn(num_query_heads,
|
||||
dtype=torch.float,
|
||||
device=device)
|
||||
|
||||
seq_lens = [seq_len for _ in range(num_seqs)]
|
||||
max_seq_len = max(seq_lens)
|
||||
seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
|
||||
|
||||
# Create the block tables.
|
||||
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
|
||||
block_tables_lst: List[List[int]] = []
|
||||
for _ in range(num_seqs):
|
||||
block_table = [
|
||||
random.randint(0, NUM_BLOCKS - 1)
|
||||
for _ in range(max_num_blocks_per_seq)
|
||||
]
|
||||
block_tables_lst.append(block_table)
|
||||
|
||||
block_tables = torch.tensor(block_tables_lst,
|
||||
dtype=torch.int,
|
||||
device=device)
|
||||
|
||||
# Create the KV cache.
|
||||
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
|
||||
block_size,
|
||||
1,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
kv_cache_dtype,
|
||||
dtype,
|
||||
device=device)
|
||||
key_cache, value_cache = key_caches[0], value_caches[0]
|
||||
|
||||
if version == "tmo":
|
||||
key_cache = key_cache.reshape(NUM_BLOCKS, num_kv_heads, block_size, head_size)
|
||||
value_cache = value_cache.reshape(NUM_BLOCKS, num_kv_heads, block_size, head_size)
|
||||
# Prepare for the paged attention kernel.
|
||||
output = torch.empty_like(query)
|
||||
if version == "v2":
|
||||
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
|
||||
tmp_output = torch.empty(
|
||||
size=(num_seqs, num_query_heads, num_partitions, head_size),
|
||||
dtype=output.dtype,
|
||||
device=output.device,
|
||||
)
|
||||
exp_sums = torch.empty(
|
||||
size=(num_seqs, num_query_heads, num_partitions),
|
||||
dtype=torch.float32,
|
||||
device=output.device,
|
||||
)
|
||||
max_logits = torch.empty_like(exp_sums)
|
||||
|
||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||
if version == "tmo":
|
||||
decode_query = query.view(-1, 1, num_query_heads, head_size)
|
||||
decode_out = torch.empty_like(decode_query)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
if profile:
|
||||
torch.cuda.cudart().cudaProfilerStart()
|
||||
start_time = time.perf_counter()
|
||||
|
||||
# Using default kv_scale
|
||||
k_scale = v_scale = 1.0
|
||||
|
||||
for _ in range(num_iters):
|
||||
if version == "v1":
|
||||
ops.paged_attention_v1(
|
||||
output,
|
||||
query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
num_kv_heads,
|
||||
scale,
|
||||
block_tables,
|
||||
seq_lens,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
alibi_slopes,
|
||||
kv_cache_dtype,
|
||||
k_scale,
|
||||
v_scale,
|
||||
)
|
||||
elif version == "v2":
|
||||
ops.paged_attention_v2(
|
||||
output,
|
||||
exp_sums,
|
||||
max_logits,
|
||||
tmp_output,
|
||||
query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
num_kv_heads,
|
||||
scale,
|
||||
block_tables,
|
||||
seq_lens,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
alibi_slopes,
|
||||
kv_cache_dtype,
|
||||
k_scale,
|
||||
v_scale,
|
||||
)
|
||||
elif version == "tmo":
|
||||
mlu_ops.single_query_cached_kv_attn(decode_query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
decode_out,
|
||||
block_tables,
|
||||
seq_lens,
|
||||
None, #k_cache_quant_scale
|
||||
None, #v_cache_quant_scale
|
||||
alibi_slopes,
|
||||
max_seq_len,
|
||||
-1, # windows_size_left
|
||||
-1, # windows_size_right
|
||||
scale)
|
||||
else:
|
||||
raise ValueError(f"Invalid version: {version}")
|
||||
torch.cuda.synchronize()
|
||||
|
||||
end_time = time.perf_counter()
|
||||
if profile:
|
||||
torch.cuda.cudart().cudaProfilerStart()
|
||||
return (end_time - start_time) / num_iters
|
||||
|
||||
# Warmup.
|
||||
print("Warming up...")
|
||||
run_benchmark = run_cuda_benchmark
|
||||
run_benchmark(num_iters=3, profile=False)
|
||||
|
||||
# Benchmark.
|
||||
if do_profile:
|
||||
latency = run_benchmark(num_iters=1, profile=True)
|
||||
else:
|
||||
latency = run_benchmark(num_iters=100, profile=False)
|
||||
print(f"Kernel running time: {latency * 1000000:.3f} us")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the paged attention kernel.")
|
||||
parser.add_argument("--version",
|
||||
type=str,
|
||||
choices=["v1", "v2", "tmo"],
|
||||
default="tmo")
|
||||
parser.add_argument("--batch-size", type=int, default=8)
|
||||
parser.add_argument("--seq-len", type=int, default=4096)
|
||||
parser.add_argument("--num-query-heads", type=int, default=64)
|
||||
parser.add_argument("--num-kv-heads", type=int, default=8)
|
||||
parser.add_argument("--head-size",
|
||||
type=int,
|
||||
choices=[64, 80, 96, 112, 120, 128, 192, 256],
|
||||
default=128)
|
||||
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
|
||||
parser.add_argument("--use-alibi", action="store_true")
|
||||
parser.add_argument("--dtype",
|
||||
type=str,
|
||||
choices=["half", "bfloat16", "float"],
|
||||
default="half")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--profile", action="store_true")
|
||||
parser.add_argument(
|
||||
"--kv-cache-dtype",
|
||||
type=str,
|
||||
choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
|
||||
default="auto",
|
||||
help="Data type for kv cache storage. If 'auto', will use model "
|
||||
"data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
|
||||
"ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
if args.num_query_heads % args.num_kv_heads != 0:
|
||||
raise ValueError("num_query_heads must be divisible by num_kv_heads")
|
||||
main(
|
||||
version=args.version,
|
||||
num_seqs=args.batch_size,
|
||||
seq_len=args.seq_len,
|
||||
num_query_heads=args.num_query_heads,
|
||||
num_kv_heads=args.num_kv_heads,
|
||||
head_size=args.head_size,
|
||||
block_size=args.block_size,
|
||||
use_alibi=args.use_alibi,
|
||||
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
|
||||
seed=args.seed,
|
||||
do_profile=args.profile,
|
||||
kv_cache_dtype=args.kv_cache_dtype,
|
||||
)
|
||||
100
vllm-v0.6.2/benchmarks/kernels/benchmark_quant.py
Normal file
100
vllm-v0.6.2/benchmarks/kernels/benchmark_quant.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def main(num_tokens: int,
|
||||
hidden_size: int,
|
||||
static_scale: bool,
|
||||
quant_dtype: torch.dtype,
|
||||
dtype: torch.dtype,
|
||||
seed: int = 0,
|
||||
do_profile: bool = False,
|
||||
num_warmup_iters: int = 5,
|
||||
num_iters: int = 100) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
|
||||
|
||||
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||
torch.cuda.synchronize()
|
||||
if profile:
|
||||
torch.cuda.cudart().cudaProfilerStart()
|
||||
start_time = time.perf_counter()
|
||||
|
||||
for _ in range(num_iters):
|
||||
if quant_dtype == torch.int8:
|
||||
ops.scaled_int8_quant(x, scale)
|
||||
else:
|
||||
ops.scaled_fp8_quant(x, scale)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
end_time = time.perf_counter()
|
||||
if profile:
|
||||
torch.cuda.cudart().cudaProfilerStart()
|
||||
return (end_time - start_time) / num_iters
|
||||
|
||||
# Warmup.
|
||||
print("Warming up...")
|
||||
run_benchmark = run_cuda_benchmark
|
||||
run_benchmark(num_iters=num_warmup_iters, profile=False)
|
||||
|
||||
# Benchmark.
|
||||
if do_profile:
|
||||
latency = run_benchmark(num_iters=1, profile=True)
|
||||
else:
|
||||
latency = run_benchmark(num_iters=num_iters, profile=False)
|
||||
print(f"Kernel running time: {latency * 1000000:.3f} us")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
def to_torch_dtype(dt):
|
||||
if dt == "int8":
|
||||
return torch.int8
|
||||
if dt == "fp8":
|
||||
return torch.float8_e4m3fn
|
||||
raise ValueError(f"Unsupported dtype: {dt}")
|
||||
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the quantization (fp8 or int8) kernel.")
|
||||
parser.add_argument("--num-tokens", type=int, default=4096)
|
||||
parser.add_argument("--hidden-size", type=int, default=8192)
|
||||
parser.add_argument("--static-scale", action="store_true")
|
||||
parser.add_argument("--quant-dtype",
|
||||
type=str,
|
||||
choices=["fp8", "int8"],
|
||||
default="int8")
|
||||
parser.add_argument("--dtype",
|
||||
type=str,
|
||||
choices=["half", "bfloat16", "float"],
|
||||
default="half")
|
||||
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--profile", action="store_true")
|
||||
parser.add_argument("--num-warmup-iters", type=int, default=5)
|
||||
parser.add_argument("--num-iters",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Number of benchmark iterations. "
|
||||
"If --profile is set, this number is ignored")
|
||||
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
main(num_tokens=args.num_tokens,
|
||||
hidden_size=args.hidden_size,
|
||||
static_scale=args.static_scale,
|
||||
quant_dtype=to_torch_dtype(args.quant_dtype),
|
||||
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
|
||||
seed=args.seed,
|
||||
do_profile=args.profile,
|
||||
num_warmup_iters=args.num_warmup_iters,
|
||||
num_iters=args.num_iters)
|
||||
121
vllm-v0.6.2/benchmarks/kernels/benchmark_rope.py
Normal file
121
vllm-v0.6.2/benchmarks/kernels/benchmark_rope.py
Normal file
@@ -0,0 +1,121 @@
|
||||
from itertools import accumulate
|
||||
from typing import List, Optional
|
||||
|
||||
import nvtx
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
|
||||
get_rope)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def benchmark_rope_kernels_multi_lora(
|
||||
is_neox_style: bool,
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
rotary_dim: Optional[int],
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
max_position: int = 8192,
|
||||
base: int = 10000,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
if rotary_dim is None:
|
||||
rotary_dim = head_size
|
||||
# silulating serving 4 LoRAs
|
||||
scaling_factors = [1, 2, 4, 8]
|
||||
# batched RoPE can take multiple scaling factors
|
||||
batched_rope = get_rope(head_size, rotary_dim, max_position, base,
|
||||
is_neox_style, {
|
||||
"rope_type": "linear",
|
||||
"factor": tuple(scaling_factors)
|
||||
})
|
||||
# non-batched RoPE takes only one scaling factor, we create multiple
|
||||
# instances to simulate the same behavior
|
||||
non_batched_ropes: List[RotaryEmbedding] = []
|
||||
for scaling_factor in scaling_factors:
|
||||
non_batched_ropes.append(
|
||||
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
|
||||
{
|
||||
"rope_type": "linear",
|
||||
"factor": (scaling_factor, )
|
||||
}))
|
||||
|
||||
positions = torch.randint(0, max_position, (batch_size, seq_len))
|
||||
query = torch.randn(batch_size,
|
||||
seq_len,
|
||||
num_heads * head_size,
|
||||
dtype=dtype)
|
||||
key = torch.randn_like(query)
|
||||
|
||||
# create query offsets for batched RoPE, we concat multiple kv cache
|
||||
# together and each query needs to find the right kv cache of its type
|
||||
offset_map = torch.tensor(
|
||||
list(
|
||||
accumulate([0] + [
|
||||
max_position * scaling_factor * 2
|
||||
for scaling_factor in scaling_factors[:-1]
|
||||
])))
|
||||
query_types = torch.randint(0,
|
||||
len(scaling_factors), (batch_size, seq_len),
|
||||
device=device)
|
||||
# map query types to offsets
|
||||
query_offsets = offset_map[query_types]
|
||||
# the kernel takes flattened offsets
|
||||
flatten_offsets = query_offsets.flatten()
|
||||
|
||||
# batched queries of the same type together for non-batched RoPE
|
||||
queries = [query[query_types == i] for i in range(len(scaling_factors))]
|
||||
keys = [key[query_types == i] for i in range(len(scaling_factors))]
|
||||
packed_qkr = zip(queries, keys, non_batched_ropes)
|
||||
# synchronize before start timing
|
||||
torch.cuda.synchronize()
|
||||
with nvtx.annotate("non-batched", color="yellow"):
|
||||
for q, k, r in packed_qkr:
|
||||
r.forward(positions, q, k)
|
||||
torch.cuda.synchronize()
|
||||
with nvtx.annotate("batched", color="green"):
|
||||
batched_rope.forward(positions, query, key, flatten_offsets)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the rotary embedding kernels.")
|
||||
parser.add_argument("--is-neox-style", type=bool, default=True)
|
||||
parser.add_argument("--batch-size", type=int, default=16)
|
||||
parser.add_argument("--seq-len", type=int, default=512)
|
||||
parser.add_argument("--num-heads", type=int, default=8)
|
||||
parser.add_argument("--head-size",
|
||||
type=int,
|
||||
choices=[64, 80, 96, 112, 120, 128, 192, 256],
|
||||
default=128)
|
||||
parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
|
||||
parser.add_argument("--dtype",
|
||||
type=str,
|
||||
choices=["bfloat16", "float"],
|
||||
default="float")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--device",
|
||||
type=str,
|
||||
choices=["cuda:0", "cuda:1"],
|
||||
default="cuda:0")
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
benchmark_rope_kernels_multi_lora(
|
||||
is_neox_style=args.is_neox_style,
|
||||
batch_size=args.batch_size,
|
||||
seq_len=args.seq_len,
|
||||
num_heads=args.num_heads,
|
||||
head_size=args.head_size,
|
||||
rotary_dim=args.rotary_dim,
|
||||
dtype=getattr(torch, args.dtype),
|
||||
seed=args.seed,
|
||||
device=args.device,
|
||||
)
|
||||
75
vllm-v0.6.2/benchmarks/kernels/benchmark_shapes.py
Normal file
75
vllm-v0.6.2/benchmarks/kernels/benchmark_shapes.py
Normal file
@@ -0,0 +1,75 @@
|
||||
WEIGHT_SHAPES = {
|
||||
"ideal": [[4 * 256 * 32, 256 * 32]],
|
||||
"mistralai/Mistral-7B-v0.1/TP1": [
|
||||
[4096, 6144],
|
||||
[4096, 4096],
|
||||
[4096, 28672],
|
||||
[14336, 4096],
|
||||
],
|
||||
"mistralai/Mistral-7B-v0.1/TP2": [
|
||||
[4096, 3072],
|
||||
[2048, 4096],
|
||||
[4096, 14336],
|
||||
[7168, 4096],
|
||||
],
|
||||
"mistralai/Mistral-7B-v0.1/TP4": [
|
||||
[4096, 1536],
|
||||
[1024, 4096],
|
||||
[4096, 7168],
|
||||
[3584, 4096],
|
||||
],
|
||||
"meta-llama/Llama-2-7b-hf/TP1": [
|
||||
[4096, 12288],
|
||||
[4096, 4096],
|
||||
[4096, 22016],
|
||||
[11008, 4096],
|
||||
],
|
||||
"meta-llama/Llama-2-7b-hf/TP2": [
|
||||
[4096, 6144],
|
||||
[2048, 4096],
|
||||
[4096, 11008],
|
||||
[5504, 4096],
|
||||
],
|
||||
"meta-llama/Llama-2-7b-hf/TP4": [
|
||||
[4096, 3072],
|
||||
[1024, 4096],
|
||||
[4096, 5504],
|
||||
[2752, 4096],
|
||||
],
|
||||
"meta-llama/Llama-2-13b-hf/TP1": [
|
||||
[5120, 15360],
|
||||
[5120, 5120],
|
||||
[5120, 27648],
|
||||
[13824, 5120],
|
||||
],
|
||||
"meta-llama/Llama-2-13b-hf/TP2": [
|
||||
[5120, 7680],
|
||||
[2560, 5120],
|
||||
[5120, 13824],
|
||||
[6912, 5120],
|
||||
],
|
||||
"meta-llama/Llama-2-13b-hf/TP4": [
|
||||
[5120, 3840],
|
||||
[1280, 5120],
|
||||
[5120, 6912],
|
||||
[3456, 5120],
|
||||
],
|
||||
"meta-llama/Llama-2-70b-hf/TP1": [
|
||||
[8192, 10240],
|
||||
[8192, 8192],
|
||||
[8192, 57344],
|
||||
[28672, 8192],
|
||||
],
|
||||
"meta-llama/Llama-2-70b-hf/TP2": [
|
||||
[8192, 5120],
|
||||
[4096, 8192],
|
||||
[8192, 28672],
|
||||
[14336, 8192],
|
||||
],
|
||||
"meta-llama/Llama-2-70b-hf/TP4": [
|
||||
[8192, 2560],
|
||||
[2048, 8192],
|
||||
[8192, 14336],
|
||||
[7168, 8192],
|
||||
],
|
||||
}
|
||||
62
vllm-v0.6.2/benchmarks/kernels/graph_machete_bench.py
Normal file
62
vllm-v0.6.2/benchmarks/kernels/graph_machete_bench.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import math
|
||||
import pickle
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from typing import List
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
from torch.utils.benchmark import Measurement as TMeasurement
|
||||
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
description='Benchmark the latency of processing a single batch of '
|
||||
'requests till completion.')
|
||||
parser.add_argument('filename', type=str)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
with open(args.filename, 'rb') as f:
|
||||
data: List[TMeasurement] = pickle.load(f)
|
||||
|
||||
results = defaultdict(lambda: list())
|
||||
for v in data:
|
||||
result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label)
|
||||
if result is not None:
|
||||
KN = result.group(1)
|
||||
else:
|
||||
raise Exception("MKN not found")
|
||||
result = re.search(r"MKN=\((\d+)x\d+x\d+\)", v.task_spec.sub_label)
|
||||
if result is not None:
|
||||
M = result.group(1)
|
||||
else:
|
||||
raise Exception("MKN not found")
|
||||
|
||||
kernel = v.task_spec.description
|
||||
results[KN].append({
|
||||
"kernel": kernel,
|
||||
"batch_size": M,
|
||||
"median": v.median
|
||||
})
|
||||
|
||||
rows = int(math.ceil(len(results) / 2))
|
||||
fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows))
|
||||
axs = axs.flatten()
|
||||
for axs_idx, (shape, data) in enumerate(results.items()):
|
||||
plt.sca(axs[axs_idx])
|
||||
df = pd.DataFrame(data)
|
||||
sns.lineplot(data=df,
|
||||
x="batch_size",
|
||||
y="median",
|
||||
hue="kernel",
|
||||
style="kernel",
|
||||
markers=True,
|
||||
dashes=False,
|
||||
palette="Dark2")
|
||||
plt.title(f"Shape: {shape}")
|
||||
plt.ylabel("time (median, s)")
|
||||
plt.tight_layout()
|
||||
plt.savefig("graph_machete_bench.pdf")
|
||||
1
vllm-v0.6.2/benchmarks/kernels/requirements.txt
Normal file
1
vllm-v0.6.2/benchmarks/kernels/requirements.txt
Normal file
@@ -0,0 +1 @@
|
||||
pandas
|
||||
43
vllm-v0.6.2/benchmarks/kernels/weight_shapes.py
Normal file
43
vllm-v0.6.2/benchmarks/kernels/weight_shapes.py
Normal file
@@ -0,0 +1,43 @@
|
||||
# Weight Shapes are in the format
|
||||
# ([K, N], TP_SPLIT_DIM)
|
||||
# Example:
|
||||
# A shape of ([14336, 4096], 0) indicates the following GEMM shape,
|
||||
# - TP1 : K = 14336, N = 4096
|
||||
# - TP2 : K = 7168, N = 4096
|
||||
# A shape of ([4096, 6144], 1) indicates the following GEMM shape,
|
||||
# - TP1 : K = 4096, N = 6144
|
||||
# - TP4 : K = 4096, N = 1536
|
||||
|
||||
# TP1 shapes
|
||||
WEIGHT_SHAPES = {
|
||||
"mistralai/Mistral-7B-v0.1": [
|
||||
([4096, 6144], 1),
|
||||
([4096, 4096], 0),
|
||||
([4096, 28672], 1),
|
||||
([14336, 4096], 0),
|
||||
],
|
||||
"meta-llama/Llama-2-7b-hf": [
|
||||
([4096, 12288], 1),
|
||||
([4096, 4096], 0),
|
||||
([4096, 22016], 1),
|
||||
([11008, 4096], 0),
|
||||
],
|
||||
"meta-llama/Llama-3-8b": [
|
||||
([4096, 6144], 1),
|
||||
([4096, 4096], 0),
|
||||
([4096, 28672], 1),
|
||||
([14336, 4096], 0),
|
||||
],
|
||||
"meta-llama/Llama-2-13b-hf": [
|
||||
([5120, 15360], 1),
|
||||
([5120, 5120], 0),
|
||||
([5120, 27648], 1),
|
||||
([13824, 5120], 0),
|
||||
],
|
||||
"meta-llama/Llama-2-70b-hf": [
|
||||
([8192, 10240], 1),
|
||||
([8192, 8192], 0),
|
||||
([8192, 57344], 1),
|
||||
([28672, 8192], 0),
|
||||
],
|
||||
}
|
||||
Reference in New Issue
Block a user