Files
enginex-mlu370-vllm/vllm-v0.6.2/tools/utils/tune_max_num_seqs.py
2026-02-04 17:22:39 +08:00

182 lines
8.0 KiB
Python

"""Autotune max_num_seqs paramter."""
# pylint: skip-file
import argparse
import random
from typing import Dict, Any
from tqdm import tqdm
def run_vllm(config: Dict[str, Any]) -> float:
    """Instantiate a vLLM ``LLM`` engine from *config* and return the
    number of GPU cache blocks the engine allocated.

    The import is deferred so the module can be loaded without vLLM
    installed; each call builds a fresh engine from the given kwargs.
    """
    print(f'Evaluate with max_num_seqs: {config["max_num_seqs"]}')
    from vllm import LLM
    engine = LLM(**config)
    num_blocks = engine.llm_engine.cache_config.num_gpu_blocks
    print(f'The num of gpu blocks is: {num_blocks}')
    return num_blocks
def main(args: argparse.Namespace):
    """The entry function to tune max_num_seqs.

    Builds an engine config from the parsed CLI *args*, then searches for
    the ``max_num_seqs`` value that maximizes
    ``min(max_num_seqs, num_gpu_blocks)``, evaluating each candidate in a
    fresh subprocess, and prints the result.
    """
    print(args)
    random.seed(args.seed)
    # Keyword arguments forwarded verbatim to vllm.LLM(**config);
    # 'max_num_seqs' is filled in per candidate below.
    config = {
        'model': args.model,
        'tokenizer': args.tokenizer,
        'quantization': args.quantization,
        'tensor_parallel_size': args.tensor_parallel_size,
        'seed': args.seed,
        'trust_remote_code': args.trust_remote_code,
        'dtype': args.dtype,
        'max_model_len': args.max_model_len,
        'enforce_eager': args.enforce_eager,
        'kv_cache_dtype': args.kv_cache_dtype,
        'quantization_param_path': args.quantization_param_path,
        'device': args.device,
        'enable_prefix_caching': args.enable_prefix_caching,
        'enable_chunked_prefill': args.enable_chunked_prefill,
        'max_num_batched_tokens': args.max_num_batched_tokens,
        'gpu_memory_utilization': args.gpu_memory_utilization,
        'download_dir': args.download_dir,
        'block_size': args.block_size
    }
    import multiprocessing

    def worker_wrapper(config, output_queue):
        """Instantiate an LLM in this process and report num_gpu_blocks
        back to the parent through *output_queue*."""
        result = run_vllm(config)
        output_queue.put(result)

    def get_num_gpu_blocks(cache, num_seqs) -> int:
        """Get the number of GPU blocks with parameter num_seqs.

        Results are memoized in *cache* keyed by num_seqs.

        Raises:
            RuntimeError: if the worker subprocess exits abnormally.
        """
        if num_seqs in cache:
            return cache[num_seqs]
        # Here since we cannot manually release the resources held by Ray and
        # NCCL, we evaluate each set of parameters in a separate process.
        config['max_num_seqs'] = num_seqs
        output_queue = multiprocessing.Queue()
        process = multiprocessing.Process(target=worker_wrapper,
                                          args=(config, output_queue))
        process.start()
        process.join()
        # Fix: if the worker died (OOM, engine init failure, ...) it never
        # put a result on the queue, and a bare .get() would block forever.
        # Fail loudly instead of hanging.
        if process.exitcode != 0:
            raise RuntimeError(
                f'Worker evaluating max_num_seqs={num_seqs} exited with '
                f'code {process.exitcode}.')
        result = output_queue.get()
        cache[num_seqs] = result
        return result

    def find_optimal_max_num_seqs(init=256) -> int:
        """Search the optimal max_num_seqs which maximizes
        min(max_num_seqs, num_gpu_blocks)."""
        # Use cache to avoid repeated (expensive) subprocess evaluations.
        cache = {}
        # Initialize search range from a single probe at the starting point.
        num_blocks = get_num_gpu_blocks(cache, init)
        left, right = min(num_blocks, init), max(num_blocks, init)
        # Binary search.
        while 0 < left < right:
            mid = (left + right) // 2
            num_blocks = get_num_gpu_blocks(cache, mid)
            if num_blocks == mid:
                return mid
            if num_blocks > mid:
                left = mid + 1
            else:
                right = mid - 1
            # Tighten the window further: the optimum cannot lie outside
            # [min(mid, num_blocks), max(mid, num_blocks)].
            left = max(min(mid, num_blocks), left)
            right = min(max(mid, num_blocks), right)
            left, right = max(1, left), max(1, right)
        # The loop converged without an exact fixed point; pick whichever
        # endpoint yields the larger min(num_seqs, num_gpu_blocks).
        final_left = min(left, get_num_gpu_blocks(cache, left))
        final_right = min(right, get_num_gpu_blocks(cache, right))
        return right if final_right > final_left else left

    max_num_seqs = find_optimal_max_num_seqs()
    print(f'The optimal max_num_seqs is {max_num_seqs}.')
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Tune max_num_seqs.")
    parser.add_argument("--backend", type=str, choices=["vllm"], default="vllm")
    parser.add_argument("--dataset", type=str, default=None,
                        help="Path to the dataset.")
    parser.add_argument("--input-len", type=int, default=None,
                        help="Input prompt length for each request")
    parser.add_argument("--output-len", type=int, default=None,
                        help="Output length for each request. Overrides the "
                        "output length from the dataset.")
    parser.add_argument("--model", type=str, default="facebook/opt-125m")
    parser.add_argument("--tokenizer", type=str, default=None)
    parser.add_argument('--quantization', '-q',
                        choices=['awq', 'gptq', 'squeezellm', None],
                        default=None)
    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
    parser.add_argument("--n", type=int, default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument("--num-prompts", type=int, default=1000,
                        help="Number of prompts to process.")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--hf-max-batch-size", type=int, default=None,
                        help="Maximum batch size for HF backend.")
    parser.add_argument("--block-size", type=int, default=-1)
    parser.add_argument('--trust-remote-code', action='store_true',
                        help='trust remote code from huggingface')
    parser.add_argument(
        '--max-model-len', type=int, default=None,
        help='Maximum length of a sequence (including prompt and output). '
        'If None, will be derived from the model.')
    parser.add_argument(
        '--dtype', type=str, default='auto',
        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
        help='data type for model weights and activations. '
        'The "auto" option will use FP16 precision '
        'for FP32 and FP16 models, and BF16 precision '
        'for BF16 models.')
    parser.add_argument('--gpu-memory-utilization', type=float, default=0.9,
                        help='the fraction of GPU memory to be used for '
                        'the model executor, which can range from 0 to 1.'
                        'If unspecified, will use the default value of 0.9.')
    parser.add_argument("--enforce-eager", action="store_true",
                        help="enforce eager execution")
    parser.add_argument(
        "--kv-cache-dtype", type=str, choices=["auto", "fp8"], default="auto",
        help=
        'Data type for kv cache storage. If "auto", will use model data type.')
    parser.add_argument(
        '--quantization-param-path', type=str, default=None,
        help='Path to the JSON file containing the KV cache scaling factors. '
        'This should generally be supplied, when KV cache dtype is FP8. '
        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
        'instead supported for common inference criteria.')
    parser.add_argument(
        "--device", type=str, default="cuda", choices=["cuda"],
        help='device type for vLLM execution, supporting CUDA only currently.')
    parser.add_argument(
        "--enable-prefix-caching", action='store_true',
        help="enable automatic prefix caching for vLLM backend.")
    parser.add_argument("--enable-chunked-prefill", action='store_true',
                        help="enable chunked prefill for vLLM backend.")
    parser.add_argument('--max-num-batched-tokens', type=int, default=None,
                        help='maximum number of batched tokens per '
                        'iteration')
    parser.add_argument('--download-dir', type=str, default=None,
                        help='directory to download and load the weights, '
                        'default to the default cache dir of huggingface')
    cli_args = parser.parse_args()
    # The tokenizer defaults to the model path when not given explicitly.
    if cli_args.tokenizer is None:
        cli_args.tokenizer = cli_args.model
    # Fix: `assert` statements are stripped under `python -O`, silently
    # skipping input validation. Report argument conflicts through argparse
    # so the user always gets a proper usage error.
    if cli_args.dataset is None:
        if cli_args.input_len is None:
            parser.error('--input-len is required when --dataset is not set.')
        if cli_args.output_len is None:
            parser.error('--output-len is required when --dataset is not set.')
    elif cli_args.input_len is not None:
        parser.error('--input-len cannot be combined with --dataset.')
    main(cli_args)