forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
181
vllm-v0.6.2/tools/utils/tune_max_num_seqs.py
Normal file
181
vllm-v0.6.2/tools/utils/tune_max_num_seqs.py
Normal file
@@ -0,0 +1,181 @@
|
||||
"""Autotune max_num_seqs paramter."""
|
||||
# pylint: skip-file
|
||||
import argparse
|
||||
import random
|
||||
from typing import Dict, Any
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def run_vllm(config: Dict[str, Any]) -> int:
    """Instantiate a vLLM ``LLM`` engine and report its GPU-block budget.

    Args:
        config: Keyword arguments forwarded verbatim to ``vllm.LLM``;
            must contain at least ``max_num_seqs``.

    Returns:
        The number of GPU KV-cache blocks allocated by the engine
        (``cache_config.num_gpu_blocks``). Annotated ``int`` — the caller
        ``get_num_gpu_blocks`` already declares ``-> int`` for this value;
        the previous ``-> float`` annotation was inconsistent.
    """
    print(f'Evaluate with max_num_seqs: {config["max_num_seqs"]}')
    # Imported lazily so this module can be loaded without vllm installed
    # and so each worker process performs its own (re-)initialization.
    from vllm import LLM

    llm = LLM(**config)
    num_gpu_blocks = llm.llm_engine.cache_config.num_gpu_blocks
    print(f'The num of gpu blocks is: {num_gpu_blocks}')
    return num_gpu_blocks
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
    """The entry function to tune max_num_seqs.

    Builds a ``vllm.LLM`` config from *args*, then searches for the
    max_num_seqs value that maximizes min(max_num_seqs, num_gpu_blocks)
    and prints the result.
    """
    print(args)
    random.seed(args.seed)
    # Keyword arguments forwarded verbatim to vllm.LLM (via run_vllm);
    # 'max_num_seqs' is injected later by get_num_gpu_blocks.
    config = {
        'model': args.model,
        'tokenizer': args.tokenizer,
        'quantization': args.quantization,
        'tensor_parallel_size': args.tensor_parallel_size,
        'seed': args.seed,
        'trust_remote_code': args.trust_remote_code,
        'dtype': args.dtype,
        'max_model_len': args.max_model_len,
        'enforce_eager': args.enforce_eager,
        'kv_cache_dtype': args.kv_cache_dtype,
        'quantization_param_path': args.quantization_param_path,
        'device': args.device,
        'enable_prefix_caching': args.enable_prefix_caching,
        'enable_chunked_prefill': args.enable_chunked_prefill,
        'max_num_batched_tokens': args.max_num_batched_tokens,
        'gpu_memory_utilization': args.gpu_memory_utilization,
        'download_dir': args.download_dir,
        'block_size': args.block_size
    }

    import multiprocessing
    def worker_wrapper(config, output_queue):
        """Run in a child process: instantiate an LLM and push its
        num_gpu_blocks into *output_queue*."""
        result = run_vllm(config)
        output_queue.put(result)

    def get_num_gpu_blocks(cache, num_seqs) -> int:
        """Get the number of GPU blocks when max_num_seqs == *num_seqs*.

        Results are memoized in *cache* (num_seqs -> num_gpu_blocks) so
        each candidate is evaluated at most once.
        """
        if num_seqs in cache:
            return cache[num_seqs]
        # Since we cannot manually release the resources held by Ray and
        # NCCL, we evaluate each set of parameters in a separate process;
        # the OS reclaims everything when the child exits.
        config['max_num_seqs'] = num_seqs
        output_queue = multiprocessing.Queue()
        process = multiprocessing.Process(target=worker_wrapper,
                                          args=(config, output_queue))
        process.start()
        process.join()
        # NOTE(review): if the child dies before putting a result, this
        # get() blocks forever — consider a timeout. TODO confirm intended.
        result = output_queue.get()
        cache[num_seqs] = result
        return result

    def find_optimal_max_num_seqs(init=256) -> int:
        """Search the optimal max_num_seqs which maximizes
        min(max_num_seqs, num_gpu_blocks).

        Args:
            init: first candidate probed; it also seeds the search range.
        """
        # Use cache to avoid repeated (expensive) evaluations.
        cache = {}

        # Initialize the search range from the first probe: the optimum
        # lies between the probe value and the blocks it yielded.
        num_blocks = get_num_gpu_blocks(cache, init)
        left, right = min(num_blocks, init), max(num_blocks, init)

        # Binary search for the fixed point where num_gpu_blocks == mid.
        while 0 < left < right:
            mid = (left + right) // 2
            num_blocks = get_num_gpu_blocks(cache, mid)

            if num_blocks == mid:
                return mid
            if num_blocks > mid:
                left = mid + 1
            else:
                right = mid - 1
            # Clamp the range with the new observation so it keeps
            # shrinking even when the halving step alone would not.
            left = max(min(mid, num_blocks), left)
            right = min(max(mid, num_blocks), right)

        # No exact fixed point found: pick whichever endpoint gives the
        # larger min(max_num_seqs, num_gpu_blocks); keep endpoints >= 1.
        left, right = max(1, left), max(1, right)
        final_left = min(left, get_num_gpu_blocks(cache, left))
        final_right = min(right, get_num_gpu_blocks(cache, right))
        return right if final_right > final_left else left

    max_num_seqs = find_optimal_max_num_seqs()
    print(f'The optimal max_num_seqs is {max_num_seqs}.')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Tune max_num_seqs.")
|
||||
parser.add_argument("--backend", type=str, choices=["vllm"], default="vllm")
|
||||
parser.add_argument("--dataset", type=str, default=None,
|
||||
help="Path to the dataset.")
|
||||
parser.add_argument("--input-len", type=int, default=None,
|
||||
help="Input prompt length for each request")
|
||||
parser.add_argument("--output-len", type=int, default=None,
|
||||
help="Output length for each request. Overrides the "
|
||||
"output length from the dataset.")
|
||||
parser.add_argument("--model", type=str, default="facebook/opt-125m")
|
||||
parser.add_argument("--tokenizer", type=str, default=None)
|
||||
parser.add_argument('--quantization', '-q',
|
||||
choices=['awq', 'gptq', 'squeezellm', None],
|
||||
default=None)
|
||||
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
|
||||
parser.add_argument("--n", type=int, default=1,
|
||||
help="Number of generated sequences per prompt.")
|
||||
parser.add_argument("--use-beam-search", action="store_true")
|
||||
parser.add_argument("--num-prompts", type=int, default=1000,
|
||||
help="Number of prompts to process.")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--hf-max-batch-size", type=int, default=None,
|
||||
help="Maximum batch size for HF backend.")
|
||||
|
||||
parser.add_argument("--block-size", type=int, default=-1)
|
||||
parser.add_argument('--trust-remote-code', action='store_true',
|
||||
help='trust remote code from huggingface')
|
||||
parser.add_argument(
|
||||
'--max-model-len', type=int, default=None,
|
||||
help='Maximum length of a sequence (including prompt and output). '
|
||||
'If None, will be derived from the model.')
|
||||
parser.add_argument(
|
||||
'--dtype', type=str, default='auto',
|
||||
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
|
||||
help='data type for model weights and activations. '
|
||||
'The "auto" option will use FP16 precision '
|
||||
'for FP32 and FP16 models, and BF16 precision '
|
||||
'for BF16 models.')
|
||||
parser.add_argument('--gpu-memory-utilization', type=float, default=0.9,
|
||||
help='the fraction of GPU memory to be used for '
|
||||
'the model executor, which can range from 0 to 1.'
|
||||
'If unspecified, will use the default value of 0.9.')
|
||||
parser.add_argument("--enforce-eager", action="store_true",
|
||||
help="enforce eager execution")
|
||||
parser.add_argument(
|
||||
"--kv-cache-dtype", type=str, choices=["auto", "fp8"], default="auto",
|
||||
help=
|
||||
'Data type for kv cache storage. If "auto", will use model data type.')
|
||||
parser.add_argument(
|
||||
'--quantization-param-path', type=str, default=None,
|
||||
help='Path to the JSON file containing the KV cache scaling factors. '
|
||||
'This should generally be supplied, when KV cache dtype is FP8. '
|
||||
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
|
||||
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
||||
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
||||
'instead supported for common inference criteria.')
|
||||
parser.add_argument(
|
||||
"--device", type=str, default="cuda", choices=["cuda"],
|
||||
help='device type for vLLM execution, supporting CUDA only currently.')
|
||||
parser.add_argument(
|
||||
"--enable-prefix-caching", action='store_true',
|
||||
help="enable automatic prefix caching for vLLM backend.")
|
||||
parser.add_argument("--enable-chunked-prefill", action='store_true',
|
||||
help="enable chunked prefill for vLLM backend.")
|
||||
parser.add_argument('--max-num-batched-tokens', type=int, default=None,
|
||||
help='maximum number of batched tokens per '
|
||||
'iteration')
|
||||
parser.add_argument('--download-dir', type=str, default=None,
|
||||
help='directory to download and load the weights, '
|
||||
'default to the default cache dir of huggingface')
|
||||
cli_args = parser.parse_args()
|
||||
if cli_args.tokenizer is None:
|
||||
cli_args.tokenizer = cli_args.model
|
||||
if cli_args.dataset is None:
|
||||
assert cli_args.input_len is not None
|
||||
assert cli_args.output_len is not None
|
||||
else:
|
||||
assert cli_args.input_len is None
|
||||
|
||||
main(cli_args)
|
||||
Reference in New Issue
Block a user