182 lines
8.0 KiB
Python
182 lines
8.0 KiB
Python
"""Autotune max_num_seqs paramter."""
|
|
# pylint: skip-file
|
|
import argparse
|
|
import random
|
|
from typing import Dict, Any
|
|
from tqdm import tqdm
|
|
|
|
|
|
def run_vllm(config: Dict[str, Any]) -> float:
    """Build a vLLM ``LLM`` engine from *config* and report its KV-cache size.

    Instantiates the engine (which triggers GPU memory profiling) and
    returns the number of GPU cache blocks the engine allocated.
    """
    print(f'Evaluate with max_num_seqs: {config["max_num_seqs"]}')

    # Imported lazily so the import cost is paid inside the worker process.
    from vllm import LLM

    engine = LLM(**config)
    num_gpu_blocks = engine.llm_engine.cache_config.num_gpu_blocks
    print(f'The num of gpu blocks is: {num_gpu_blocks}')
    return num_gpu_blocks
|
|
|
|
|
|
def main(args: argparse.Namespace):
    """The entry function to tune max_num_seqs.

    Builds a vLLM engine config from *args*, then searches for the
    ``max_num_seqs`` value that maximizes ``min(max_num_seqs, num_gpu_blocks)``
    by repeatedly instantiating the engine in child processes.
    """
    print(args)
    random.seed(args.seed)
    # Engine keyword arguments forwarded verbatim to vllm.LLM(**config);
    # 'max_num_seqs' is filled in per evaluation by get_num_gpu_blocks().
    config = {
        'model': args.model,
        'tokenizer': args.tokenizer,
        'quantization': args.quantization,
        'tensor_parallel_size': args.tensor_parallel_size,
        'seed': args.seed,
        'trust_remote_code': args.trust_remote_code,
        'dtype': args.dtype,
        'max_model_len': args.max_model_len,
        'enforce_eager': args.enforce_eager,
        'kv_cache_dtype': args.kv_cache_dtype,
        'quantization_param_path': args.quantization_param_path,
        'device': args.device,
        'enable_prefix_caching': args.enable_prefix_caching,
        'enable_chunked_prefill': args.enable_chunked_prefill,
        'max_num_batched_tokens': args.max_num_batched_tokens,
        'gpu_memory_utilization': args.gpu_memory_utilization,
        'download_dir': args.download_dir,
        'block_size': args.block_size
    }

    import multiprocessing

    def worker_wrapper(config, output_queue):
        """Child-process target: get num_gpu_blocks by instantiating an LLM."""
        result = run_vllm(config)
        output_queue.put(result)

    def get_num_gpu_blocks(cache, num_seqs) -> int:
        """Get the number of GPU blocks with parameter num_seqs.

        Results are memoized in *cache* (num_seqs -> num_gpu_blocks) so each
        candidate is evaluated at most once.
        """
        if num_seqs in cache:
            return cache[num_seqs]
        # Since we cannot manually release the resources held by Ray and NCCL,
        # we evaluate each set of parameters by launching a separate process;
        # all GPU state is freed when that process exits.
        config['max_num_seqs'] = num_seqs
        output_queue = multiprocessing.Queue()
        process = multiprocessing.Process(target=worker_wrapper,
                                          args=(config, output_queue))
        process.start()
        process.join()
        # NOTE(review): if the child crashes before putting a result, this
        # get() blocks forever — acceptable for an offline tuning script.
        result = output_queue.get()
        cache[num_seqs] = result
        return result

    def find_optimal_max_num_seqs(init=256) -> int:
        """Search the optimal max_num_seqs which maximizes
        min(max_num_seqs, num_gpu_blocks)."""
        # Use cache to avoid repeated evaluations.
        cache = {}

        # Initialize the search range: the optimum lies between the initial
        # guess and the block count it produces (num_gpu_blocks decreases as
        # max_num_seqs grows, so the crossing point is bracketed here).
        num_blocks = get_num_gpu_blocks(cache, init)
        left, right = min(num_blocks, init), max(num_blocks, init)

        # Binary search for the fixed point num_blocks == max_num_seqs.
        while 0 < left < right:
            mid = (left + right) // 2
            num_blocks = get_num_gpu_blocks(cache, mid)

            if num_blocks == mid:
                return mid
            if num_blocks > mid:
                left = mid + 1
            else:
                right = mid - 1
            # Tighten the bracket further using the measured block count,
            # without ever widening past the mid+/-1 bounds set above.
            left = max(min(mid, num_blocks), left)
            right = min(max(mid, num_blocks), right)

        # No exact fixed point found: compare the two remaining candidates
        # and return whichever yields the larger min(num_seqs, num_blocks).
        left, right = max(1, left), max(1, right)
        final_left = min(left, get_num_gpu_blocks(cache, left))
        final_right = min(right, get_num_gpu_blocks(cache, right))
        return right if final_right > final_left else left

    max_num_seqs = find_optimal_max_num_seqs()
    print(f'The optimal max_num_seqs is {max_num_seqs}.')
|
|
|
|
|
|
if __name__ == "__main__":
|
|
parser = argparse.ArgumentParser(description="Tune max_num_seqs.")
|
|
parser.add_argument("--backend", type=str, choices=["vllm"], default="vllm")
|
|
parser.add_argument("--dataset", type=str, default=None,
|
|
help="Path to the dataset.")
|
|
parser.add_argument("--input-len", type=int, default=None,
|
|
help="Input prompt length for each request")
|
|
parser.add_argument("--output-len", type=int, default=None,
|
|
help="Output length for each request. Overrides the "
|
|
"output length from the dataset.")
|
|
parser.add_argument("--model", type=str, default="facebook/opt-125m")
|
|
parser.add_argument("--tokenizer", type=str, default=None)
|
|
parser.add_argument('--quantization', '-q',
|
|
choices=['awq', 'gptq', 'squeezellm', None],
|
|
default=None)
|
|
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
|
|
parser.add_argument("--n", type=int, default=1,
|
|
help="Number of generated sequences per prompt.")
|
|
parser.add_argument("--use-beam-search", action="store_true")
|
|
parser.add_argument("--num-prompts", type=int, default=1000,
|
|
help="Number of prompts to process.")
|
|
parser.add_argument("--seed", type=int, default=0)
|
|
parser.add_argument("--hf-max-batch-size", type=int, default=None,
|
|
help="Maximum batch size for HF backend.")
|
|
|
|
parser.add_argument("--block-size", type=int, default=-1)
|
|
parser.add_argument('--trust-remote-code', action='store_true',
|
|
help='trust remote code from huggingface')
|
|
parser.add_argument(
|
|
'--max-model-len', type=int, default=None,
|
|
help='Maximum length of a sequence (including prompt and output). '
|
|
'If None, will be derived from the model.')
|
|
parser.add_argument(
|
|
'--dtype', type=str, default='auto',
|
|
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
|
|
help='data type for model weights and activations. '
|
|
'The "auto" option will use FP16 precision '
|
|
'for FP32 and FP16 models, and BF16 precision '
|
|
'for BF16 models.')
|
|
parser.add_argument('--gpu-memory-utilization', type=float, default=0.9,
|
|
help='the fraction of GPU memory to be used for '
|
|
'the model executor, which can range from 0 to 1.'
|
|
'If unspecified, will use the default value of 0.9.')
|
|
parser.add_argument("--enforce-eager", action="store_true",
|
|
help="enforce eager execution")
|
|
parser.add_argument(
|
|
"--kv-cache-dtype", type=str, choices=["auto", "fp8"], default="auto",
|
|
help=
|
|
'Data type for kv cache storage. If "auto", will use model data type.')
|
|
parser.add_argument(
|
|
'--quantization-param-path', type=str, default=None,
|
|
help='Path to the JSON file containing the KV cache scaling factors. '
|
|
'This should generally be supplied, when KV cache dtype is FP8. '
|
|
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
|
|
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
|
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
|
'instead supported for common inference criteria.')
|
|
parser.add_argument(
|
|
"--device", type=str, default="cuda", choices=["cuda"],
|
|
help='device type for vLLM execution, supporting CUDA only currently.')
|
|
parser.add_argument(
|
|
"--enable-prefix-caching", action='store_true',
|
|
help="enable automatic prefix caching for vLLM backend.")
|
|
parser.add_argument("--enable-chunked-prefill", action='store_true',
|
|
help="enable chunked prefill for vLLM backend.")
|
|
parser.add_argument('--max-num-batched-tokens', type=int, default=None,
|
|
help='maximum number of batched tokens per '
|
|
'iteration')
|
|
parser.add_argument('--download-dir', type=str, default=None,
|
|
help='directory to download and load the weights, '
|
|
'default to the default cache dir of huggingface')
|
|
cli_args = parser.parse_args()
|
|
if cli_args.tokenizer is None:
|
|
cli_args.tokenizer = cli_args.model
|
|
if cli_args.dataset is None:
|
|
assert cli_args.input_len is not None
|
|
assert cli_args.output_len is not None
|
|
else:
|
|
assert cli_args.input_len is None
|
|
|
|
main(cli_args)
|