add qwen3
This commit is contained in:
23
vllm-v0.6.2/tools/utils/README.md
Normal file
23
vllm-v0.6.2/tools/utils/README.md
Normal file
@@ -0,0 +1,23 @@
|
||||
### 1. 非page模式max_num_seqs自动调优工具
|
||||
|
||||
对于MLU370X8平台,在unpage模式下,可以通过调整`max_num_seqs`来提升性能。`tune_max_num_seqs.py`通过自动调参来搜索最佳`max_num_seqs`值。
|
||||
- 用法示例
|
||||
搜索固定配置下,使吞吐量最大的`max_num_seqs`值,其中参数部分保持与`benchmark_latency.py`/`benchmark_throughput.py`一致。
|
||||
```bash
|
||||
python tools/utils/tune_max_num_seqs.py --backend vllm --input-len 1024 --output-len 1024 --model /Path/to/Llama-2-70b-chat-hf/ -tp 1 --max-model-len 4096 --dtype float16 --num-prompts 10
|
||||
```
|
||||
通过执行上述命令,可以搜索得到最优`max_num_seqs`配置,在构建LLM对象时,作为参数传入使用。
|
||||
|
||||
### 2. vLLM调度分析辅助工具
|
||||
|
||||
首先,设置环境变量开启调度profiling:`export VLLM_SCHEDULER_PROFILE=true`
|
||||
|
||||
对于离线测试,测试结束后,会自动保存数据并打印出当前已经运行请求的信息
|
||||
|
||||
对于在线测试,获取调度数据的步骤如下:
|
||||
|
||||
1. 启动server
|
||||
2. 运行client端测试
|
||||
3. 等待client测试结束后,立即运行:python3 tools/utils/post_scheduler_view_action.py --host [server端ip地址] --port [server端口号] --action save,请求server端将数据保存下来
|
||||
4. server端会打印出当前已经运行请求的信息
|
||||
5. 如果想再次运行client测试(基于现有server),先运行:python3 tools/utils/post_scheduler_view_action.py --host [server端ip地址] --port [server端口号] --action init,恢复server端,然后重复2、3、4
|
||||
27
vllm-v0.6.2/tools/utils/post_scheduler_view_action.py
Normal file
27
vllm-v0.6.2/tools/utils/post_scheduler_view_action.py
Normal file
@@ -0,0 +1,27 @@
|
||||
import argparse
|
||||
import requests
|
||||
|
||||
""" Post a request to server, let server init/save scheduler view. """
|
||||
def post_http_request(api_url: str, action: str) -> requests.Response:
|
||||
headers = {"User-Agent": "Test Client"}
|
||||
pload = {
|
||||
"model": action,
|
||||
"prompt": "",
|
||||
"n": 1,
|
||||
"temperature": 0.0,
|
||||
"max_tokens": 16,
|
||||
"stream": True,
|
||||
}
|
||||
response = requests.post(api_url, headers=headers, json=pload, stream=True)
|
||||
return response
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--host", type=str, default="localhost")
|
||||
parser.add_argument("--port", type=int, default=6000)
|
||||
parser.add_argument("--action", type=str, default="save", choices=['init', 'save'])
|
||||
args = parser.parse_args()
|
||||
api_url = f"http://{args.host}:{args.port}/v1/completions"
|
||||
|
||||
post_http_request(api_url, f"{args.action}_scheduler_view")
|
||||
181
vllm-v0.6.2/tools/utils/tune_max_num_seqs.py
Normal file
181
vllm-v0.6.2/tools/utils/tune_max_num_seqs.py
Normal file
@@ -0,0 +1,181 @@
|
||||
"""Autotune max_num_seqs paramter."""
|
||||
# pylint: skip-file
|
||||
import argparse
|
||||
import random
|
||||
from typing import Dict, Any
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def run_vllm(config: Dict[str, Any]) -> float:
    """Build a vLLM ``LLM`` engine from ``config`` and return the number of
    GPU KV-cache blocks the engine allocated for it."""
    print(f'Evaluate with max_num_seqs: {config["max_num_seqs"]}')
    # Imported lazily so this module can be loaded without vLLM installed.
    from vllm import LLM
    engine = LLM(**config)
    num_gpu_blocks = engine.llm_engine.cache_config.num_gpu_blocks
    print(f'The num of gpu blocks is: {num_gpu_blocks}')
    return num_gpu_blocks
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
    """The entry function to tune max_num_seqs.

    Builds an engine config from CLI args, then binary-searches for the
    max_num_seqs value that maximizes min(max_num_seqs, num_gpu_blocks),
    evaluating each candidate by instantiating an LLM in a child process.
    """
    print(args)
    random.seed(args.seed)
    # Engine construction kwargs shared by every trial; only
    # 'max_num_seqs' is varied during the search (set below).
    config = {
        'model': args.model,
        'tokenizer': args.tokenizer,
        'quantization': args.quantization,
        'tensor_parallel_size': args.tensor_parallel_size,
        'seed': args.seed,
        'trust_remote_code': args.trust_remote_code,
        'dtype': args.dtype,
        'max_model_len': args.max_model_len,
        'enforce_eager': args.enforce_eager,
        'kv_cache_dtype': args.kv_cache_dtype,
        'quantization_param_path': args.quantization_param_path,
        'device': args.device,
        'enable_prefix_caching': args.enable_prefix_caching,
        'enable_chunked_prefill': args.enable_chunked_prefill,
        'max_num_batched_tokens': args.max_num_batched_tokens,
        'gpu_memory_utilization': args.gpu_memory_utilization,
        'download_dir': args.download_dir,
        'block_size': args.block_size
    }

    import multiprocessing

    def worker_wrapper(config, output_queue):
        """Here we get the num_gpu_blocks by instantiating an LLM object.

        Runs in a child process; puts the block count on `output_queue`.
        """
        result = run_vllm(config)
        output_queue.put(result)

    def get_num_gpu_blocks(cache, num_seqs) -> int:
        """Get the number of GPU blocks with parameter num_seqs (memoized)."""
        if num_seqs in cache:
            return cache[num_seqs]
        # Here since we cannot manually release the resources held by Ray and
        # NCCL, we evaluate each candidate by launching a separate process so
        # all accelerator state dies with the child.
        config['max_num_seqs'] = num_seqs
        output_queue = multiprocessing.Queue()
        process = multiprocessing.Process(target=worker_wrapper,
                                          args=(config, output_queue))
        process.start()
        # NOTE(review): join() before get() is safe only because the child
        # puts a single small item (it fits the queue's pipe buffer); a
        # larger payload could deadlock here — confirm if this ever grows.
        process.join()
        result = output_queue.get()
        cache[num_seqs] = result
        return result

    def find_optimal_max_num_seqs(init=256) -> int:
        """Search the optimal max_num_seqs which maximizes
        min(max_num_seqs, num_gpu_blocks).

        num_gpu_blocks shrinks as max_num_seqs grows (more scheduler slots
        leave less memory for KV blocks), so the min is maximized where the
        two curves cross; binary search homes in on that crossing.
        """
        # Use cache to avoid repeated evaluations.
        cache = {}

        # Initialize search range: the optimum lies between the candidate
        # and the block count it produced.
        num_blocks = get_num_gpu_blocks(cache, init)
        left, right = min(num_blocks, init), max(num_blocks, init)

        # Binary search.
        while 0 < left < right:
            mid = (left + right) // 2
            num_blocks = get_num_gpu_blocks(cache, mid)

            if num_blocks == mid:
                # Exact crossing point: min(mid, num_blocks) can't improve.
                return mid
            if num_blocks > mid:
                left = mid + 1
            else:
                right = mid - 1
            # Tighten bounds further using the measured block count.
            left = max(min(mid, num_blocks), left)
            right = min(max(mid, num_blocks), right)

        # Bounds may have collapsed to 0; clamp, then pick whichever
        # endpoint yields the larger min(num_seqs, num_gpu_blocks).
        left, right = max(1, left), max(1, right)
        final_left = min(left, get_num_gpu_blocks(cache, left))
        final_right = min(right, get_num_gpu_blocks(cache, right))
        return right if final_right > final_left else left

    max_num_seqs = find_optimal_max_num_seqs()
    print(f'The optimal max_num_seqs is {max_num_seqs}.')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Tune max_num_seqs.")
|
||||
parser.add_argument("--backend", type=str, choices=["vllm"], default="vllm")
|
||||
parser.add_argument("--dataset", type=str, default=None,
|
||||
help="Path to the dataset.")
|
||||
parser.add_argument("--input-len", type=int, default=None,
|
||||
help="Input prompt length for each request")
|
||||
parser.add_argument("--output-len", type=int, default=None,
|
||||
help="Output length for each request. Overrides the "
|
||||
"output length from the dataset.")
|
||||
parser.add_argument("--model", type=str, default="facebook/opt-125m")
|
||||
parser.add_argument("--tokenizer", type=str, default=None)
|
||||
parser.add_argument('--quantization', '-q',
|
||||
choices=['awq', 'gptq', 'squeezellm', None],
|
||||
default=None)
|
||||
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
|
||||
parser.add_argument("--n", type=int, default=1,
|
||||
help="Number of generated sequences per prompt.")
|
||||
parser.add_argument("--use-beam-search", action="store_true")
|
||||
parser.add_argument("--num-prompts", type=int, default=1000,
|
||||
help="Number of prompts to process.")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--hf-max-batch-size", type=int, default=None,
|
||||
help="Maximum batch size for HF backend.")
|
||||
|
||||
parser.add_argument("--block-size", type=int, default=-1)
|
||||
parser.add_argument('--trust-remote-code', action='store_true',
|
||||
help='trust remote code from huggingface')
|
||||
parser.add_argument(
|
||||
'--max-model-len', type=int, default=None,
|
||||
help='Maximum length of a sequence (including prompt and output). '
|
||||
'If None, will be derived from the model.')
|
||||
parser.add_argument(
|
||||
'--dtype', type=str, default='auto',
|
||||
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
|
||||
help='data type for model weights and activations. '
|
||||
'The "auto" option will use FP16 precision '
|
||||
'for FP32 and FP16 models, and BF16 precision '
|
||||
'for BF16 models.')
|
||||
parser.add_argument('--gpu-memory-utilization', type=float, default=0.9,
|
||||
help='the fraction of GPU memory to be used for '
|
||||
'the model executor, which can range from 0 to 1.'
|
||||
'If unspecified, will use the default value of 0.9.')
|
||||
parser.add_argument("--enforce-eager", action="store_true",
|
||||
help="enforce eager execution")
|
||||
parser.add_argument(
|
||||
"--kv-cache-dtype", type=str, choices=["auto", "fp8"], default="auto",
|
||||
help=
|
||||
'Data type for kv cache storage. If "auto", will use model data type.')
|
||||
parser.add_argument(
|
||||
'--quantization-param-path', type=str, default=None,
|
||||
help='Path to the JSON file containing the KV cache scaling factors. '
|
||||
'This should generally be supplied, when KV cache dtype is FP8. '
|
||||
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
|
||||
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
||||
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
||||
'instead supported for common inference criteria.')
|
||||
parser.add_argument(
|
||||
"--device", type=str, default="cuda", choices=["cuda"],
|
||||
help='device type for vLLM execution, supporting CUDA only currently.')
|
||||
parser.add_argument(
|
||||
"--enable-prefix-caching", action='store_true',
|
||||
help="enable automatic prefix caching for vLLM backend.")
|
||||
parser.add_argument("--enable-chunked-prefill", action='store_true',
|
||||
help="enable chunked prefill for vLLM backend.")
|
||||
parser.add_argument('--max-num-batched-tokens', type=int, default=None,
|
||||
help='maximum number of batched tokens per '
|
||||
'iteration')
|
||||
parser.add_argument('--download-dir', type=str, default=None,
|
||||
help='directory to download and load the weights, '
|
||||
'default to the default cache dir of huggingface')
|
||||
cli_args = parser.parse_args()
|
||||
if cli_args.tokenizer is None:
|
||||
cli_args.tokenizer = cli_args.model
|
||||
if cli_args.dataset is None:
|
||||
assert cli_args.input_len is not None
|
||||
assert cli_args.output_len is not None
|
||||
else:
|
||||
assert cli_args.input_len is None
|
||||
|
||||
main(cli_args)
|
||||
Reference in New Issue
Block a user