From bcd0b532f5d4f5f23e30c708cce1307a17b67260 Mon Sep 17 00:00:00 2001 From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com> Date: Thu, 7 Aug 2025 14:15:49 +0800 Subject: [PATCH] [Doc] Update user guide for using lm-eval (#1325) ### What this PR does / why we need it? Update user guide for using lm-eval 1. add using lm-eval on online server 2. add using offline datasets - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/9edd1db02bc6dce6da503503a373657f3466a78b --------- Signed-off-by: hfadzxy --- .../evaluation/using_lm_eval.md | 261 +++++++++++++++++- 1 file changed, 249 insertions(+), 12 deletions(-) diff --git a/docs/source/developer_guide/evaluation/using_lm_eval.md b/docs/source/developer_guide/evaluation/using_lm_eval.md index ce34536..799eff1 100644 --- a/docs/source/developer_guide/evaluation/using_lm_eval.md +++ b/docs/source/developer_guide/evaluation/using_lm_eval.md @@ -1,7 +1,135 @@ # Using lm-eval -This document will guide you have a accuracy testing using [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness). +This document will guide you have a accuracy testing using [lm-eval][1]. -## 1. Run docker container +## Online Server +### 1. start the vLLM server +You can run docker container to start the vLLM server on a single NPU: + +```{code-block} bash + :substitutions: +# Update DEVICE according to your device (/dev/davinci[0-7]) +export DEVICE=/dev/davinci7 +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device $DEVICE \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-e VLLM_USE_MODELSCOPE=True \ +-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \ +-it $IMAGE \ +/bin/bash +vllm serve Qwen/Qwen2.5-0.5B-Instruct --max_model_len 4096 & +``` + +Started the vLLM server successfully,if you see log as below: + +``` +INFO: Started server process [9446] +INFO: Waiting for application startup. +INFO: Application startup complete. +``` + +### 2. Run gsm8k accuracy test using lm-eval + +You can query result with input prompts: + +``` +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-0.5B-Instruct", + "prompt": "'"<|im_start|>system\nYou are a professional accountant. Answer questions using accounting knowledge, output only the option letter (A/B/C/D).<|im_end|>\n"\ +"<|im_start|>user\nQuestion: A company's balance sheet as of December 31, 2023 shows:\n"\ +" Current assets: Cash and equivalents 5 million yuan, Accounts receivable 8 million yuan, Inventory 6 million yuan\n"\ +" Non-current assets: Net fixed assets 12 million yuan\n"\ +" Current liabilities: Short-term loans 4 million yuan, Accounts payable 3 million yuan\n"\ +" Non-current liabilities: Long-term loans 9 million yuan\n"\ +" Owner's equity: Paid-in capital 10 million yuan, Retained earnings ?\n"\ +"Requirement: Calculate the company's Asset-Liability Ratio and Current Ratio (round to two decimal places).\n"\ +"Options:\n"\ +"A. Asset-Liability Ratio=58.33%, Current Ratio=1.90\n"\ +"B. Asset-Liability Ratio=62.50%, Current Ratio=2.17\n"\ +"C. Asset-Liability Ratio=65.22%, Current Ratio=1.75\n"\ +"D. Asset-Liability Ratio=68.00%, Current Ratio=2.50<|im_end|>\n"\ +"<|im_start|>assistant\n"'", + "max_tokens": 1, + "temperature": 0, + "stop": ["<|im_end|>"] + }' | python3 -m json.tool +``` + +The output format matches the following: + +``` +{ + "id": "cmpl-2f678e8bdf5a4b209a3f2c1fa5832e25", + "object": "text_completion", + "created": 1754475138, + "model": "Qwen/Qwen2.5-0.5B-Instruct", + "choices": [ + { + "index": 0, + "text": "A", + "logprobs": null, + "finish_reason": "length", + "stop_reason": null, + "prompt_logprobs": null + } + ], + "service_tier": null, + "system_fingerprint": null, + "usage": { + "prompt_tokens": 252, + "total_tokens": 253, + "completion_tokens": 1, + "prompt_tokens_details": null + }, + "kv_transfer_params": null +} +``` + +Install lm-eval in the container. + +```bash +export HF_ENDPOINT="https://hf-mirror.com" +pip install lm-eval[api] +``` + +Run the following command: + +``` +# Only test gsm8k dataset in this demo +lm_eval \ + --model local-completions \ + --model_args model=Qwen/Qwen2.5-0.5B-Instruct,base_url=http://127.0.0.1:8000/v1/completions,tokenized_requests=False,trust_remote_code=True \ + --tasks gsm8k \ + --output_path ./ +``` + +After 30 mins, the output is as shown below: + +``` +The markdown format results is as below: + +Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| +|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| +|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.3215|± |0.0129| +| | |strict-match | 5|exact_match|↑ |0.2077|± |0.0112| + +``` + +## Offline Server +### 1. Run docker container You can run docker container on a single NPU: @@ -30,22 +158,23 @@ docker run --rm \ /bin/bash ``` -## 2. Run ceval accuracy test using lm-eval +### 2. Run gsm8k accuracy test using lm-eval Install lm-eval in the container. ```bash +export HF_ENDPOINT="https://hf-mirror.com" pip install lm-eval ``` Run the following command: ``` -# Only test ceval-valid-computer_network dataset in this demo +# Only test gsm8k dataset in this demo lm_eval \ --model vllm \ - --model_args pretrained=Qwen/Qwen2.5-7B-Instruct,max_model_len=4096,block_size=4,tensor_parallel_size=1 \ - --tasks ceval-valid_computer_network \ - --batch_size 8 + --model_args pretrained=Qwen/Qwen2.5-0.5B-Instruct,max_model_len=4096 \ + --tasks gsm8k \ + --batch_size auto ``` After 1-2 mins, the output is as shown below: @@ -53,11 +182,119 @@ After 1-2 mins, the output is as shown below: ``` The markdown format results is as below: -| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr| -|----------------------------|------:|------|-----:|--------|---|-----:|---|-----:| -|ceval-valid_computer_network| 2|none | 0|acc |↑ |0.6842|± |0.1096| -| | |none | 0|acc_norm|↑ |0.6842|± |0.1096| +Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| +|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| +|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.3412|± |0.0131| +| | |strict-match | 5|exact_match|↑ |0.3139|± |0.0128| ``` -You can see more usage on [Lm-eval Docs](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/README.md). +## Use offline Datasets + +Take gsm8k(single dataset) and mmlu(multi-subject dataset) as examples, and you can see more from [here][2]. + +```bash +# set HF_DATASETS_OFFLINE when using offline datasets +export HF_DATASETS_OFFLINE=1 +git clone https://github.com/EleutherAI/lm-evaluation-harness.git +cd lm-evaluation-harness +pip install -e . +# gsm8k yaml path +cd lm_eval/tasks/gsm8k +# mmlu yaml path +cd lm_eval/tasks/mmlu/default +``` + +set [gsm8k.yaml][3] as follows: + +```yaml +tag: + - math_word_problems +task: gsm8k + +# set dataset_path arrow or json or parquet according to the downloaded dataset +dataset_path: arrow + +# set dataset_name to null +dataset_name: null +output_type: generate_until + +# add dataset_kwargs +dataset_kwargs: + data_files: + # train and test data download path + train: /root/.cache/gsm8k/gsm8k-train.arrow + test: /root/.cache/gsm8k/gsm8k-test.arrow + +training_split: train +fewshot_split: train +test_split: test +doc_to_text: 'Q: {{question}} + A(Please follow the summarize the result at the end with the format of "The answer is xxx", where xx is the result.):' +doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: false + regexes_to_ignore: + - "," + - "\\$" + - "(?s).*#### " + - "\\.$" +generation_kwargs: + until: + - "Question:" + - "" + - "<|im_end|>" + do_sample: false + temperature: 0.0 +repeats: 1 +num_fewshot: 5 +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "#### (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - name: "flexible-extract" + filter: + - function: "regex" + group_select: -1 + regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)" + - function: "take_first" +metadata: + version: 3.0 +``` + +set [_default_template_yaml][4] as follows: + +```yaml +# set dataset_path according to the downloaded dataset +dataset_path: /root/.cache/mmlu +test_split: test +fewshot_split: dev +fewshot_config: + sampler: first_n +output_type: multiple_choice +doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:" +doc_to_choice: ["A", "B", "C", "D"] +doc_to_target: answer +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 +dataset_kwargs: + trust_remote_code: true +``` + +You can see more usage on [Lm-eval Docs][5]. + +[1]: https://github.com/EleutherAI/lm-evaluation-harness +[2]: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#using-local-datasets +[3]: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml +[4]: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmlu/default/_default_template_yaml +[5]: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/README.md