## Overall accuracy test

### EvalScope

#### 1. Download and install

EvalScope supports use in Python environments. Users can install EvalScope via pip or from source code. Here are examples of both installation methods:

```bash
# Install via pip
pip install evalscope[perf] -U

# Or install from source via git
git clone https://github.com/modelscope/evalscope.git
cd evalscope
pip install -e '.[perf]'
```

#### 2. Dataset preparation script

```python
from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
from evalscope.utils.io_utils import dump_jsonl_data
import os  # Step 1: Import the os module

schema = CollectionSchema(
    name="VL-Test",
    datasets=[
        CollectionSchema(
            name="PureText",
            weight=1,
            datasets=[
                DatasetInfo(
                    name="mmlu_pro",
                    weight=1,
                    task_type="exam",
                    tags=["en"],
                    args={"few_shot_num": 0},
                ),
                DatasetInfo(
                    name="ifeval",
                    weight=1,
                    task_type="instruction",
                    tags=["en"],
                    args={"few_shot_num": 0},
                ),
                DatasetInfo(
                    name="gsm8k",
                    weight=1,
                    task_type="math",
                    tags=["en"],
                    args={"few_shot_num": 0},
                ),
            ],
        ),
        CollectionSchema(
            name="Vision",
            weight=2,
            datasets=[
                DatasetInfo(
                    name="math_vista",
                    weight=1,
                    task_type="math",
                    tags=["en"],
                    args={"few_shot_num": 0},
                ),
                DatasetInfo(
                    name="mmmu_pro",
                    weight=1,
                    task_type="exam",
                    tags=["en"],
                    args={"few_shot_num": 0},
                ),
            ],
        ),
    ],
)

# get the mixed data
mixed_data = WeightedSampler(schema).sample(1000)

output_path = "outputs/vl_test.jsonl"  # Step 2: Define the output file path
output_dir = os.path.dirname(output_path)  # Step 3: Obtain the directory name
if not os.path.exists(output_dir):  # Step 4: Check if the directory exists
    os.makedirs(output_dir, exist_ok=True)  # Step 5: Automatically create directories

# dump the mixed data to a jsonl file
dump_jsonl_data(mixed_data, output_path)  # Step 6: Securely write to the file
```

Dataset composition visualization:

```
┌───────────────────────────────────────┐
│         VL-Test (1000 samples)        │
├─────────────────┬─────────────────────┤
│    PureText     │        Vision       │
│  (333 samples)  │    (667 samples)    │
├─────────────────┼─────────────────────┤
│  • mmlu_pro     │  • math_vista       │
│  • ifeval       │  • mmmu_pro         │
│  • gsm8k        │                     │
└─────────────────┴─────────────────────┘
```

#### 3. Test

```python
from dotenv import dotenv_values

from evalscope import TaskConfig, run_task
from evalscope.constants import EvalType

task_cfg = TaskConfig(
    model="Qwen2.5-VL-7B-Instruct",
    api_url="http://localhost:8804/v1",
    api_key="EMPTY",
    eval_type=EvalType.SERVICE,
    datasets=[
        "data_collection",
    ],
    dataset_args={
        "data_collection": {
            "local_path": "../outputs/vl_test.jsonl",
        }
    },
    eval_batch_size=5,
    generation_config={
        "max_tokens": 30000,  # The maximum number of tokens that can be generated; set to a large value to avoid output truncation.
        "temperature": 0.6,  # Sampling temperature (recommended value from the Qwen report)
        "top_p": 0.95,  # Top-p sampling (recommended value from the Qwen report)
        "top_k": 20,  # Top-k sampling (recommended value from the Qwen report)
        "n": 1,  # Number of responses generated per request
        "repetition_penalty": 1.0,  # 1.0 = penalty disabled, >1.0 = penalizes repetition.
    },
)

run_task(task_cfg=task_cfg)
```

Parameter Tuning Guide:

| Parameter         | Current value | Effect                                   | Adjustment suggestions                                   |
| ----------------- | ------------- | ---------------------------------------- | -------------------------------------------------------- |
| `temperature`     | 0.6           | Controls output diversity                | Math problems ↓ 0.3 / Creative writing ↑ 0.9             |
| `top_p`           | 0.95          | Filters low-probability tokens           | Reduce "nonsense"                                        |
| `eval_batch_size` | 5             | Number of requests processed in parallel | With sufficient video memory, it can be increased to 10. |

Run the test:

```bash
#!/bin/bash
# ========================================
# Step 1: Set the log file path
# ========================================
LOG_FILE="accuracy_$(date +%Y%m%d_%H%M).log"

# ========================================
# Step 2: Execute the Python script and capture all output
# Meaning of 2>&1:
# - 2 represents standard error output (stderr)
# - >& represents redirection and merging
# - 1 represents standard output (stdout)
# Function: Merges error messages into standard output as well.
# ========================================
python accuracy.py 2>&1 | tee "$LOG_FILE"

# ========================================
# Step 3: Check execution status
# ${PIPESTATUS[0]} gets the exit code of the first command (Python) in the pipeline
# ========================================
EXIT_CODE=${PIPESTATUS[0]}
if [ $EXIT_CODE -eq 0 ]; then
    echo "✅ Evaluation completed! Log saved to: $LOG_FILE"
else
    echo "❌ Evaluation failed! Exit code: $EXIT_CODE Please check the log: $LOG_FILE"
fi
```

#### 4. Common problem fixes

##### 4.1 NLTK resource missing fix

```bash
Resource punkt_tab not found.
```

Solution:

```python
import nltk
import os

# Step 1: Set the download path (select a writable directory)
download_dir = "/workspace/myenv/nltk_data"
os.makedirs(download_dir, exist_ok=True)

# Step 2: Configure NLTK data path
nltk.data.path.append(download_dir)

# Step 3: Download necessary resources
print("🔽 Start downloading punkt_tab resource...")
try:
    nltk.download("punkt_tab", download_dir=download_dir)
    print("✅ Download successful!")
except Exception as e:
    print(f"❌ Download failed: {e}")
    print("💡 Alternative: Download manually from GitHub")
    print(
        "   URL: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt_tab.zip"
    )
```

Apply the fix:

```bash
# Activate environment
source /workspace/myenv/bin/activate

# Run the repair script
python fix_nltk.py

# Rerun the test
bash run_accuracy_test.sh
```

#### 5. Results Display

```bash
+-------------+---------------------+--------------+---------------+-------+
| task_type   | metric              | dataset_name | average_score | count |
+-------------+---------------------+--------------+---------------+-------+
| exam        | acc                 | mmmu_pro     | 0.521         | 334   |
| math        | acc                 | math_vista   | 0.6066        | 333   |
| exam        | acc                 | mmlu_pro     | 0.5405        | 111   |
| instruction | prompt_level_strict | ifeval       | 0.6937        | 111   |
| math        | acc                 | gsm8k        | 0.8288        | 111   |
+-------------+---------------------+--------------+---------------+-------+
```