From 0421c941333f3be9c14db73c25f80e084a234411 Mon Sep 17 00:00:00 2001
From: Cherrytest
Date: Mon, 17 Feb 2025 10:01:32 +0000
Subject: [PATCH] Update README.md

---
 README.md | 49 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 390e4bd..f60e28c 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,3 @@
-
 ---
 license: apache-2.0
 language:
@@ -7,9 +6,11 @@ pipeline_tag: image-text-to-text
 tags:
 - multimodal
 library_name: transformers
+base_model:
+- Qwen/Qwen2.5-VL-7B-Instruct
 ---
 
-# Qwen2.5-VL-7B-Instruct-AWQ
+# Qwen2.5-VL-7B-Instruct-AWQ
 
 Chat
 
@@ -98,25 +99,25 @@ from qwen_vl_utils import process_vision_info
 
 # default: Load the model on the available device(s)
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+    "Qwen/Qwen2.5-VL-7B-Instruct-AWQ", torch_dtype="auto", device_map="auto"
 )
 
 # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
 # model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-#     "Qwen/Qwen2.5-VL-7B-Instruct",
+#     "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
 #     torch_dtype=torch.bfloat16,
 #     attn_implementation="flash_attention_2",
 #     device_map="auto",
 # )
 
 # default processor
-processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
 
 # The default range for the number of visual tokens per image in the model is 4-16384.
 # You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
 # min_pixels = 256*28*28
 # max_pixels = 1280*28*28
-# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
+# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct-AWQ", min_pixels=min_pixels, max_pixels=max_pixels)
 
 messages = [
     {
@@ -206,14 +207,14 @@ The model supports a wide range of resolution inputs. By default, it uses the na
 min_pixels = 256 * 28 * 28
 max_pixels = 1280 * 28 * 28
 processor = AutoProcessor.from_pretrained(
-    "Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
+    "Qwen/Qwen2.5-VL-7B-Instruct-AWQ", min_pixels=min_pixels, max_pixels=max_pixels
 )
 ```
 
 Besides, we provide two methods for fine-grained control over the image size input to the model:
 
 1. Define min_pixels and max_pixels: Images will be resized to maintain their aspect ratio within the range of min_pixels and max_pixels.
-    
+
 2. Specify exact dimensions: Directly set `resized_height` and `resized_width`. These values will be rounded to the nearest multiple of 28.
 
 ```python
@@ -273,6 +274,38 @@ However, it should be noted that this method has a significant impact on the per
 
 At the same time, for long video inputs, since MRoPE itself is more economical with ids, the max_position_embeddings can be directly modified to a larger value, such as 64k.
 
+### Benchmark
+#### Performance of Quantized Models
+This section reports the benchmark performance of the AWQ-quantized models of the Qwen2.5-VL series, compared with their BF16 counterparts. Specifically, we report:
+
+- MMMU_VAL (Accuracy)
+- DocVQA_VAL (Accuracy)
+- MMBench_DEV_EN (Accuracy)
+- MathVista_MINI (Accuracy)
+
+We use [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to evaluate all models.
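+
+As a rough sketch of how such an evaluation could be reproduced (not the exact commands behind the numbers below), one might invoke VLMEvalKit's `run.py` as follows; the `--model` identifier and dataset names here are assumptions that depend on your VLMEvalKit version, so check `vlmeval/config.py` for the registered names:
+
+```bash
+# Hypothetical sketch: score the AWQ checkpoint on the four benchmarks listed above.
+# "Qwen2.5-VL-7B-Instruct-AWQ" is an assumed model name; verify that your
+# VLMEvalKit install registers it before running.
+python run.py \
+    --data MMMU_VAL DocVQA_VAL MMBench_DEV_EN MathVista_MINI \
+    --model Qwen2.5-VL-7B-Instruct-AWQ \
+    --verbose
+```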
+
+| Model Size | Quantization | MMMU_VAL | DocVQA_VAL | MMBench_DEV_EN | MathVista_MINI |
+| --- | --- | --- | --- | --- | --- |
+| Qwen2.5-VL-72B-Instruct | BF16<br>([🤗](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct)[🤖](https://modelscope.cn/models/qwen/Qwen2.5-VL-72B-Instruct)) | 70.0 | 96.1 | 88.2 | 75.3 |
+| | AWQ<br>([🤗](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct-AWQ)[🤖](https://modelscope.cn/models/qwen/Qwen2.5-VL-72B-Instruct-AWQ)) | 69.1 | 96.0 | 87.9 | 73.8 |
+| Qwen2.5-VL-7B-Instruct | BF16<br>([🤗](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)[🤖](https://modelscope.cn/models/qwen/Qwen2.5-VL-7B-Instruct)) | 58.4 | 94.9 | 84.1 | 67.9 |
+| | AWQ<br>([🤗](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct-AWQ)[🤖](https://modelscope.cn/models/qwen/Qwen2.5-VL-7B-Instruct-AWQ)) | 55.6 | 94.6 | 84.2 | 64.7 |
+| Qwen2.5-VL-3B-Instruct | BF16<br>([🤗](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)[🤖](https://modelscope.cn/models/qwen/Qwen2.5-VL-3B-Instruct)) | 51.7 | 93.0 | 79.8 | 61.4 |
+| | AWQ<br>([🤗](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct-AWQ)[🤖](https://modelscope.cn/models/qwen/Qwen2.5-VL-3B-Instruct-AWQ)) | 49.1 | 91.8 | 78.0 | 58.8 |
+
 ## Citation