Fix profile run for VL model (#5136)
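### What this PR does / why we need it?
Fixes the profile run for VL models: decorates `NPUWorker.determine_available_memory` with `@torch.inference_mode()`, and removes the explicit `max_model_len` settings from two accuracy-test configs and from `test_multimodal_vl`.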
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: 李少鹏 <lishaopeng21@huawei.com>
This commit is contained in:
@@ -6,7 +6,6 @@ tasks:
|
|||||||
metrics:
|
metrics:
|
||||||
- name: "acc,none"
|
- name: "acc,none"
|
||||||
value: 0.58
|
value: 0.58
|
||||||
max_model_len: 8192
|
|
||||||
tensor_parallel_size: 2
|
tensor_parallel_size: 2
|
||||||
gpu_memory_utilization: 0.7
|
gpu_memory_utilization: 0.7
|
||||||
enable_expert_parallel: True
|
enable_expert_parallel: True
|
||||||
|
|||||||
@@ -6,6 +6,5 @@ tasks:
|
|||||||
metrics:
|
metrics:
|
||||||
- name: "acc,none"
|
- name: "acc,none"
|
||||||
value: 0.55
|
value: 0.55
|
||||||
max_model_len: 8192
|
|
||||||
batch_size: 32
|
batch_size: 32
|
||||||
gpu_memory_utilization: 0.7
|
gpu_memory_utilization: 0.7
|
||||||
|
|||||||
@@ -39,7 +39,6 @@ def test_multimodal_vl(prompt_template):
|
|||||||
images = [image] * len(img_questions)
|
images = [image] * len(img_questions)
|
||||||
prompts = prompt_template(img_questions)
|
prompts = prompt_template(img_questions)
|
||||||
with VllmRunner("Qwen/Qwen3-VL-8B-Instruct",
|
with VllmRunner("Qwen/Qwen3-VL-8B-Instruct",
|
||||||
max_model_len=4096,
|
|
||||||
mm_processor_kwargs={
|
mm_processor_kwargs={
|
||||||
"min_pixels": 28 * 28,
|
"min_pixels": 28 * 28,
|
||||||
"max_pixels": 1280 * 28 * 28,
|
"max_pixels": 1280 * 28 * 28,
|
||||||
|
|||||||
@@ -232,6 +232,7 @@ class NPUWorker(WorkerBase):
|
|||||||
# Init ModelRunner here, so that we have access to self.device.
|
# Init ModelRunner here, so that we have access to self.device.
|
||||||
self.model_runner = NPUModelRunner(self.vllm_config, self.device)
|
self.model_runner = NPUModelRunner(self.vllm_config, self.device)
|
||||||
|
|
||||||
|
@torch.inference_mode()
|
||||||
def determine_available_memory(self) -> int:
|
def determine_available_memory(self) -> int:
|
||||||
# Profile the memory usage of the model and get the maximum number of
|
# Profile the memory usage of the model and get the maximum number of
|
||||||
# cache blocks that can be allocated with the remaining free memory.
|
# cache blocks that can be allocated with the remaining free memory.
|
||||||
|
|||||||
Reference in New Issue
Block a user