diff --git a/tests/e2e/models/configs/Qwen3-VL-30B-A3B-Instruct.yaml b/tests/e2e/models/configs/Qwen3-VL-30B-A3B-Instruct.yaml
index 5b5dc050..9c98249c 100644
--- a/tests/e2e/models/configs/Qwen3-VL-30B-A3B-Instruct.yaml
+++ b/tests/e2e/models/configs/Qwen3-VL-30B-A3B-Instruct.yaml
@@ -6,7 +6,6 @@ tasks:
     metrics:
       - name: "acc,none"
         value: 0.58
-max_model_len: 8192
 tensor_parallel_size: 2
 gpu_memory_utilization: 0.7
 enable_expert_parallel: True
diff --git a/tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml b/tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml
index 8803a120..96581e54 100644
--- a/tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml
+++ b/tests/e2e/models/configs/Qwen3-VL-8B-Instruct.yaml
@@ -6,6 +6,5 @@ tasks:
     metrics:
       - name: "acc,none"
         value: 0.55
-max_model_len: 8192
 batch_size: 32
 gpu_memory_utilization: 0.7
diff --git a/tests/e2e/singlecard/test_vlm.py b/tests/e2e/singlecard/test_vlm.py
index c120ef2d..4cdfd7c2 100644
--- a/tests/e2e/singlecard/test_vlm.py
+++ b/tests/e2e/singlecard/test_vlm.py
@@ -39,7 +39,6 @@ def test_multimodal_vl(prompt_template):
     images = [image] * len(img_questions)
     prompts = prompt_template(img_questions)
     with VllmRunner("Qwen/Qwen3-VL-8B-Instruct",
-                    max_model_len=4096,
                     mm_processor_kwargs={
                         "min_pixels": 28 * 28,
                         "max_pixels": 1280 * 28 * 28,
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index 3e1f3f59..f05ef69a 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -232,6 +232,7 @@ class NPUWorker(WorkerBase):
         # Init ModelRunner here, so that we have access to self.device.
         self.model_runner = NPUModelRunner(self.vllm_config, self.device)
 
+    @torch.inference_mode()
     def determine_available_memory(self) -> int:
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
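
Context on the `worker_v1.py` hunk: `torch.inference_mode()` is a stock PyTorch decorator/context manager that disables gradient tracking and autograd version counting, so the profiling forward pass in `determine_available_memory` avoids autograd bookkeeping overhead. Below is a minimal, self-contained sketch of the decorator's effect; the `profile_forward` helper and the toy linear model are illustrative assumptions, not part of this change:

```python
# Standalone sketch (not part of this diff) of what @torch.inference_mode()
# does: no autograd graph is built inside the decorated function, and the
# tensors it produces are inference tensors.
import torch


@torch.inference_mode()
def profile_forward(model: torch.nn.Module, x: torch.Tensor) -> torch.Tensor:
    # Even though the model's parameters require grad, no graph is recorded
    # here, which trims memory and CPU overhead for profiling-style passes.
    return model(x)


if __name__ == "__main__":
    model = torch.nn.Linear(8, 8)
    out = profile_forward(model, torch.randn(2, 8))
    print(out.requires_grad)   # False
    print(out.is_inference())  # True
```

On the remaining hunks: dropping the explicit `max_model_len` from the two YAML configs and the `VllmRunner` call presumably lets the tests exercise vLLM's default path, where the maximum context length is derived from the model's own configuration rather than pinned in the test setup.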