[main] flashcomm_v1 optim in Qwen Dense Models (#2802)
### What this PR does / why we need it?
Flashcomm_v1 optimization for Qwen dense models.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
CI passed with the newly added and existing tests.
- vLLM version: v0.10.1.1
- vLLM main: 5e537f45b4
Co-authored-by: 1024daniel <xxltju324@gmail.com>
This commit is contained in:
@@ -23,6 +23,7 @@ Run `pytest tests/test_offline_inference.py`.
|
||||
import os
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
from vllm import SamplingParams
|
||||
|
||||
@@ -30,6 +31,8 @@ from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||
|
||||
QWEN_DENSE_MODELS = ["Qwen/QwQ-32B", "Qwen/Qwen-32B"]
|
||||
|
||||
|
||||
def test_models_distributed_QwQ():
|
||||
example_prompts = [
|
||||
@@ -150,3 +153,23 @@ def test_sp_for_qwen3_moe() -> None:
|
||||
enable_expert_parallel=True,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
@patch.dict(
    os.environ,
    {
        "VLLM_ASCEND_ENABLE_FLASHCOMM": "1",
        "VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1",
    },
)
def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
    """Smoke-test greedy generation for the Qwen dense models.

    Runs once per entry in ``QWEN_DENSE_MODELS`` and per ``enforce_eager``
    value. For the duration of the test both
    ``VLLM_ASCEND_ENABLE_FLASHCOMM`` and ``VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE``
    are set to ``"1"`` in ``os.environ``.

    The generated text is not inspected; the test passes when generation
    completes without raising.
    """
    prompts = ["Hello, my name is"]
    token_budget = 5

    # Resolve the model path first, then build the runner options.
    model_path = snapshot_download(model)
    runner_options = {
        "max_model_len": 8192,
        "enforce_eager": enforce_eager,
        "dtype": "auto",
        "tensor_parallel_size": 4,
    }

    with VllmRunner(model_path, **runner_options) as runner:
        runner.generate_greedy(prompts, token_budget)
|
||||
|
||||
Reference in New Issue
Block a user