add qwen3

This commit is contained in:
Chranos
2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

View File

@@ -0,0 +1,44 @@
"""Compare the with and without prefix caching.
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory

# (model, max_model_len expected with the sliding window disabled,
#  full max_model_len)
MODEL_LEN_LEN = [
    # Example model with sliding window.
    ("bigcode/starcoder2-3b", 4096, 16384),
    # ("mistralai/Mistral-7B-v0.1", 4096, 32768),  # OOM in CI
    # Confirm a model that has a sliding window in its config but disables
    # it ("use_sliding_window": false) works.
    ("Qwen/Qwen1.5-0.5B-Chat", 32768, 32768),
    # Config has no sliding window attribute at all.
    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", 2048, 2048),
]
@pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN)
def test_disable_sliding_window(model_len_len, ):
model, sliding_len, full_len = model_len_len
vllm_disabled_model = LLM(model, disable_sliding_window=True)
vllm_disabled_model.generate("Hi my name is")
model_config = vllm_disabled_model.llm_engine.model_config
assert model_config.max_model_len == sliding_len, (
"Max len expected to equal sliding_len of %s, but got %s", sliding_len,
model_config.max_model_len)
del vllm_disabled_model
cleanup_dist_env_and_memory()
vllm_enabled_model = LLM(model, disable_sliding_window=False)
vllm_enabled_model.generate("Hi my name is")
model_config = vllm_enabled_model.llm_engine.model_config
assert model_config.max_model_len == full_len, (
"Max len expected to equal full_len of %s, but got %s", full_len,
model_config.max_model_len)
del vllm_enabled_model
cleanup_dist_env_and_memory()
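
The invariant checked above is simple enough to state as code. A minimal sketch (our illustration, not vLLM internals) of how the expected max_model_len follows from disable_sliding_window:

def expected_max_model_len(full_len: int, sliding_len: int,
                           disable: bool) -> int:
    # Hypothetical helper for illustration; vLLM derives this inside
    # ModelConfig rather than through a standalone function.
    return sliding_len if disable else full_len

# Matches the MODEL_LEN_LEN expectations above.
assert expected_max_model_len(16384, 4096, disable=True) == 4096
assert expected_max_model_len(16384, 4096, disable=False) == 16384
# Models without a (used) sliding window are unaffected either way.
assert expected_max_model_len(32768, 32768, disable=True) == 32768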

View File

@@ -0,0 +1,121 @@
"""Compare the with and without prefix caching.
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
import pytest
from tests.kernels.utils import override_backend_env_variable
from vllm import SamplingParams, TokensPrompt
from ..models.utils import check_outputs_equal

MODELS = [
    "facebook/opt-125m",
]
# The prompts share long common prefixes of varying length: the first four
# share [0] * 588 + [1] * 1332 (1920 tokens), the last only the first 588,
# so successive requests repeatedly re-hit partially overlapping cache
# entries (see the sketch at the end of this file).
UNSTABLE_PROMPT_SEQUENCE = [
    ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([3] * 1),
    ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([5] * 50),
    ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([6] * 95),
    ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([7] * 174),
    ([0] * 588) + ([8] * 1539),
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["MLU_FLASH_ATTN"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("cached_position", [0, 1])
@pytest.mark.parametrize("block_size", [16])
def test_mixed_requests(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    backend: str,
    dtype: str,
    max_tokens: int,
    cached_position: int,
    block_size: int,
    monkeypatch,
) -> None:
"""
Test the case when some sequences have the prefix cache hit
and the others don't. The cached position determines where
the sequence is at among the batch of prefills.
"""
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
    '''
    =============================
    Modify by vllm_mlu
    =============================
    NOTE: Since the auto-sized KV cache is too big for small models, which
    would trigger a large-tensor problem in flash attention, we need to set
    num_gpu_blocks_override to 500.
    '''
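    # Back-of-envelope check (our estimate, not vllm_mlu code): for
    # facebook/opt-125m (12 layers, 12 heads, head_dim 64) in fp16, one
    # 16-token KV block takes 2 (K and V) * 12 layers * 16 tokens * 12
    # heads * 64 dims * 2 bytes = 576 KiB, so 500 blocks cap the KV cache
    # at roughly 281 MiB.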
    cached_prompt = example_prompts[cached_position]
    with vllm_runner(
            model,
            dtype=dtype,
            enable_prefix_caching=True,
            block_size=block_size,
            num_gpu_blocks_override=500,
    ) as vllm_model:
        # Run the cached prompt once so its blocks populate the cache.
        vllm_outputs = vllm_model.generate_greedy([cached_prompt],
                                                  max_tokens)

        # Run all the prompts.
        greedy_params = SamplingParams(temperature=0.0,
                                       max_tokens=max_tokens)
        req_outputs = vllm_model.model.generate(example_prompts,
                                                greedy_params)

        # Verify the number of cached tokens.
        for i in range(len(req_outputs)):
            if i == cached_position:
                expected_num_cached_tokens = (
                    len(req_outputs[i].prompt_token_ids) //
                    block_size) * block_size
            else:
                expected_num_cached_tokens = 0
            assert (req_outputs[i].num_cached_tokens ==
                    expected_num_cached_tokens)

        vllm_outputs = [
            (output.prompt_token_ids + list(output.outputs[0].token_ids),
             output.prompt + output.outputs[0].text)
            for output in req_outputs
        ]

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
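
# Why the floor arithmetic above (illustration, not part of the test): the
# prefix cache stores whole KV blocks, so only fully filled blocks of a
# previously seen prompt count as cached; the trailing partial block is
# recomputed. E.g. for a 30-token prompt with block_size=16:
#     (30 // 16) * 16  ->  16 cached tokens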
'''
=============================
Modify by vllm_mlu
=============================
NOTE: use Qwen2-7B-Instruct instead of Qwen2.5-0.5B-Instruct
'''
@pytest.mark.parametrize("backend", ["MLU_FLASH_ATTN"])
def test_unstable_prompt_sequence(
vllm_runner,
backend: str,
monkeypatch,
) -> None:
override_backend_env_variable(monkeypatch, backend)
with vllm_runner(
"Qwen/Qwen2-7B-Instruct",
enable_chunked_prefill=True,
enable_prefix_caching=True,
max_model_len=4096,
) as vllm_model:
for prompt in UNSTABLE_PROMPT_SEQUENCE:
vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
SamplingParams(max_tokens=1))
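
The token sequences in UNSTABLE_PROMPT_SEQUENCE overlap by design. A small self-contained sketch (ours, separate from the test) that computes the shared prefixes exercised by consecutive requests:

def common_prefix_len(a: list[int], b: list[int]) -> int:
    # Length of the longest shared leading run of token ids.
    n = 0
    for x, y in zip(a, b):
        if x != y:
            break
        n += 1
    return n

seqs = [
    [0] * 588 + [1] * 1332 + [2] * 30 + [3] * 1,
    [0] * 588 + [1] * 1332 + [4] * 3 + [5] * 50,
    [0] * 588 + [1] * 1332 + [2] * 30 + [6] * 95,
    [0] * 588 + [1] * 1332 + [4] * 3 + [7] * 174,
    [0] * 588 + [8] * 1539,
]
# Consecutive prompts overlap by 1920, 1920, 1920, and 588 tokens, so each
# request partially reuses blocks cached by its predecessors while adding a
# new tail, which is what stresses chunked prefill + prefix caching.
print([common_prefix_len(seqs[i], seqs[i + 1]) for i in range(4)])
# -> [1920, 1920, 1920, 588]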