[CI] add xlite e2e test (#5305)

### What this PR does / why we need it?
Add xlite e2e tests (decode-only and full mode) that compare greedy-decoding outputs against fixed reference answers.

- vLLM version: release/v0.13.0
- vLLM main: 5fbfa8d9ef

Signed-off-by: DaweiChang <405739598@qq.com>
This commit is contained in:
Magnus
2025-12-25 09:17:06 +08:00
committed by GitHub
parent 6d25372baa
commit a9fccbeb30
2 changed files with 38 additions and 38 deletions

View File

@@ -20,18 +20,21 @@ Compare the outputs of vLLM with and without xlite.
Run `pytest tests/e2e/singlecard/test_xlite.py`.
"""
import os
import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_ASCEND_ENABLE_NZ"] = "2"
MODELS = [
"Qwen/Qwen3-0.6B",
]
@pytest.mark.skip
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_models_with_xlite_decode_only(
@@ -43,7 +46,6 @@ def test_models_with_xlite_decode_only(
"The capital of France is", "The future of AI is"
]
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
with VllmRunner(
model,
block_size=128,
@@ -52,24 +54,22 @@ def test_models_with_xlite_decode_only(
"enabled": True
}},
) as runner:
vllm_xlite_outputs = runner.model.generate(prompts, sampling_params)
vllm_xlite_outputs_list = runner.generate_greedy(prompts,
max_tokens=max_tokens)
for idx in range(len(vllm_xlite_outputs_list)):
vllm_xlite_outputs_list[idx] = ([0],
vllm_xlite_outputs_list[idx][1])
with VllmRunner(
model,
block_size=128,
max_model_len=1024,
enforce_eager=True,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_xlite_outputs_list = []
for output in vllm_xlite_outputs:
vllm_xlite_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_xlite_answers = [
"Hello, my name is Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
'The president of the United States is the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
'The capital of France is Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital',
'The future of AI is not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
]
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = ([([0], answer)
for answer in vllm_xlite_answers])
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
@@ -79,7 +79,6 @@ def test_models_with_xlite_decode_only(
)
@pytest.mark.skip
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_models_with_xlite_full_mode(
@@ -91,7 +90,6 @@ def test_models_with_xlite_full_mode(
"The capital of France is", "The future of AI is"
]
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
with VllmRunner(
model,
block_size=128,
@@ -103,24 +101,22 @@ def test_models_with_xlite_full_mode(
}
},
) as runner:
vllm_xlite_outputs = runner.model.generate(prompts, sampling_params)
vllm_xlite_outputs_list = runner.generate_greedy(prompts,
max_tokens=max_tokens)
for idx in range(len(vllm_xlite_outputs_list)):
vllm_xlite_outputs_list[idx] = ([0],
vllm_xlite_outputs_list[idx][1])
with VllmRunner(
model,
block_size=128,
max_model_len=1024,
enforce_eager=True,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_xlite_outputs_list = []
for output in vllm_xlite_outputs:
vllm_xlite_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_xlite_answers = [
"Hello, my name is Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
'The president of the United States is the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
'The capital of France is Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital',
"The future of AI is not just about the technology itself, but about how we use it to solve real-world problems. As AI continues to evolve, it's important to consider the ethical"
]
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = ([([0], answer)
for answer in vllm_xlite_answers])
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,

View File

@@ -143,7 +143,7 @@ class LlamaXliteModel(XliteModel):
config.moe_tp_size = 1
config.attn_type = AttnMHA
config.weight_nz = envs_ascend.VLLM_ASCEND_ENABLE_NZ
config.weight_nz = envs_ascend.VLLM_ASCEND_ENABLE_NZ == 2
scheduler_config = vllm_config.scheduler_config
max_batch_size = scheduler_config.max_num_seqs
max_seq_len = vllm_config.model_config.max_model_len
@@ -257,8 +257,12 @@ class XliteWrapper:
if not with_prefill or self.full_mode:
batch = attn_metadata.num_prefills + attn_metadata.num_decodes
seq_lens = attn_metadata.seq_lens[:batch]
query_lens = attn_metadata.query_start_loc_cpu[
1:] - attn_metadata.query_start_loc_cpu[:-1]
seq_tensor = torch.cat([
torch.tensor([0]),
torch.tensor(attn_metadata.actual_seq_lengths_q)
],
dim=0)
query_lens = seq_tensor[1:] - seq_tensor[:-1]
query_lens = query_lens[:batch]
cached_lens = seq_lens - query_lens