[CI/UT] fix spec ut in vllm-ascend main and vllm main (#759)

### What this PR does / why we need it?
#### 1. fix spec ut in vllm-ascend main and vllm main
As https://github.com/vllm-project/vllm-ascend/pull/694 and
https://github.com/vllm-project/vllm-ascend/pull/749 verified, the spec UT
currently passes with vllm-ascend main and vllm 0.8.5, but CI fails with
vllm-ascend main and vllm main.

I found the cause is a triton bug
(https://github.com/triton-lang/triton/issues/2266), but I didn't figure
out why the bug did not affect vllm-ascend main with vllm 0.8.5;
perhaps the usage of triton changed between vllm 0.8.5 and the latest main.

As the bug report describes, I changed the minimum block_size in the UT from
8 to 16, and the modification was verified locally to be effective.

#### 2. Modify how some cases are skipped.
I changed some commented-out cases to use the `skipif` form, which is more
standardized.

### Does this PR introduce _any_ user-facing change?
None

### How was this patch tested?
CI

Signed-off-by: mengwei805 <mengwei25@huawei.com>
This commit is contained in:
wemaster
2025-05-10 09:45:56 +08:00
committed by GitHub
parent 58d2f85c4a
commit 19c8e134e4
5 changed files with 265 additions and 273 deletions

View File

@@ -153,8 +153,7 @@ jobs:
- "vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py" - "vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py"
- name: Run vllm-project/vllm-ascend Speculative Decode test - name: Run vllm-project/vllm-ascend Speculative Decode test
# speculative decode seems will cause oom issue, only disable it now on ci test with vLLM main if: steps.filter_spec_decode.outputs.speculative_tests_changed == 'true' || github.event_name == 'schedule'
if: matrix.vllm_verison == 'v0.8.5.post1' && steps.filter_spec_decode.outputs.speculative_tests_changed == 'true' || github.event_name == 'schedule'
run: | run: |
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
pytest -sv tests/singlecard/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process pytest -sv tests/singlecard/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process

View File

@@ -186,110 +186,109 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
["disable_logprobs"]) ["disable_logprobs"])
# TODO: Open it when vllm-ascend support graph mode and @pytest.mark.skipif(True, reason="Open it when graph mode ready.")
# @pytest.mark.parametrize( @pytest.mark.parametrize(
# "common_llm_kwargs", "common_llm_kwargs",
# [{ [{
# "enforce_eager": False, "enforce_eager": False,
# # Print spec metrics. # Print spec metrics.
# "disable_log_stats": False, "disable_log_stats": False,
# # Precision # Precision
# "dtype": PRECISION, "dtype": PRECISION,
# # Main model # Main model
# "model_name": MAIN_MODEL, "model_name": MAIN_MODEL,
# }]) }])
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
# @pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
# { {
# "speculative_config": { "speculative_config": {
# "model": SPEC_MODEL, "model": SPEC_MODEL,
# "num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
# }, },
# }, },
# ]) ])
# @pytest.mark.parametrize("output_len", [ @pytest.mark.parametrize("output_len", [
# 128, 128,
# ]) ])
# @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("batch_size", [1, 32])
# @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE) @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
# def test_medusa_e2e_greedy_correctness_cuda_graph( def test_medusa_e2e_greedy_correctness_cuda_graph(
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
# seed: int, prefill_chunk_size: int): seed: int, prefill_chunk_size: int):
# """Verify greedy equality with cuda graph enabled and different """Verify greedy equality with cuda graph enabled and different
# batch sizes.""" batch sizes."""
# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
# run_equality_correctness_test(vllm_runner, run_equality_correctness_test(vllm_runner,
# common_llm_kwargs, common_llm_kwargs,
# per_test_common_llm_kwargs, per_test_common_llm_kwargs,
# baseline_llm_kwargs, baseline_llm_kwargs,
# test_llm_kwargs, test_llm_kwargs,
# batch_size, batch_size,
# max_output_len=output_len, max_output_len=output_len,
# seed=seed, seed=seed,
# temperature=0.0) temperature=0.0)
# TODO: There is a problem with the preemptive scheduling in the current
# version, which makes this case fail. Please release this case after the
# preemptive scheduling problem is solved.
# @pytest.mark.parametrize(
# "common_llm_kwargs",
# [{
# "block_size": 8,
# # 2 for small prompt, 256//8 for generated.
# "num_gpu_blocks_override": 2 + 256 // 8,
# "max_model_len": (2 + 256 // 8) * 8,
# # Skip cuda graph recording for fast test. @pytest.mark.skipif(True, reason="Open it when preempt ready.")
# "enforce_eager": True, @pytest.mark.parametrize(
"common_llm_kwargs",
[{
"block_size": 16,
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override": 2 + 256 // 8,
"max_model_len": (2 + 256 // 8) * 8,
# # Precision # Skip cuda graph recording for fast test.
# "dtype": PRECISION, "enforce_eager": True,
# # Main model # Precision
# "model_name": MAIN_MODEL, "dtype": PRECISION,
# }])
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) # Main model
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) "model_name": MAIN_MODEL,
# @pytest.mark.parametrize("test_llm_kwargs", [ }])
# { @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
# "speculative_config": { @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
# "model": SPEC_MODEL, @pytest.mark.parametrize("test_llm_kwargs", [
# "num_speculative_tokens": MAX_SPEC_TOKENS, {
# }, "speculative_config": {
# }, "model": SPEC_MODEL,
# ]) "num_speculative_tokens": MAX_SPEC_TOKENS,
# @pytest.mark.parametrize( },
# "output_len", },
# [ ])
# # Use small output len for fast test. @pytest.mark.parametrize(
# 128, "output_len",
# ]) [
# @pytest.mark.parametrize("batch_size", [4]) # Use small output len for fast test.
# @pytest.mark.parametrize("seed", [1]) 128,
# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE) ])
# def test_medusa_e2e_greedy_correctness_with_preemption( @pytest.mark.parametrize("batch_size", [4])
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @pytest.mark.parametrize("seed", [1])
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
# seed: int, prefill_chunk_size: int): def test_medusa_e2e_greedy_correctness_with_preemption(
# """Verify greedy equality, even when some sequences are preempted mid- vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# generation. baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
# """ seed: int, prefill_chunk_size: int):
# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) """Verify greedy equality, even when some sequences are preempted mid-
# run_equality_correctness_test(vllm_runner, generation.
# common_llm_kwargs, """
# per_test_common_llm_kwargs, maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
# baseline_llm_kwargs, run_equality_correctness_test(vllm_runner,
# test_llm_kwargs, common_llm_kwargs,
# batch_size, per_test_common_llm_kwargs,
# max_output_len=output_len, baseline_llm_kwargs,
# seed=seed, test_llm_kwargs,
# temperature=0.0) batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@@ -294,124 +294,120 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
disable_seed=True) disable_seed=True)
# TODO: There is a problem with the preemptive scheduling in the current @pytest.mark.skipif(True, reason="Open it when preempt ready.")
# version, which makes this case fail. Please release this case after the @pytest.mark.parametrize(
# preemptive scheduling problem is solved. "common_llm_kwargs",
# @pytest.mark.parametrize( [{
# "common_llm_kwargs", "block_size": 16,
# [{ # 2 for small prompt, 256//8 for generated.
# "block_size": 8, "num_gpu_blocks_override": 2 + 256 // 8,
# # 2 for small prompt, 256//8 for generated. "max_model_len": (2 + 256 // 8) * 8,
# "num_gpu_blocks_override": 2 + 256 // 8,
# "max_model_len": (2 + 256 // 8) * 8,
# # Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
# "enforce_eager": True, "enforce_eager": True,
# # Precision # Precision
# "dtype": PRECISION, "dtype": PRECISION,
# # Main model # Main model
# "model_name": MAIN_MODEL, "model_name": MAIN_MODEL,
# }]) }])
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
# @pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
# { {
# "speculative_config": { "speculative_config": {
# "model": SPEC_MODEL, "model": SPEC_MODEL,
# }, },
# }, },
# ]) ])
# @pytest.mark.parametrize( @pytest.mark.parametrize(
# "output_len", "output_len",
# [ [
# # Use small output len for fast test. # Use small output len for fast test.
# 128, 128,
# ]) ])
# @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("batch_size", [4])
# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1) @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1)
# @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
# def test_mlp_e2e_greedy_correctness_with_preemption( def test_mlp_e2e_greedy_correctness_with_preemption(
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
# prefill_chunk_size: int, seed: int): prefill_chunk_size: int, seed: int):
# """Verify greedy equality, even when some sequences are preempted mid- """Verify greedy equality, even when some sequences are preempted mid-
# generation. generation.
# """ """
# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
# run_equality_correctness_test(vllm_runner, run_equality_correctness_test(vllm_runner,
# common_llm_kwargs, common_llm_kwargs,
# per_test_common_llm_kwargs, per_test_common_llm_kwargs,
# baseline_llm_kwargs, baseline_llm_kwargs,
# test_llm_kwargs, test_llm_kwargs,
# batch_size, batch_size,
# max_output_len=output_len, max_output_len=output_len,
# seed=seed, seed=seed,
# temperature=0.0) temperature=0.0)
# TODO: There is a problem with the preemptive scheduling in the current
# version, which makes this case fail. Please release this case after the
# preemptive scheduling problem is solved.
# @pytest.mark.parametrize(
# "common_llm_kwargs",
# [{
# "block_size": 8,
# # 2 for small prompt, 256//8 for generated.
# "num_gpu_blocks_override": 2 + 256 // 8,
# "max_model_len": (2 + 256 // 8) * 8,
# # Skip cuda graph recording for fast test. @pytest.mark.skipif(True, reason="Open it when preempt ready.")
# "enforce_eager": True, @pytest.mark.parametrize(
"common_llm_kwargs",
[{
"block_size": 16,
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override": 2 + 256 // 8,
"max_model_len": (2 + 256 // 8) * 8,
# # Precision # Skip cuda graph recording for fast test.
# "dtype": PRECISION, "enforce_eager": True,
# # Main model # Precision
# "model_name": MAIN_MODEL, "dtype": PRECISION,
# }])
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
# @pytest.mark.parametrize("test_llm_kwargs", [
# {
# "speculative_config": {
# "model": SPEC_MODEL,
# },
# },
# ])
# @pytest.mark.parametrize(
# "output_len",
# [
# # Use small output len for fast test.
# 128,
# ])
# @pytest.mark.parametrize("batch_size", [4])
# @pytest.mark.parametrize("seed", [1])
# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1)
# def test_mlp_e2e_greedy_correctness_with_padding(
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
# prefill_chunk_size: int, seed: int):
# """Verify greedy equality when the vocab dimension is padded
# """
# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
# # Default pad_to is 64, test model has vocab_size of 32000 # Main model
# def patched_pad_vocab_size(vocab_size, pad_to=None): "model_name": MAIN_MODEL,
# return pad_vocab_size(vocab_size, pad_to=32064) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": SPEC_MODEL,
},
},
])
@pytest.mark.parametrize(
"output_len",
[
# Use small output len for fast test.
128,
])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1)
def test_mlp_e2e_greedy_correctness_with_padding(
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
prefill_chunk_size: int, seed: int):
"""Verify greedy equality when the vocab dimension is padded
"""
maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
# # NOTE: Compared with vLLM, the patch method has been modified # Default pad_to is 64, test model has vocab_size of 32000
# from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size def patched_pad_vocab_size(vocab_size, pad_to=None):
# pad_vocab_size = patched_pad_vocab_size return pad_vocab_size(vocab_size, pad_to=32064)
# run_equality_correctness_test(vllm_runner,
# common_llm_kwargs, # NOTE: Compared with vLLM, the patch method has been modified
# per_test_common_llm_kwargs, pad_vocab_size = patched_pad_vocab_size # noqa: F811
# baseline_llm_kwargs, run_equality_correctness_test(vllm_runner,
# test_llm_kwargs, common_llm_kwargs,
# batch_size, per_test_common_llm_kwargs,
# max_output_len=output_len, baseline_llm_kwargs,
# seed=seed, test_llm_kwargs,
# temperature=0.0) batch_size,
max_output_len=output_len,
seed=seed,
temperature=0.0)
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@@ -307,7 +307,7 @@ def test_mtp_e2e_quant_greedy_correctness_torchair_graph(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"block_size": 8, "block_size": 16,
# 2 for small prompt, 256//8 for generated. # 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override": 2 + 256 // 8, "num_gpu_blocks_override": 2 + 256 // 8,
"max_model_len": (2 + 256 // 8) * 8, "max_model_len": (2 + 256 // 8) * 8,

View File

@@ -173,71 +173,69 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
["disable_logprobs"]) ["disable_logprobs"])
# TODO: There is a problem with the preemptive scheduling in the current @pytest.mark.skipif(True, reason="Open it when preempt ready.")
# version, which makes this case fail. Please release this case after the @pytest.mark.parametrize(
# preemptive scheduling problem is solved. "common_llm_kwargs",
# @pytest.mark.parametrize( [{
# "common_llm_kwargs", "block_size": 16,
# [{ # 2 for small prompt, 256//8 for generated.
# "block_size": 8, "num_gpu_blocks_override": 2 + 256 // 8,
# # 2 for small prompt, 256//8 for generated. "max_model_len": (2 + 256 // 8) * 8,
# "num_gpu_blocks_override": 2 + 256 // 8,
# "max_model_len": (2 + 256 // 8) * 8,
# # Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
# "enforce_eager": True, "enforce_eager": True,
# }]) }])
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [ @pytest.mark.parametrize("per_test_common_llm_kwargs", [
# { {
# "model_name": "JackFram/llama-160m", "model_name": "JackFram/llama-160m",
# }, },
# ]) ])
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
# @pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
# { {
# "speculative_config": { "speculative_config": {
# "method": "ngram", "method": "ngram",
# "num_speculative_tokens": 5, "num_speculative_tokens": 5,
# "prompt_lookup_max": 3, "prompt_lookup_max": 3,
# }, },
# "enable_chunked_prefill": False, "enable_chunked_prefill": False,
# }, },
# { {
# "speculative_config": { "speculative_config": {
# "method": "ngram", "method": "ngram",
# "num_speculative_tokens": 5, "num_speculative_tokens": 5,
# "prompt_lookup_max": 3, "prompt_lookup_max": 3,
# "disable_mqa_scorer": True, "disable_mqa_scorer": True,
# }, },
# "enable_chunked_prefill": True, "enable_chunked_prefill": True,
# "max_num_batched_tokens": 4, "max_num_batched_tokens": 4,
# "max_num_seqs": 4 "max_num_seqs": 4
# }, },
# ]) ])
# @pytest.mark.parametrize( @pytest.mark.parametrize(
# "output_len", "output_len",
# [ [
# # Use small output len for fast test. # Use small output len for fast test.
# 256, 256,
# ]) ])
# @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("batch_size", [4])
# @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
# def test_ngram_e2e_greedy_correctness_with_preemption( def test_ngram_e2e_greedy_correctness_with_preemption(
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
# seed: int): seed: int):
# """Verify greedy equality, even when some sequences are preempted mid- """Verify greedy equality, even when some sequences are preempted mid-
# generation. generation.
# """ """
# run_equality_correctness_test(vllm_runner, run_equality_correctness_test(vllm_runner,
# common_llm_kwargs, common_llm_kwargs,
# per_test_common_llm_kwargs, per_test_common_llm_kwargs,
# baseline_llm_kwargs, baseline_llm_kwargs,
# test_llm_kwargs, test_llm_kwargs,
# batch_size, batch_size,
# max_output_len=output_len, max_output_len=output_len,
# temperature=0, temperature=0,
# seed=seed) seed=seed)
@pytest.mark.parametrize( @pytest.mark.parametrize(