diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 7a6f71c..52ef401 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -153,8 +153,7 @@ jobs: - "vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py" - name: Run vllm-project/vllm-ascend Speculative Decode test - # speculative decode seems will cause oom issue, only disable it now on ci test with vLLM main - if: matrix.vllm_verison == 'v0.8.5.post1' && steps.filter_spec_decode.outputs.speculative_tests_changed == 'true' || github.event_name == 'schedule' + if: steps.filter_spec_decode.outputs.speculative_tests_changed == 'true' || github.event_name == 'schedule' run: | if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then pytest -sv tests/singlecard/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process diff --git a/tests/singlecard/spec_decode/e2e/test_medusa_correctness.py b/tests/singlecard/spec_decode/e2e/test_medusa_correctness.py index 26cd11b..76c200b 100644 --- a/tests/singlecard/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/singlecard/spec_decode/e2e/test_medusa_correctness.py @@ -186,110 +186,109 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ["disable_logprobs"]) -# TODO: Open it when vllm-ascend support graph mode and -# @pytest.mark.parametrize( -# "common_llm_kwargs", -# [{ -# "enforce_eager": False, +@pytest.mark.skipif(True, reason="Open it when graph mode ready.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "enforce_eager": False, -# # Print spec metrics. -# "disable_log_stats": False, + # Print spec metrics. + "disable_log_stats": False, -# # Precision -# "dtype": PRECISION, + # Precision + "dtype": PRECISION, -# # Main model -# "model_name": MAIN_MODEL, -# }]) -# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -# @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -# @pytest.mark.parametrize("test_llm_kwargs", [ -# { -# "speculative_config": { -# "model": SPEC_MODEL, -# "num_speculative_tokens": MAX_SPEC_TOKENS, -# }, -# }, -# ]) -# @pytest.mark.parametrize("output_len", [ -# 128, -# ]) -# @pytest.mark.parametrize("batch_size", [1, 32]) -# @pytest.mark.parametrize("seed", [1]) -# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE) -# def test_medusa_e2e_greedy_correctness_cuda_graph( -# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, -# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, -# seed: int, prefill_chunk_size: int): -# """Verify greedy equality with cuda graph enabled and different -# batch sizes.""" -# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) -# run_equality_correctness_test(vllm_runner, -# common_llm_kwargs, -# per_test_common_llm_kwargs, -# baseline_llm_kwargs, -# test_llm_kwargs, -# batch_size, -# max_output_len=output_len, -# seed=seed, -# temperature=0.0) + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + }, + }, +]) +@pytest.mark.parametrize("output_len", [ + 128, +]) +@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE) +def test_medusa_e2e_greedy_correctness_cuda_graph( + vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, + seed: int, prefill_chunk_size: int): + """Verify greedy equality with cuda graph enabled and different + batch sizes.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) + run_equality_correctness_test(vllm_runner, + common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, + test_llm_kwargs, + batch_size, + max_output_len=output_len, + seed=seed, + temperature=0.0) -# TODO: There is a problem with the preemptive scheduling in the current -# version, which makes this case fail. Please release this case after the -# preemptive scheduling problem is solved. -# @pytest.mark.parametrize( -# "common_llm_kwargs", -# [{ -# "block_size": 8, -# # 2 for small prompt, 256//8 for generated. -# "num_gpu_blocks_override": 2 + 256 // 8, -# "max_model_len": (2 + 256 // 8) * 8, -# # Skip cuda graph recording for fast test. -# "enforce_eager": True, +@pytest.mark.skipif(True, reason="Open it when preempt ready.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "block_size": 16, + # 2 for small prompt, 256//8 for generated. + "num_gpu_blocks_override": 2 + 256 // 8, + "max_model_len": (2 + 256 // 8) * 8, -# # Precision -# "dtype": PRECISION, + # Skip cuda graph recording for fast test. + "enforce_eager": True, -# # Main model -# "model_name": MAIN_MODEL, -# }]) -# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -# @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -# @pytest.mark.parametrize("test_llm_kwargs", [ -# { -# "speculative_config": { -# "model": SPEC_MODEL, -# "num_speculative_tokens": MAX_SPEC_TOKENS, -# }, -# }, -# ]) -# @pytest.mark.parametrize( -# "output_len", -# [ -# # Use small output len for fast test. -# 128, -# ]) -# @pytest.mark.parametrize("batch_size", [4]) -# @pytest.mark.parametrize("seed", [1]) -# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE) -# def test_medusa_e2e_greedy_correctness_with_preemption( -# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, -# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, -# seed: int, prefill_chunk_size: int): -# """Verify greedy equality, even when some sequences are preempted mid- -# generation. -# """ -# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) -# run_equality_correctness_test(vllm_runner, -# common_llm_kwargs, -# per_test_common_llm_kwargs, -# baseline_llm_kwargs, -# test_llm_kwargs, -# batch_size, -# max_output_len=output_len, -# seed=seed, -# temperature=0.0) + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + }, + }, +]) +@pytest.mark.parametrize( + "output_len", + [ + # Use small output len for fast test. + 128, + ]) +@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE) +def test_medusa_e2e_greedy_correctness_with_preemption( + vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, + seed: int, prefill_chunk_size: int): + """Verify greedy equality, even when some sequences are preempted mid- + generation. + """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) + run_equality_correctness_test(vllm_runner, + common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, + test_llm_kwargs, + batch_size, + max_output_len=output_len, + seed=seed, + temperature=0.0) @pytest.mark.parametrize( diff --git a/tests/singlecard/spec_decode/e2e/test_mlp_correctness.py b/tests/singlecard/spec_decode/e2e/test_mlp_correctness.py index e446c60..5a660c4 100644 --- a/tests/singlecard/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/singlecard/spec_decode/e2e/test_mlp_correctness.py @@ -294,124 +294,120 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, disable_seed=True) -# TODO: There is a problem with the preemptive scheduling in the current -# version, which makes this case fail. Please release this case after the -# preemptive scheduling problem is solved. -# @pytest.mark.parametrize( -# "common_llm_kwargs", -# [{ -# "block_size": 8, -# # 2 for small prompt, 256//8 for generated. -# "num_gpu_blocks_override": 2 + 256 // 8, -# "max_model_len": (2 + 256 // 8) * 8, +@pytest.mark.skipif(True, reason="Open it when preempt ready.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "block_size": 16, + # 2 for small prompt, 256//8 for generated. + "num_gpu_blocks_override": 2 + 256 // 8, + "max_model_len": (2 + 256 // 8) * 8, -# # Skip cuda graph recording for fast test. -# "enforce_eager": True, + # Skip cuda graph recording for fast test. + "enforce_eager": True, -# # Precision -# "dtype": PRECISION, + # Precision + "dtype": PRECISION, -# # Main model -# "model_name": MAIN_MODEL, -# }]) -# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -# @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -# @pytest.mark.parametrize("test_llm_kwargs", [ -# { -# "speculative_config": { -# "model": SPEC_MODEL, -# }, -# }, -# ]) -# @pytest.mark.parametrize( -# "output_len", -# [ -# # Use small output len for fast test. -# 128, -# ]) -# @pytest.mark.parametrize("batch_size", [4]) -# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1) -# @pytest.mark.parametrize("seed", [1]) -# def test_mlp_e2e_greedy_correctness_with_preemption( -# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, -# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, -# prefill_chunk_size: int, seed: int): -# """Verify greedy equality, even when some sequences are preempted mid- -# generation. -# """ -# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) -# run_equality_correctness_test(vllm_runner, -# common_llm_kwargs, -# per_test_common_llm_kwargs, -# baseline_llm_kwargs, -# test_llm_kwargs, -# batch_size, -# max_output_len=output_len, -# seed=seed, -# temperature=0.0) + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_config": { + "model": SPEC_MODEL, + }, + }, +]) +@pytest.mark.parametrize( + "output_len", + [ + # Use small output len for fast test. + 128, + ]) +@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1) +@pytest.mark.parametrize("seed", [1]) +def test_mlp_e2e_greedy_correctness_with_preemption( + vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, + prefill_chunk_size: int, seed: int): + """Verify greedy equality, even when some sequences are preempted mid- + generation. + """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) + run_equality_correctness_test(vllm_runner, + common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, + test_llm_kwargs, + batch_size, + max_output_len=output_len, + seed=seed, + temperature=0.0) -# TODO: There is a problem with the preemptive scheduling in the current -# version, which makes this case fail. Please release this case after the -# preemptive scheduling problem is solved. -# @pytest.mark.parametrize( -# "common_llm_kwargs", -# [{ -# "block_size": 8, -# # 2 for small prompt, 256//8 for generated. -# "num_gpu_blocks_override": 2 + 256 // 8, -# "max_model_len": (2 + 256 // 8) * 8, -# # Skip cuda graph recording for fast test. -# "enforce_eager": True, +@pytest.mark.skipif(True, reason="Open it when preempt ready.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "block_size": 16, + # 2 for small prompt, 256//8 for generated. + "num_gpu_blocks_override": 2 + 256 // 8, + "max_model_len": (2 + 256 // 8) * 8, -# # Precision -# "dtype": PRECISION, + # Skip cuda graph recording for fast test. + "enforce_eager": True, -# # Main model -# "model_name": MAIN_MODEL, -# }]) -# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -# @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -# @pytest.mark.parametrize("test_llm_kwargs", [ -# { -# "speculative_config": { -# "model": SPEC_MODEL, -# }, -# }, -# ]) -# @pytest.mark.parametrize( -# "output_len", -# [ -# # Use small output len for fast test. -# 128, -# ]) -# @pytest.mark.parametrize("batch_size", [4]) -# @pytest.mark.parametrize("seed", [1]) -# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1) -# def test_mlp_e2e_greedy_correctness_with_padding( -# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, -# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, -# prefill_chunk_size: int, seed: int): -# """Verify greedy equality when the vocab dimension is padded -# """ -# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) + # Precision + "dtype": PRECISION, -# # Default pad_to is 64, test model has vocab_size of 32000 -# def patched_pad_vocab_size(vocab_size, pad_to=None): -# return pad_vocab_size(vocab_size, pad_to=32064) + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_config": { + "model": SPEC_MODEL, + }, + }, +]) +@pytest.mark.parametrize( + "output_len", + [ + # Use small output len for fast test. + 128, + ]) +@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1) +def test_mlp_e2e_greedy_correctness_with_padding( + vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, + prefill_chunk_size: int, seed: int): + """Verify greedy equality when the vocab dimension is padded + """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) -# # NOTE: Compared with vLLM, the patch method has been modified -# from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size -# pad_vocab_size = patched_pad_vocab_size -# run_equality_correctness_test(vllm_runner, -# common_llm_kwargs, -# per_test_common_llm_kwargs, -# baseline_llm_kwargs, -# test_llm_kwargs, -# batch_size, -# max_output_len=output_len, -# seed=seed, -# temperature=0.0) + # Default pad_to is 64, test model has vocab_size of 32000 + def patched_pad_vocab_size(vocab_size, pad_to=None): + return pad_vocab_size(vocab_size, pad_to=32064) + + # NOTE: Compared with vLLM, the patch method has been modified + pad_vocab_size = patched_pad_vocab_size # noqa: F811 + run_equality_correctness_test(vllm_runner, + common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, + test_llm_kwargs, + batch_size, + max_output_len=output_len, + seed=seed, + temperature=0.0) @pytest.mark.parametrize( diff --git a/tests/singlecard/spec_decode/e2e/test_mtp_correctness.py b/tests/singlecard/spec_decode/e2e/test_mtp_correctness.py index 3e159d4..dc30ea6 100644 --- a/tests/singlecard/spec_decode/e2e/test_mtp_correctness.py +++ b/tests/singlecard/spec_decode/e2e/test_mtp_correctness.py @@ -307,7 +307,7 @@ def test_mtp_e2e_quant_greedy_correctness_torchair_graph( @pytest.mark.parametrize( "common_llm_kwargs", [{ - "block_size": 8, + "block_size": 16, # 2 for small prompt, 256//8 for generated. "num_gpu_blocks_override": 2 + 256 // 8, "max_model_len": (2 + 256 // 8) * 8, diff --git a/tests/singlecard/spec_decode/e2e/test_ngram_correctness.py b/tests/singlecard/spec_decode/e2e/test_ngram_correctness.py index 18ed731..39130f9 100644 --- a/tests/singlecard/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/singlecard/spec_decode/e2e/test_ngram_correctness.py @@ -173,71 +173,69 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ["disable_logprobs"]) -# TODO: There is a problem with the preemptive scheduling in the current -# version, which makes this case fail. Please release this case after the -# preemptive scheduling problem is solved. -# @pytest.mark.parametrize( -# "common_llm_kwargs", -# [{ -# "block_size": 8, -# # 2 for small prompt, 256//8 for generated. -# "num_gpu_blocks_override": 2 + 256 // 8, -# "max_model_len": (2 + 256 // 8) * 8, +@pytest.mark.skipif(True, reason="Open it when preempt ready.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "block_size": 16, + # 2 for small prompt, 256//8 for generated. + "num_gpu_blocks_override": 2 + 256 // 8, + "max_model_len": (2 + 256 // 8) * 8, -# # Skip cuda graph recording for fast test. -# "enforce_eager": True, -# }]) -# @pytest.mark.parametrize("per_test_common_llm_kwargs", [ -# { -# "model_name": "JackFram/llama-160m", -# }, -# ]) -# @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -# @pytest.mark.parametrize("test_llm_kwargs", [ -# { -# "speculative_config": { -# "method": "ngram", -# "num_speculative_tokens": 5, -# "prompt_lookup_max": 3, -# }, -# "enable_chunked_prefill": False, -# }, -# { -# "speculative_config": { -# "method": "ngram", -# "num_speculative_tokens": 5, -# "prompt_lookup_max": 3, -# "disable_mqa_scorer": True, -# }, -# "enable_chunked_prefill": True, -# "max_num_batched_tokens": 4, -# "max_num_seqs": 4 -# }, -# ]) -# @pytest.mark.parametrize( -# "output_len", -# [ -# # Use small output len for fast test. -# 256, -# ]) -# @pytest.mark.parametrize("batch_size", [4]) -# @pytest.mark.parametrize("seed", [1]) -# def test_ngram_e2e_greedy_correctness_with_preemption( -# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, -# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, -# seed: int): -# """Verify greedy equality, even when some sequences are preempted mid- -# generation. -# """ -# run_equality_correctness_test(vllm_runner, -# common_llm_kwargs, -# per_test_common_llm_kwargs, -# baseline_llm_kwargs, -# test_llm_kwargs, -# batch_size, -# max_output_len=output_len, -# temperature=0, -# seed=seed) + # Skip cuda graph recording for fast test. + "enforce_eager": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + "model_name": "JackFram/llama-160m", + }, +]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_config": { + "method": "ngram", + "num_speculative_tokens": 5, + "prompt_lookup_max": 3, + }, + "enable_chunked_prefill": False, + }, + { + "speculative_config": { + "method": "ngram", + "num_speculative_tokens": 5, + "prompt_lookup_max": 3, + "disable_mqa_scorer": True, + }, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 + }, +]) +@pytest.mark.parametrize( + "output_len", + [ + # Use small output len for fast test. + 256, + ]) +@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("seed", [1]) +def test_ngram_e2e_greedy_correctness_with_preemption( + vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, + seed: int): + """Verify greedy equality, even when some sequences are preempted mid- + generation. + """ + run_equality_correctness_test(vllm_runner, + common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, + test_llm_kwargs, + batch_size, + max_output_len=output_len, + temperature=0, + seed=seed) @pytest.mark.parametrize(