[CI/UT] fix spec ut in vllm-ascend main and vllm main (#759)
### What this PR does / why we need it? #### 1. fix spec ut in vllm-ascend main and vllm main As https://github.com/vllm-project/vllm-ascend/pull/694 and https://github.com/vllm-project/vllm-ascend/pull/749 verify, the spec UT now passes with vllm-ascend main and vllm 0.8.5, but CI fails with vllm-ascend main and vllm main. I found the reason is a triton bug https://github.com/triton-lang/triton/issues/2266, but I didn't figure out why the bug does not affect vllm-ascend main with vllm 0.8.5; maybe the usage of triton has changed between vllm 0.8.5 and the latest main. As the bug describes, I changed the minimum block_size in the UT from 8 to 16, and the modification was verified locally to be effective. #### 2. modify the skip form of some cases. I converted some commented-out cases to the skipif form, which is more standardized. ### Does this PR introduce _any_ user-facing change? None ### How was this patch tested? CI Signed-off-by: mengwei805 <mengwei25@huawei.com>
This commit is contained in:
3
.github/workflows/vllm_ascend_test.yaml
vendored
3
.github/workflows/vllm_ascend_test.yaml
vendored
@@ -153,8 +153,7 @@ jobs:
|
|||||||
- "vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py"
|
- "vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py"
|
||||||
|
|
||||||
- name: Run vllm-project/vllm-ascend Speculative Decode test
|
- name: Run vllm-project/vllm-ascend Speculative Decode test
|
||||||
# speculative decode seems will cause oom issue, only disable it now on ci test with vLLM main
|
if: steps.filter_spec_decode.outputs.speculative_tests_changed == 'true' || github.event_name == 'schedule'
|
||||||
if: matrix.vllm_verison == 'v0.8.5.post1' && steps.filter_spec_decode.outputs.speculative_tests_changed == 'true' || github.event_name == 'schedule'
|
|
||||||
run: |
|
run: |
|
||||||
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
|
if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
|
||||||
pytest -sv tests/singlecard/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process
|
pytest -sv tests/singlecard/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process
|
||||||
|
|||||||
@@ -186,110 +186,109 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
|||||||
["disable_logprobs"])
|
["disable_logprobs"])
|
||||||
|
|
||||||
|
|
||||||
# TODO: Open it when vllm-ascend support graph mode and
|
@pytest.mark.skipif(True, reason="Open it when graph mode ready.")
|
||||||
# @pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
# "common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
# [{
|
[{
|
||||||
# "enforce_eager": False,
|
"enforce_eager": False,
|
||||||
|
|
||||||
# # Print spec metrics.
|
# Print spec metrics.
|
||||||
# "disable_log_stats": False,
|
"disable_log_stats": False,
|
||||||
|
|
||||||
# # Precision
|
# Precision
|
||||||
# "dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# # Main model
|
# Main model
|
||||||
# "model_name": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
# }])
|
}])
|
||||||
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
# @pytest.mark.parametrize("test_llm_kwargs", [
|
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||||
# {
|
{
|
||||||
# "speculative_config": {
|
"speculative_config": {
|
||||||
# "model": SPEC_MODEL,
|
"model": SPEC_MODEL,
|
||||||
# "num_speculative_tokens": MAX_SPEC_TOKENS,
|
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||||
# },
|
},
|
||||||
# },
|
},
|
||||||
# ])
|
])
|
||||||
# @pytest.mark.parametrize("output_len", [
|
@pytest.mark.parametrize("output_len", [
|
||||||
# 128,
|
128,
|
||||||
# ])
|
])
|
||||||
# @pytest.mark.parametrize("batch_size", [1, 32])
|
@pytest.mark.parametrize("batch_size", [1, 32])
|
||||||
# @pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
|
@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
|
||||||
# def test_medusa_e2e_greedy_correctness_cuda_graph(
|
def test_medusa_e2e_greedy_correctness_cuda_graph(
|
||||||
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
# seed: int, prefill_chunk_size: int):
|
seed: int, prefill_chunk_size: int):
|
||||||
# """Verify greedy equality with cuda graph enabled and different
|
"""Verify greedy equality with cuda graph enabled and different
|
||||||
# batch sizes."""
|
batch sizes."""
|
||||||
# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
|
maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
|
||||||
# run_equality_correctness_test(vllm_runner,
|
run_equality_correctness_test(vllm_runner,
|
||||||
# common_llm_kwargs,
|
common_llm_kwargs,
|
||||||
# per_test_common_llm_kwargs,
|
per_test_common_llm_kwargs,
|
||||||
# baseline_llm_kwargs,
|
baseline_llm_kwargs,
|
||||||
# test_llm_kwargs,
|
test_llm_kwargs,
|
||||||
# batch_size,
|
batch_size,
|
||||||
# max_output_len=output_len,
|
max_output_len=output_len,
|
||||||
# seed=seed,
|
seed=seed,
|
||||||
# temperature=0.0)
|
temperature=0.0)
|
||||||
|
|
||||||
# TODO: There is a problem with the preemptive scheduling in the current
|
|
||||||
# version, which makes this case fail. Please release this case after the
|
|
||||||
# preemptive scheduling problem is solved.
|
|
||||||
# @pytest.mark.parametrize(
|
|
||||||
# "common_llm_kwargs",
|
|
||||||
# [{
|
|
||||||
# "block_size": 8,
|
|
||||||
# # 2 for small prompt, 256//8 for generated.
|
|
||||||
# "num_gpu_blocks_override": 2 + 256 // 8,
|
|
||||||
# "max_model_len": (2 + 256 // 8) * 8,
|
|
||||||
|
|
||||||
# # Skip cuda graph recording for fast test.
|
@pytest.mark.skipif(True, reason="Open it when preempt ready.")
|
||||||
# "enforce_eager": True,
|
@pytest.mark.parametrize(
|
||||||
|
"common_llm_kwargs",
|
||||||
|
[{
|
||||||
|
"block_size": 16,
|
||||||
|
# 2 for small prompt, 256//8 for generated.
|
||||||
|
"num_gpu_blocks_override": 2 + 256 // 8,
|
||||||
|
"max_model_len": (2 + 256 // 8) * 8,
|
||||||
|
|
||||||
# # Precision
|
# Skip cuda graph recording for fast test.
|
||||||
# "dtype": PRECISION,
|
"enforce_eager": True,
|
||||||
|
|
||||||
# # Main model
|
# Precision
|
||||||
# "model_name": MAIN_MODEL,
|
"dtype": PRECISION,
|
||||||
# }])
|
|
||||||
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
# Main model
|
||||||
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
"model_name": MAIN_MODEL,
|
||||||
# @pytest.mark.parametrize("test_llm_kwargs", [
|
}])
|
||||||
# {
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
# "speculative_config": {
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
# "model": SPEC_MODEL,
|
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||||
# "num_speculative_tokens": MAX_SPEC_TOKENS,
|
{
|
||||||
# },
|
"speculative_config": {
|
||||||
# },
|
"model": SPEC_MODEL,
|
||||||
# ])
|
"num_speculative_tokens": MAX_SPEC_TOKENS,
|
||||||
# @pytest.mark.parametrize(
|
},
|
||||||
# "output_len",
|
},
|
||||||
# [
|
])
|
||||||
# # Use small output len for fast test.
|
@pytest.mark.parametrize(
|
||||||
# 128,
|
"output_len",
|
||||||
# ])
|
[
|
||||||
# @pytest.mark.parametrize("batch_size", [4])
|
# Use small output len for fast test.
|
||||||
# @pytest.mark.parametrize("seed", [1])
|
128,
|
||||||
# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
|
])
|
||||||
# def test_medusa_e2e_greedy_correctness_with_preemption(
|
@pytest.mark.parametrize("batch_size", [4])
|
||||||
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
@pytest.mark.parametrize("seed", [1])
|
||||||
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE)
|
||||||
# seed: int, prefill_chunk_size: int):
|
def test_medusa_e2e_greedy_correctness_with_preemption(
|
||||||
# """Verify greedy equality, even when some sequences are preempted mid-
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
# generation.
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
# """
|
seed: int, prefill_chunk_size: int):
|
||||||
# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
|
"""Verify greedy equality, even when some sequences are preempted mid-
|
||||||
# run_equality_correctness_test(vllm_runner,
|
generation.
|
||||||
# common_llm_kwargs,
|
"""
|
||||||
# per_test_common_llm_kwargs,
|
maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
|
||||||
# baseline_llm_kwargs,
|
run_equality_correctness_test(vllm_runner,
|
||||||
# test_llm_kwargs,
|
common_llm_kwargs,
|
||||||
# batch_size,
|
per_test_common_llm_kwargs,
|
||||||
# max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
# seed=seed,
|
test_llm_kwargs,
|
||||||
# temperature=0.0)
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|||||||
@@ -294,124 +294,120 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
|
|||||||
disable_seed=True)
|
disable_seed=True)
|
||||||
|
|
||||||
|
|
||||||
# TODO: There is a problem with the preemptive scheduling in the current
|
@pytest.mark.skipif(True, reason="Open it when preempt ready.")
|
||||||
# version, which makes this case fail. Please release this case after the
|
@pytest.mark.parametrize(
|
||||||
# preemptive scheduling problem is solved.
|
"common_llm_kwargs",
|
||||||
# @pytest.mark.parametrize(
|
[{
|
||||||
# "common_llm_kwargs",
|
"block_size": 16,
|
||||||
# [{
|
# 2 for small prompt, 256//8 for generated.
|
||||||
# "block_size": 8,
|
"num_gpu_blocks_override": 2 + 256 // 8,
|
||||||
# # 2 for small prompt, 256//8 for generated.
|
"max_model_len": (2 + 256 // 8) * 8,
|
||||||
# "num_gpu_blocks_override": 2 + 256 // 8,
|
|
||||||
# "max_model_len": (2 + 256 // 8) * 8,
|
|
||||||
|
|
||||||
# # Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
# "enforce_eager": True,
|
"enforce_eager": True,
|
||||||
|
|
||||||
# # Precision
|
# Precision
|
||||||
# "dtype": PRECISION,
|
"dtype": PRECISION,
|
||||||
|
|
||||||
# # Main model
|
# Main model
|
||||||
# "model_name": MAIN_MODEL,
|
"model_name": MAIN_MODEL,
|
||||||
# }])
|
}])
|
||||||
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
# @pytest.mark.parametrize("test_llm_kwargs", [
|
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||||
# {
|
{
|
||||||
# "speculative_config": {
|
"speculative_config": {
|
||||||
# "model": SPEC_MODEL,
|
"model": SPEC_MODEL,
|
||||||
# },
|
},
|
||||||
# },
|
},
|
||||||
# ])
|
])
|
||||||
# @pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
# "output_len",
|
"output_len",
|
||||||
# [
|
[
|
||||||
# # Use small output len for fast test.
|
# Use small output len for fast test.
|
||||||
# 128,
|
128,
|
||||||
# ])
|
])
|
||||||
# @pytest.mark.parametrize("batch_size", [4])
|
@pytest.mark.parametrize("batch_size", [4])
|
||||||
# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1)
|
@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1)
|
||||||
# @pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
# def test_mlp_e2e_greedy_correctness_with_preemption(
|
def test_mlp_e2e_greedy_correctness_with_preemption(
|
||||||
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
# prefill_chunk_size: int, seed: int):
|
prefill_chunk_size: int, seed: int):
|
||||||
# """Verify greedy equality, even when some sequences are preempted mid-
|
"""Verify greedy equality, even when some sequences are preempted mid-
|
||||||
# generation.
|
generation.
|
||||||
# """
|
"""
|
||||||
# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
|
maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
|
||||||
# run_equality_correctness_test(vllm_runner,
|
run_equality_correctness_test(vllm_runner,
|
||||||
# common_llm_kwargs,
|
common_llm_kwargs,
|
||||||
# per_test_common_llm_kwargs,
|
per_test_common_llm_kwargs,
|
||||||
# baseline_llm_kwargs,
|
baseline_llm_kwargs,
|
||||||
# test_llm_kwargs,
|
test_llm_kwargs,
|
||||||
# batch_size,
|
batch_size,
|
||||||
# max_output_len=output_len,
|
max_output_len=output_len,
|
||||||
# seed=seed,
|
seed=seed,
|
||||||
# temperature=0.0)
|
temperature=0.0)
|
||||||
|
|
||||||
# TODO: There is a problem with the preemptive scheduling in the current
|
|
||||||
# version, which makes this case fail. Please release this case after the
|
|
||||||
# preemptive scheduling problem is solved.
|
|
||||||
# @pytest.mark.parametrize(
|
|
||||||
# "common_llm_kwargs",
|
|
||||||
# [{
|
|
||||||
# "block_size": 8,
|
|
||||||
# # 2 for small prompt, 256//8 for generated.
|
|
||||||
# "num_gpu_blocks_override": 2 + 256 // 8,
|
|
||||||
# "max_model_len": (2 + 256 // 8) * 8,
|
|
||||||
|
|
||||||
# # Skip cuda graph recording for fast test.
|
@pytest.mark.skipif(True, reason="Open it when preempt ready.")
|
||||||
# "enforce_eager": True,
|
@pytest.mark.parametrize(
|
||||||
|
"common_llm_kwargs",
|
||||||
|
[{
|
||||||
|
"block_size": 16,
|
||||||
|
# 2 for small prompt, 256//8 for generated.
|
||||||
|
"num_gpu_blocks_override": 2 + 256 // 8,
|
||||||
|
"max_model_len": (2 + 256 // 8) * 8,
|
||||||
|
|
||||||
# # Precision
|
# Skip cuda graph recording for fast test.
|
||||||
# "dtype": PRECISION,
|
"enforce_eager": True,
|
||||||
|
|
||||||
# # Main model
|
# Precision
|
||||||
# "model_name": MAIN_MODEL,
|
"dtype": PRECISION,
|
||||||
# }])
|
|
||||||
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
|
||||||
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
|
||||||
# @pytest.mark.parametrize("test_llm_kwargs", [
|
|
||||||
# {
|
|
||||||
# "speculative_config": {
|
|
||||||
# "model": SPEC_MODEL,
|
|
||||||
# },
|
|
||||||
# },
|
|
||||||
# ])
|
|
||||||
# @pytest.mark.parametrize(
|
|
||||||
# "output_len",
|
|
||||||
# [
|
|
||||||
# # Use small output len for fast test.
|
|
||||||
# 128,
|
|
||||||
# ])
|
|
||||||
# @pytest.mark.parametrize("batch_size", [4])
|
|
||||||
# @pytest.mark.parametrize("seed", [1])
|
|
||||||
# @pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1)
|
|
||||||
# def test_mlp_e2e_greedy_correctness_with_padding(
|
|
||||||
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
|
||||||
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
|
||||||
# prefill_chunk_size: int, seed: int):
|
|
||||||
# """Verify greedy equality when the vocab dimension is padded
|
|
||||||
# """
|
|
||||||
# maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
|
|
||||||
|
|
||||||
# # Default pad_to is 64, test model has vocab_size of 32000
|
# Main model
|
||||||
# def patched_pad_vocab_size(vocab_size, pad_to=None):
|
"model_name": MAIN_MODEL,
|
||||||
# return pad_vocab_size(vocab_size, pad_to=32064)
|
}])
|
||||||
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||||
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
|
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||||
|
{
|
||||||
|
"speculative_config": {
|
||||||
|
"model": SPEC_MODEL,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
])
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"output_len",
|
||||||
|
[
|
||||||
|
# Use small output len for fast test.
|
||||||
|
128,
|
||||||
|
])
|
||||||
|
@pytest.mark.parametrize("batch_size", [4])
|
||||||
|
@pytest.mark.parametrize("seed", [1])
|
||||||
|
@pytest.mark.parametrize("prefill_chunk_size", PREFILL_CHUNK_SIZE_1)
|
||||||
|
def test_mlp_e2e_greedy_correctness_with_padding(
|
||||||
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
|
prefill_chunk_size: int, seed: int):
|
||||||
|
"""Verify greedy equality when the vocab dimension is padded
|
||||||
|
"""
|
||||||
|
maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
|
||||||
|
|
||||||
# # NOTE: Compared with vLLM, the patch method has been modified
|
# Default pad_to is 64, test model has vocab_size of 32000
|
||||||
# from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size
|
def patched_pad_vocab_size(vocab_size, pad_to=None):
|
||||||
# pad_vocab_size = patched_pad_vocab_size
|
return pad_vocab_size(vocab_size, pad_to=32064)
|
||||||
# run_equality_correctness_test(vllm_runner,
|
|
||||||
# common_llm_kwargs,
|
# NOTE: Compared with vLLM, the patch method has been modified
|
||||||
# per_test_common_llm_kwargs,
|
pad_vocab_size = patched_pad_vocab_size # noqa: F811
|
||||||
# baseline_llm_kwargs,
|
run_equality_correctness_test(vllm_runner,
|
||||||
# test_llm_kwargs,
|
common_llm_kwargs,
|
||||||
# batch_size,
|
per_test_common_llm_kwargs,
|
||||||
# max_output_len=output_len,
|
baseline_llm_kwargs,
|
||||||
# seed=seed,
|
test_llm_kwargs,
|
||||||
# temperature=0.0)
|
batch_size,
|
||||||
|
max_output_len=output_len,
|
||||||
|
seed=seed,
|
||||||
|
temperature=0.0)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|||||||
@@ -307,7 +307,7 @@ def test_mtp_e2e_quant_greedy_correctness_torchair_graph(
|
|||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"common_llm_kwargs",
|
"common_llm_kwargs",
|
||||||
[{
|
[{
|
||||||
"block_size": 8,
|
"block_size": 16,
|
||||||
# 2 for small prompt, 256//8 for generated.
|
# 2 for small prompt, 256//8 for generated.
|
||||||
"num_gpu_blocks_override": 2 + 256 // 8,
|
"num_gpu_blocks_override": 2 + 256 // 8,
|
||||||
"max_model_len": (2 + 256 // 8) * 8,
|
"max_model_len": (2 + 256 // 8) * 8,
|
||||||
|
|||||||
@@ -173,71 +173,69 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
|
|||||||
["disable_logprobs"])
|
["disable_logprobs"])
|
||||||
|
|
||||||
|
|
||||||
# TODO: There is a problem with the preemptive scheduling in the current
|
@pytest.mark.skipif(True, reason="Open it when preempt ready.")
|
||||||
# version, which makes this case fail. Please release this case after the
|
@pytest.mark.parametrize(
|
||||||
# preemptive scheduling problem is solved.
|
"common_llm_kwargs",
|
||||||
# @pytest.mark.parametrize(
|
[{
|
||||||
# "common_llm_kwargs",
|
"block_size": 16,
|
||||||
# [{
|
# 2 for small prompt, 256//8 for generated.
|
||||||
# "block_size": 8,
|
"num_gpu_blocks_override": 2 + 256 // 8,
|
||||||
# # 2 for small prompt, 256//8 for generated.
|
"max_model_len": (2 + 256 // 8) * 8,
|
||||||
# "num_gpu_blocks_override": 2 + 256 // 8,
|
|
||||||
# "max_model_len": (2 + 256 // 8) * 8,
|
|
||||||
|
|
||||||
# # Skip cuda graph recording for fast test.
|
# Skip cuda graph recording for fast test.
|
||||||
# "enforce_eager": True,
|
"enforce_eager": True,
|
||||||
# }])
|
}])
|
||||||
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [
|
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
|
||||||
# {
|
{
|
||||||
# "model_name": "JackFram/llama-160m",
|
"model_name": "JackFram/llama-160m",
|
||||||
# },
|
},
|
||||||
# ])
|
])
|
||||||
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||||
# @pytest.mark.parametrize("test_llm_kwargs", [
|
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||||
# {
|
{
|
||||||
# "speculative_config": {
|
"speculative_config": {
|
||||||
# "method": "ngram",
|
"method": "ngram",
|
||||||
# "num_speculative_tokens": 5,
|
"num_speculative_tokens": 5,
|
||||||
# "prompt_lookup_max": 3,
|
"prompt_lookup_max": 3,
|
||||||
# },
|
},
|
||||||
# "enable_chunked_prefill": False,
|
"enable_chunked_prefill": False,
|
||||||
# },
|
},
|
||||||
# {
|
{
|
||||||
# "speculative_config": {
|
"speculative_config": {
|
||||||
# "method": "ngram",
|
"method": "ngram",
|
||||||
# "num_speculative_tokens": 5,
|
"num_speculative_tokens": 5,
|
||||||
# "prompt_lookup_max": 3,
|
"prompt_lookup_max": 3,
|
||||||
# "disable_mqa_scorer": True,
|
"disable_mqa_scorer": True,
|
||||||
# },
|
},
|
||||||
# "enable_chunked_prefill": True,
|
"enable_chunked_prefill": True,
|
||||||
# "max_num_batched_tokens": 4,
|
"max_num_batched_tokens": 4,
|
||||||
# "max_num_seqs": 4
|
"max_num_seqs": 4
|
||||||
# },
|
},
|
||||||
# ])
|
])
|
||||||
# @pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
# "output_len",
|
"output_len",
|
||||||
# [
|
[
|
||||||
# # Use small output len for fast test.
|
# Use small output len for fast test.
|
||||||
# 256,
|
256,
|
||||||
# ])
|
])
|
||||||
# @pytest.mark.parametrize("batch_size", [4])
|
@pytest.mark.parametrize("batch_size", [4])
|
||||||
# @pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
# def test_ngram_e2e_greedy_correctness_with_preemption(
|
def test_ngram_e2e_greedy_correctness_with_preemption(
|
||||||
# vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||||
# baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||||
# seed: int):
|
seed: int):
|
||||||
# """Verify greedy equality, even when some sequences are preempted mid-
|
"""Verify greedy equality, even when some sequences are preempted mid-
|
||||||
# generation.
|
generation.
|
||||||
# """
|
"""
|
||||||
# run_equality_correctness_test(vllm_runner,
|
run_equality_correctness_test(vllm_runner,
|
||||||
# common_llm_kwargs,
|
common_llm_kwargs,
|
||||||
# per_test_common_llm_kwargs,
|
per_test_common_llm_kwargs,
|
||||||
# baseline_llm_kwargs,
|
baseline_llm_kwargs,
|
||||||
# test_llm_kwargs,
|
test_llm_kwargs,
|
||||||
# batch_size,
|
batch_size,
|
||||||
# max_output_len=output_len,
|
max_output_len=output_len,
|
||||||
# temperature=0,
|
temperature=0,
|
||||||
# seed=seed)
|
seed=seed)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|||||||
Reference in New Issue
Block a user