add qwen3
vllm-v0.6.2/tests/multi_step/__init__.py (new file, 0 lines)

vllm-v0.6.2/tests/multi_step/test_correctness_async_llm.py (new file, 239 lines)
@@ -0,0 +1,239 @@
# Test the AsyncLLMEngine with multi-step decoding
from typing import List, Optional

import pytest

'''
=============================
Modify by vllm_mlu
=============================
@brief(override_backend_env_variable): Only the MLU_FLASH_ATTN backend is
supported, so there is no need to override the backend env variable.
'''
# from tests.kernels.utils import override_backend_env_variable
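# For context: the upstream helper disabled above roughly amounts to a one-line
# monkeypatch of the attention-backend environment variable, e.g.
# (illustrative sketch only; the exact variable name is an assumption here):
#
#     monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attention_backend)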

from ..models.utils import check_logprobs_close
from ..utils import (completions_with_server_args, get_client_text_generations,
                     get_client_text_logprob_generations)

MODELS = [
    "JackFram/llama-160m",
]
NUM_SCHEDULER_STEPS = [8]  # Multi-step decoding steps
NUM_PROMPTS = [10]

DEFAULT_SERVER_ARGS: List[str] = [
    "--disable-log-requests",
    "--worker-use-ray",
    "--gpu-memory-utilization",
    "0.3",
    "--swap-space",
    "16",
]
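# For illustration, the multi-step test server below ends up being launched by
# `completions_with_server_args` with CLI flags roughly like (sketch for one
# parametrization; the exact serving entrypoint is handled by the test helper):
#
#     --disable-log-requests --worker-use-ray --gpu-memory-utilization 0.3
#     --swap-space 16 --num-scheduler-steps 8
#     --tensor-parallel-size 2 --pipeline-parallel-size 1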


'''
=============================
Modify by vllm_mlu
=============================
@brief(tp_size&pp_size): Only test tp_size * pp_size <= 2
@brief(attention_backend): Only test the MLU_FLASH_ATTN backend
'''
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize(("tp_size, pp_size"), [
    (2, 1),
    (1, 2),
])
@pytest.mark.parametrize("eager_mode", [False, True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("is_async", [True])
@pytest.mark.parametrize("attention_backend", ["MLU_FLASH_ATTN"])
@pytest.mark.parametrize("enable_chunked_prefill", [True, False])
@pytest.mark.asyncio
async def test_multi_step(
    example_prompts,
    model: str,
    tp_size: int,
    pp_size: int,
    eager_mode: int,
    num_scheduler_steps: int,
    num_prompts: int,
    is_async: bool,
    num_logprobs: Optional[int],
    attention_backend: str,
    enable_chunked_prefill: bool,
    monkeypatch,
) -> None:
    """Test vLLM engine with multi-step scheduling in an OpenAI-protocol
    client/server environment.

    Set up an engine with single-step scheduling as a ground-truth reference.

    Send a completions API request to both engines with the same prompts.

    Validate:
    * Generated tokens match
    * Generated logprobs are all very close

    Args:
      example_prompts: test fixture providing example prompts
      model: model under test (same for single- and multi-step engines)
      tp_size: degree of tensor-parallelism
      pp_size: degree of pipeline-parallelism
      eager_mode: whether to launch the multi-step engine with --enforce-eager
      num_scheduler_steps: for multi-step scheduling, GPU-side steps per
        GPU -> CPU output transfer
      num_prompts: number of example prompts under test
      num_logprobs: corresponds to the `logprobs` argument to the OpenAI
        completions endpoint; `None` -> no logprobs
    """
    if enable_chunked_prefill and \
            (pp_size > 1 or attention_backend != "FLASH_ATTN"):
        pytest.skip("Multi-step with Chunked-Prefill only supports "
                    "PP=1 and FLASH_ATTN backend")

    # override_backend_env_variable(monkeypatch, attention_backend)

    prompts = example_prompts
    if len(prompts) < num_prompts:
        prompts = prompts * ((num_prompts // len(prompts)) + 1)
    prompts = prompts[:num_prompts]
    assert len(prompts) == num_prompts
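    # Worked example (illustrative; assumes the example_prompts fixture holds
    # 8 prompts): with num_prompts == 10 the list is tiled to 8 * 2 = 16
    # prompts and then truncated back to the first 10.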

    server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
    ms_server_args = DEFAULT_SERVER_ARGS + \
        ["--num-scheduler-steps", f"{num_scheduler_steps}"]

    if not is_async:
        ms_server_args += ["--disable-async-output-proc"]

    if eager_mode:
        ms_server_args.append("--enforce-eager")

    if enable_chunked_prefill:
        ms_server_args.append("--enable-chunked-prefill")

    distributed_args = [
        "--tensor-parallel-size",
        str(tp_size),
        "--pipeline-parallel-size",
        str(pp_size),
    ]

    # Spin up client/server & issue completion API requests.
    # The default `max_wait_seconds` is 240, but it was empirically raised
    # *just for this test* due to observed timeouts in GHA CI.
    ref_completions = await completions_with_server_args(
        prompts,
        model,
        server_args + distributed_args,
        num_logprobs,
        max_wait_seconds=5 * 240)
    test_completions = await completions_with_server_args(
        prompts,
        model,
        ms_server_args + distributed_args,
        num_logprobs,
        max_wait_seconds=5 * 240)

    # Assert multi-step scheduling produces identical tokens
    # to single-step scheduling.
    ref_generations = get_client_text_generations(ref_completions)
    test_generations = get_client_text_generations(test_completions)
    assert ref_generations == test_generations

    # Assert multi-step scheduling produces nearly-identical logprobs
    # to single-step scheduling.
    ref_text_logprobs = get_client_text_logprob_generations(ref_completions)
    test_text_logprobs = get_client_text_logprob_generations(test_completions)
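    # Note: `name_0` / `name_1` below are only display labels used in mismatch
    # messages; in this test both sets of outputs come from vLLM servers
    # (single-step reference vs. multi-step under test).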
    check_logprobs_close(
        outputs_0_lst=ref_text_logprobs,
        outputs_1_lst=test_text_logprobs,
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize(("tp_size, pp_size"), [
    (1, 2),
])
@pytest.mark.asyncio
async def test_multi_step_pp_smoke(
    tp_size: int,
    pp_size: int,
    monkeypatch,
) -> None:
    """
    Smoke test for the vLLM engine with multi-step scheduling in an
    OpenAI-protocol client/server environment.

    This test compares the outputs of multi-step scheduling and
    single-step scheduling. Notably, this test lets the engines generate
    more tokens (default is 5) and tests for an exact match over all the
    tokens.

    Args:
      tp_size: degree of tensor-parallelism
      pp_size: degree of pipeline-parallelism
    """

    model = "JackFram/llama-160m"
    num_scheduler_steps = 8
    attention_backend = "MLU_FLASH_ATTN"
    max_num_seqs = 3

    # override_backend_env_variable(monkeypatch, attention_backend)

    # Prompt from the ShareGPT dataset
    prompts = [
        "in the jtbd context whats a push?",  # codespell:ignore
        "in the jtbd context whats a push?",  # codespell:ignore
        "in the jtbd context whats a push?",  # codespell:ignore
        "in the jtbd context whats a push?",  # codespell:ignore
    ]
    # Use varying max_tokens to introduce scheduling randomness.
    max_tokens = [10 * i for i in range(1, len(prompts) + 1)]
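    # e.g. with the 4 prompts above this yields max_tokens == [10, 20, 30, 40]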
    assert len(prompts) == len(max_tokens)

    test_args = [
        "--tensor-parallel-size",
        str(tp_size), "--pipeline-parallel-size",
        str(pp_size), "--max-num-seqs",
        str(max_num_seqs)
    ]

    server_args = DEFAULT_SERVER_ARGS + test_args
    ms_server_args = DEFAULT_SERVER_ARGS + \
        ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \
        test_args

    # Spin up client/server & issue completion API requests.
    # The default `max_wait_seconds` is 240, but it was empirically raised
    # *just for this test* due to observed timeouts in GHA CI.
    ref_completions = await completions_with_server_args(
        prompts=prompts,
        model_name=model,
        server_cli_args=server_args,
        num_logprobs=None,
        max_wait_seconds=5 * 240,
        max_tokens=max_tokens)

    test_completions = await completions_with_server_args(
        prompts=prompts,
        model_name=model,
        server_cli_args=ms_server_args,
        num_logprobs=None,
        max_wait_seconds=5 * 240,
        max_tokens=max_tokens)

    # Assert multi-step scheduling produces identical tokens
    # to single-step scheduling.
    ref_generations = get_client_text_generations(ref_completions)
    test_generations = get_client_text_generations(test_completions)

    assert ref_generations == test_generations
vllm-v0.6.2/tests/multi_step/test_correctness_llm.py (new file, 352 lines)
@@ -0,0 +1,352 @@
# Test the LLMEngine with multi-step decoding

import copy
from typing import Optional

import pytest

from ..models.utils import check_logprobs_close, check_outputs_equal

MODELS = [
    "JackFram/llama-160m",
]
NUM_SCHEDULER_STEPS = [8]  # Multi-step decoding steps
NUM_PROMPTS = [10]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs", [None, 5])
def test_multi_step_llm(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    tp_size: int,
    enable_chunked_prefill: bool,
    max_tokens: int,
    enforce_eager: int,
    num_scheduler_steps: int,
    num_prompts: int,
    num_logprobs: Optional[int],
) -> None:
    """Test vLLM engine with multi-step scheduling via sync LLM Engine.

    Set up a HuggingFace (HF) transformers model as a ground-truth reference.

    Prompt them with the same example prompts.

    Validate:
    * Generated tokens match
    * Generated logprobs are all very close

    Args:
      hf_runner: HF transformers model runner fixture
      vllm_runner: vLLM model runner fixture
      example_prompts: test fixture providing example prompts
      model: model under test (same for single- and multi-step engines)
      dtype: tensor datatype for engine to utilize
      tp_size: degree of tensor-parallelism
      enable_chunked_prefill: chunked-prefill on/off
      max_tokens: the maximum number of tokens to generate
      enforce_eager: whether to enforce eager-mode execution
      num_scheduler_steps: for multi-step scheduling, GPU-side steps per
        GPU -> CPU output transfer
      num_prompts: number of example prompts under test
      num_logprobs: corresponds to the `logprobs` argument to the OpenAI
        completions endpoint; `None` -> 1 logprob returned.
    """

    prompts = example_prompts
    if len(prompts) < num_prompts:
        prompts = prompts * ((num_prompts // len(prompts)) + 1)
    prompts = prompts[:num_prompts]
    assert len(prompts) == num_prompts

    with vllm_runner(
            model,
            dtype=dtype,
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.3,
            tensor_parallel_size=tp_size,
            enable_chunked_prefill=enable_chunked_prefill,
            num_scheduler_steps=num_scheduler_steps,
    ) as vllm_model:
        vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
                        if num_logprobs is None else
                        vllm_model.generate_greedy_logprobs(
                            prompts, max_tokens, num_logprobs))

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
                      if num_logprobs is None else
                      hf_model.generate_greedy_logprobs_limit(
                          prompts, max_tokens, num_logprobs))

    if num_logprobs is None:
        check_outputs_equal(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
    else:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs,num_prompt_logprobs", [(5, 5)])
def test_multi_step_llm_w_prompt_logprobs(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    tp_size: int,
    max_tokens: int,
    enforce_eager: int,
    num_scheduler_steps: int,
    num_prompts: int,
    num_logprobs: Optional[int],
    num_prompt_logprobs: Optional[int],
) -> None:
    """Test prompt logprobs with multi-step scheduling via sync LLM Engine.

    Set up a vLLM engine instance w/ single-step scheduling as a ground-truth
    reference.

    Prompt them with the same example prompts.

    Validate:
    * All generated logprobs are very close

    Args:
      vllm_runner: vLLM model runner fixture
      example_prompts: test fixture providing example prompts
      model: model under test (same for single- and multi-step engines)
      dtype: tensor datatype for engine to utilize
      tp_size: degree of tensor-parallelism
      max_tokens: the maximum number of tokens to generate
      enforce_eager: whether to enforce eager-mode execution
      num_scheduler_steps: for multi-step scheduling, GPU-side steps per
        GPU -> CPU output transfer
      num_prompts: number of example prompts under test
      num_logprobs: corresponds to the `logprobs` argument to the OpenAI
        completions endpoint; `None` -> no logprobs
      num_prompt_logprobs: number of logprobs to return for each prompt token;
        note that this argument is not supported by the
        OpenAI completions endpoint.
    """

    prompts = example_prompts
    if len(prompts) < num_prompts:
        prompts = prompts * ((num_prompts // len(prompts)) + 1)
    prompts = prompts[:num_prompts]
    assert len(prompts) == num_prompts

    with vllm_runner(
            model,
            dtype=dtype,
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.3,
            tensor_parallel_size=tp_size,
            num_scheduler_steps=num_scheduler_steps,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            prompts,
            max_tokens,
            num_logprobs,
            num_prompt_logprobs=num_prompt_logprobs)

    with vllm_runner(
            model,
            dtype=dtype,
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.3,
            tensor_parallel_size=tp_size,
    ) as vllm_model:
        single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
            prompts,
            max_tokens,
            num_logprobs,
            num_prompt_logprobs=num_prompt_logprobs)

    check_logprobs_close(
        outputs_0_lst=single_step_vllm_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs", [None, 5])
def test_multi_step_llm_chunked_prefill_prefix_cache(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    tp_size: int,
    max_tokens: int,
    enforce_eager: int,
    num_scheduler_steps: int,
    num_prompts: int,
    num_logprobs: Optional[int],
) -> None:
    """Test vLLM engine with multi-step+"single-step chunked prefill"+APC.

    Set up a contrived scenario which tests for a possible failure mode of
    scheduling with multi-step+"single-step chunked prefill"+APC.

    "single-step chunked prefill" here refers to the current vLLM multi-step+
    chunked-prefill implementation, which requires that a prefill may only
    be scheduled in the same step as decodes if the prefill prompt fits in a
    single chunk (note that "complete" multi-step+chunked-prefill would allow
    a prefill to span multiple chunks & multiple steps, but that is not yet
    the case.)

    "APC" is short for "automatic prefix caching".

    This test creates a scenario where the scheduler must decide whether/how
    to schedule a prefill with a prompt that exceeds the available token budget.
    The correct behavior for multi-step+"single-step chunked prefill"+APC is to
    put off scheduling the prefill until a future step.

    Validate that:
    * Multi-step kernels do not raise an exception due to incorrect scheduler
      behavior
    * Generated tokens match between
      multi-step+"single-step chunked prefill"+APC and
      single-step scheduling
    * (If logprobs are enabled) generated logprobs are close enough

    Args:
      vllm_runner: vLLM model runner fixture
      example_prompts: test fixture providing example prompts
      model: model under test (same for single- and multi-step engines)
      dtype: tensor datatype for engine to utilize
      tp_size: degree of tensor-parallelism
      max_tokens: the maximum number of tokens to generate
      enforce_eager: whether to enforce eager-mode execution
      num_scheduler_steps: for multi-step scheduling, GPU-side steps per
        GPU -> CPU output transfer
      num_prompts: number of example prompts under test
      num_logprobs: corresponds to the `logprobs` argument to the OpenAI
        completions endpoint; `None` -> 1 logprob returned.
    """

    # Set up a contrived test for correct scheduling behavior with
    # multi-step+"single-step chunked prefill"+APC.
    #
    # Assume block_size=16
    #
    # Assume max_num_batched_tokens=48
    #   => Per-step token budget=48
    #
    # 1. Scheduler schedules 0th prompt (24 tokens)
    #    => Remaining token budget=24
    # 2. Scheduler attempts to schedule 1st prompt (30 tokens)
    #    * 30 tokens exceeds the 24-token remaining budget
    #    * Correct behavior: do not schedule this prompt in this step
    #    * Incorrect behavior: schedule a prompt chunk
    #      * `do_sample=False` for this prompt in this step
    #      * Chunk size = (remaining tokens // block size) * block size
    #
    # The incorrect scheduling behavior, if it occurs, will cause an exception
    # in the model runner resulting from `do_sample=False`.
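    # Worked numbers for the scenario above: remaining budget = 48 - 24 = 24,
    # and the (incorrect) chunk size would be (24 // 16) * 16 = 16 tokens.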
    assert len(example_prompts) >= 2
    challenge_prompts = copy.deepcopy(example_prompts)
    challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient '
                            'inference and serving engine for LLMs.\n'
                            )  # 24 tok
    challenge_prompts[1] = (
        'Briefly describe the major milestones in the '
        'development of artificial intelligence from 1950 to 2020.\n'
    )  # 30 tok

    # If necessary, adjust the length of `challenge_prompts` to match
    # `num_prompts`
    if len(challenge_prompts) < num_prompts:
        challenge_prompts = (challenge_prompts *
                             ((num_prompts // len(challenge_prompts)) + 1))
    challenge_prompts = challenge_prompts[:num_prompts]
    assert len(challenge_prompts) == num_prompts

    # Baseline: multi-step scheduling without chunked prefill or APC
    with vllm_runner(
            model,
            dtype=dtype,
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.3,
            tensor_parallel_size=tp_size,
            num_scheduler_steps=num_scheduler_steps,
            max_model_len=48,
            max_num_batched_tokens=48,
            max_num_seqs=4,
            block_size=16,
    ) as vllm_model:
        outputs_baseline = (vllm_model.generate_greedy(
            challenge_prompts, max_tokens) if num_logprobs is None else
                            vllm_model.generate_greedy_logprobs(
                                challenge_prompts, max_tokens, num_logprobs))

    # multi-step+"single-step chunked prefill"+APC
    with vllm_runner(
            model,
            dtype=dtype,
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.3,
            tensor_parallel_size=tp_size,
            enable_chunked_prefill=True,
            enable_prefix_caching=True,
            num_scheduler_steps=num_scheduler_steps,
            max_model_len=48,
            max_num_batched_tokens=48,
            max_num_seqs=4,
            block_size=16,
    ) as vllm_model:
        outputs_w_features = (vllm_model.generate_greedy(
            challenge_prompts, max_tokens) if num_logprobs is None else
                              vllm_model.generate_greedy_logprobs(
                                  challenge_prompts, max_tokens, num_logprobs))

    if num_logprobs is None:
        # No-logprobs test
        check_outputs_equal(
            outputs_0_lst=outputs_baseline,
            outputs_1_lst=outputs_w_features,
            name_0="multi-step",
            name_1="multi-step+features",
        )
    else:
        # Yes-logprobs test
        check_logprobs_close(
            outputs_0_lst=outputs_baseline,
            outputs_1_lst=outputs_w_features,
            name_0="multi-step",
            name_1="multi-step+features",
        )