forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
69
vllm-v0.6.2/tests/models/decoder_only/language/test_aqlm.py
Normal file
69
vllm-v0.6.2/tests/models/decoder_only/language/test_aqlm.py
Normal file
@@ -0,0 +1,69 @@
|
||||
"""Compare the outputs of a AQLM model between vLLM and HF Transformers
|
||||
|
||||
Run `pytest tests/models/decoder_only/language/test_aqlm.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
# These ground truth generations were generated using `transformers==4.38.1
|
||||
# aqlm==1.1.0 torch==2.2.0`
|
||||
# and the below code:
|
||||
# ```python
|
||||
# from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
# model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
|
||||
# quantized_model = AutoModelForCausalLM.from_pretrained(model_id,
|
||||
# torch_dtype="auto", device_map="cuda").cuda()
|
||||
# tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
# outputs = []
|
||||
# for prompt in example_prompts:
|
||||
# input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")
|
||||
# hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32)
|
||||
# outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:]))
|
||||
# print(outputs)
|
||||
# ```
|
||||
# Reference completions used by `test_models` in this file. The test indexes
# this list with the position of each entry of `example_prompts`, so the order
# here must follow the order of the prompts.
ground_truth_generations = [
    '\n### Features\n\n- **High-throughput**: v',
    # Adjacent string literals are concatenated into a single entry.
    'The major milestones in the development of artificial intelligence from '
    '195',
    'Compare and contrast artificial intelligence with human intelligence in '
    'terms of processing information. The',
    'Explain the difference between supervised and unsupervised learning.'
    '\nExplain',
    'Write a short story about a robot that dreams for the first time. The',
    'Analyze the impact of the COVID-19 pandemic on global economic',
    'The Mona Lisa is a painting by Leonardo da Vinci, and it',
    'The early bird catches the worm.\nThe early bird catches the'
]
|
||||
|
||||
|
||||
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("aqlm"),
                    reason="AQLM is not supported on this GPU type.")
@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("num_logprobs", [1])
def test_models(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare greedy vLLM generations of an AQLM model to stored references.

    The model is run once over ``example_prompts`` and the generated text of
    each prompt must equal the entry of ``ground_truth_generations`` at the
    same position.

    Args:
        vllm_runner: Fixture called as ``vllm_runner(model, dtype=...)`` and
            used as a context manager yielding the model wrapper.
        example_prompts: Fixture providing the list of prompts.
        model: Name of the checkpoint to load.
        dtype: Dtype string forwarded to the runner.
        max_tokens: Number of tokens to generate per prompt.
        num_logprobs: Number of logprobs requested per generated token.
    """
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    # Fail with a readable message instead of an IndexError (or a silently
    # shortened comparison) when the three sequences do not line up.
    assert len(vllm_outputs) == len(example_prompts), (
        f"Got {len(vllm_outputs)} outputs for "
        f"{len(example_prompts)} prompts")
    assert len(example_prompts) <= len(ground_truth_generations), (
        f"Only {len(ground_truth_generations)} reference generations for "
        f"{len(example_prompts)} prompts")

    # Walk the prompts together with their reference and generated text.
    for prompt, reference, (_, vllm_output_str, _) in zip(
            example_prompts, ground_truth_generations, vllm_outputs):
        print("Prompt:          ", repr(prompt))
        print("Reference output:", repr(reference))
        print("vLLM output:     ", repr(vllm_output_str))
        assert vllm_output_str == reference
|
||||
@@ -0,0 +1,70 @@
|
||||
"""Compare the outputs of HF and vLLM when using greedy sampling.
|
||||
|
||||
This tests bigger models and uses half precision.
|
||||
|
||||
Run `pytest tests/models/test_big_models.py`.
|
||||
"""
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
@brief(MODELS): Only test Llama-2-7b-hf, disable gpt-j-6b.
|
||||
'''
|
||||
# Checkpoints the tests in this file are parametrized over. Commented-out
# entries are kept for reference, each with the reason it is disabled.
MODELS = [
    "meta-llama/Llama-2-7b-hf",
    # "mistralai/Mistral-7B-v0.1",  # Tested by test_mistral.py
    # "Deci/DeciLM-7b",  # Broken
    # "tiiuae/falcon-7b",  # Broken
    # "EleutherAI/gpt-j-6b",  # Disabled by vllm_mlu
    # "mosaicml/mpt-7b",  # Broken
    # "Qwen/Qwen1.5-0.5B",  # Broken
]
|
||||
|
||||
# TODO: remove this once CPU float16 support is ready.
# Half precision is requested only when torch reports an available CUDA
# device; otherwise the tests fall back to "float".
target_dtype = "half" if torch.cuda.is_available() else "float"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [32])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Require HF and vLLM greedy generations to be equal for each prompt.

    Both runners are given the same model name and dtype and generate
    ``max_tokens`` tokens for every entry of ``example_prompts``; the two
    result lists are then handed to ``check_outputs_equal``.
    """
    # Reference run with the HF runner.
    with hf_runner(model, dtype=dtype) as reference_model:
        reference_outputs = reference_model.generate_greedy(
            example_prompts, max_tokens)

    # Run under test with the vLLM runner.
    with vllm_runner(model, dtype=dtype) as candidate_model:
        candidate_outputs = candidate_model.generate_greedy(
            example_prompts, max_tokens)

    comparison = {
        "outputs_0_lst": reference_outputs,
        "outputs_1_lst": candidate_outputs,
        "name_0": "hf",
        "name_1": "vllm",
    }
    check_outputs_equal(**comparison)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
def test_model_print(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Load the model with vLLM and print the underlying model object.

    The test passes as long as printing does not raise; it is meant to verify
    that the model's extra_repr can be printed correctly.
    """
    with vllm_runner(model, dtype=dtype) as vllm_model:
        # Walk down from the runner wrapper to the model held by the driver
        # worker's model runner, then print it.
        executor = vllm_model.model.llm_engine.model_executor
        loaded_model = executor.driver_worker.model_runner.model
        print(loaded_model)
|
||||
100
vllm-v0.6.2/tests/models/decoder_only/language/test_fp8.py
Normal file
100
vllm-v0.6.2/tests/models/decoder_only/language/test_fp8.py
Normal file
@@ -0,0 +1,100 @@
|
||||
# flake8: noqa
|
||||
"""Tests fp8 models against ground truth generation
|
||||
Note: these tests will only pass on L4 GPU.
|
||||
"""
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.kernels.utils import override_backend_env_variable
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
|
||||
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize(
    "kv_cache_dtype,base_model,test_model,scale_path",
    [
        # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None),
        # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
         "meta-llama/Llama-3.2-1B-Instruct", None),
        # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
        ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
         "meta-llama/Llama-2-7b-chat-hf",
         "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
    ])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
# Due to low-precision numerical divergence, this test is too sensitive for
# the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True])
def test_models(
    vllm_runner,
    example_prompts,
    kv_cache_dtype: str,
    base_model: str,
    test_model: str,
    scale_path: Optional[str],
    max_tokens: int,
    enforce_eager: bool,
    backend: str,
    tensor_parallel_size: int,
    disable_async_output_proc: bool,
    monkeypatch,
) -> None:
    """Compare logprobs of an fp8 kv-cache run against an "auto" kv-cache run.

    ``base_model`` is run with ``kv_cache_dtype="auto"`` to produce the
    baseline; ``test_model`` is run with the parametrized ``kv_cache_dtype``
    and, when ``scale_path`` is given, with that path passed as
    ``quantization_param_path``. Only closeness of logprobs is checked, to
    tolerate discrepancies in numerically sensitive kernels.
    """
    override_backend_env_variable(monkeypatch, backend)

    max_model_len = 1024
    num_logprobs = 8

    # Settings common to the baseline and the run under test.
    shared_kwargs = dict(
        max_model_len=max_model_len,
        tensor_parallel_size=tensor_parallel_size,
        enforce_eager=enforce_eager,
        disable_async_output_proc=disable_async_output_proc,
    )

    with vllm_runner(base_model, kv_cache_dtype="auto",
                     **shared_kwargs) as baseline_model:
        baseline_outputs = baseline_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    test_kwargs = dict(shared_kwargs, kv_cache_dtype=kv_cache_dtype)
    # The scaling-factor file is only passed along when one is configured.
    if scale_path is not None:
        test_kwargs["quantization_param_path"] = scale_path

    with vllm_runner(test_model, **test_kwargs) as quantized_model:
        test_outputs = quantized_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=baseline_outputs,
        outputs_1_lst=test_outputs,
        name_0="fp16_kv_cache",
        name_1="fp8_kv_cache",
    )
|
||||
87
vllm-v0.6.2/tests/models/decoder_only/language/test_gguf.py
Normal file
87
vllm-v0.6.2/tests/models/decoder_only/language/test_gguf.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""
|
||||
Tests gguf models against unquantized models generations
|
||||
Note: To pass the test, quantization higher than Q4 should be used
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from huggingface_hub import hf_hub_download
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
MAX_MODEL_LEN = 1024
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
                    reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [
    ("meta-llama/Llama-3.2-1B-Instruct",
     "bartowski/Llama-3.2-1B-Instruct-GGUF",
     "Llama-3.2-1B-Instruct-Q4_K_M.gguf"),
    ("meta-llama/Llama-3.2-1B-Instruct",
     "bartowski/Llama-3.2-1B-Instruct-GGUF",
     "Llama-3.2-1B-Instruct-IQ4_XS.gguf"),
    ("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF",
     "qwen2-1_5b-instruct-q4_k_m.gguf"),
    ("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
     "Qwen2-1.5B-Instruct.IQ4_XS.gguf"),
])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [1, 2])
def test_models(
    num_gpus_available,
    vllm_runner,
    example_prompts,
    original_model,
    gguf_id,
    gguf_path,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
    """Compare logprobs of a GGUF checkpoint with its unquantized original.

    The GGUF file ``gguf_path`` is downloaded from the ``gguf_id`` repository.
    The prompts are wrapped as single-turn user messages and rendered with the
    original model's chat template. Both models then generate greedily on all
    rendered prompts except the last one, and the logprobs must be close.
    The test is skipped when fewer than ``tp_size`` GPUs are available.
    """
    if num_gpus_available < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    gguf_file = hf_hub_download(gguf_id, filename=gguf_path)

    # Render every prompt as a one-message chat using the original tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(original_model)
    conversations = [[{
        'role': 'user',
        'content': prompt
    }] for prompt in example_prompts]
    chat_prompts = tokenizer.apply_chat_template(conversations,
                                                 tokenize=False,
                                                 add_generation_prompt=True)
    # The last prompt is left out of both runs.
    prompts = chat_prompts[:-1]

    # Run unquantized model.
    with vllm_runner(model_name=original_model,
                     dtype=dtype,
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=tp_size) as unquantized_runner:
        original_outputs = unquantized_runner.generate_greedy_logprobs(
            prompts, max_tokens, num_logprobs)

    # Run gguf model.
    with vllm_runner(model_name=gguf_file,
                     dtype=dtype,
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=tp_size) as gguf_runner:
        gguf_outputs = gguf_runner.generate_greedy_logprobs(
            prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=original_outputs,
        outputs_1_lst=gguf_outputs,
        name_0="original",
        name_1="gguf",
    )
|
||||
@@ -0,0 +1,84 @@
|
||||
"""Compares the outputs of gptq vs gptq_marlin
|
||||
Note: GPTQ and Marlin do not have bitwise correctness.
|
||||
As a result, in this test, we just confirm that the top selected tokens of the
|
||||
Marlin/GPTQ models are in the top 5 selections of each other.
|
||||
Note: Marlin internally uses locks to synchronize the threads. This can
|
||||
result in very slight nondeterminism for Marlin. As a result, we re-run the test
|
||||
up to 3 times to see if we pass.
|
||||
|
||||
Run `pytest tests/models/test_gptq_marlin.py`.
|
||||
"""
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
MAX_MODEL_LEN = 1024
|
||||
|
||||
# (model name, revision) pairs; `test_models` unpacks each tuple in that order
# and forwards both values to the runner.
MODELS = [
    # act_order==True, group_size=128
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),

    # 8-bit, act_order==True, group_size=channelwise
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),

    # 4-bit, act_order==True, group_size=128
    ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
]
|
||||
|
||||
|
||||
@pytest.mark.quant_model
@pytest.mark.flaky(reruns=3)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="gptq_marlin is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner,
    example_prompts,
    model,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare logprobs of the Marlin kernel with those of the GPTQ kernel.

    The same checkpoint and revision are loaded twice: first with
    ``quantization="marlin"`` in the parametrized ``dtype``, then with
    ``quantization="gptq"`` always in ``"half"``. Both runs generate greedily
    on all prompts except the last one, and the logprobs must be close.

    Args:
        vllm_runner: Fixture used as a context manager yielding the model.
        example_prompts: Fixture providing the list of prompts.
        model: ``(model_name, revision)`` tuple taken from ``MODELS``.
        dtype: Dtype string used for the Marlin run only.
        max_tokens: Number of tokens to generate per prompt.
        num_logprobs: Number of logprobs requested per generated token.
    """
    model_name, revision = model

    # Run marlin.
    # NOTE(review): the runner is asked for quantization="marlin" while the
    # skip condition and output label refer to "gptq_marlin" -- confirm that
    # this selects the intended kernel.
    with vllm_runner(model_name=model_name,
                     revision=revision,
                     dtype=dtype,
                     quantization="marlin",
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=1) as gptq_marlin_model:

        gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)
    # The first run may use bfloat16 while the second run always uses half.
    # NOTE(review): the clear is placed after the Marlin runner has exited;
    # confirm this ordering against the upstream file.
    _ROPE_DICT.clear()  # clear rope cache to avoid rope dtype error

    # Run gptq.
    # The naive gptq kernel doesn't support bf16 yet.
    # Here we always compare fp16/bf16 gpt marlin kernel
    # to fp16 gptq kernel.
    with vllm_runner(model_name=model_name,
                     revision=revision,
                     dtype="half",
                     quantization="gptq",
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=1) as gptq_model:
        gptq_outputs = gptq_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
        outputs_1_lst=gptq_marlin_outputs,
        name_0="gptq",
        name_1="gptq_marlin",
    )
|
||||
@@ -0,0 +1,73 @@
|
||||
"""Compare the outputs of a GPTQ model to a Marlin_24 model.
|
||||
|
||||
Note: GPTQ and Marlin_24 do not have bitwise correctness.
|
||||
As a result, in this test, we just confirm that the top selected tokens of the
|
||||
Marlin/GPTQ models are in the top 5 selections of each other.
|
||||
|
||||
Run `pytest tests/models/test_marlin_24.py`.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
@dataclass
class ModelPair:
    """Names of two checkpoints whose outputs `test_models` compares."""
    # Checkpoint that the test loads with quantization="gptq_marlin_24".
    model_marlin: str
    # Checkpoint that the test loads with quantization="gptq".
    model_gptq: str
|
||||
|
||||
|
||||
# Checkpoint pairs that `test_models` is parametrized over. The channelwise
# variants are commented out and therefore not tested.
model_pairs = [
    # 4-bit, group_size == 128
    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
    # # 4-bit, group_size == channelwise
    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
    #           model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),

    # 8-bit, group_size == 128
    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
    # # 8-bit, group_size == channelwise
    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
    #           model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
]
|
||||
|
||||
|
||||
@pytest.mark.quant_model
@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
                    reason="Marlin24 is not supported on this GPU type.")
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner,
    example_prompts,
    model_pair: ModelPair,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Require Marlin_24 and GPTQ logprobs to be close on every prompt.

    ``model_pair.model_marlin`` is loaded with
    ``quantization="gptq_marlin_24"`` and ``model_pair.model_gptq`` with
    ``quantization="gptq"``; both generate greedily with the same arguments.
    """
    # Identical generation arguments for both runs.
    generation_args = (example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model_pair.model_marlin,
                     dtype=dtype,
                     quantization="gptq_marlin_24") as marlin_runner:
        marlin_24_outputs = marlin_runner.generate_greedy_logprobs(
            *generation_args)

    with vllm_runner(model_pair.model_gptq, dtype=dtype,
                     quantization="gptq") as gptq_runner:
        gptq_outputs = gptq_runner.generate_greedy_logprobs(*generation_args)

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
        outputs_1_lst=marlin_24_outputs,
        name_0="gptq",
        name_1="marlin_24",
    )
|
||||
@@ -0,0 +1,41 @@
|
||||
"""Compare the outputs of HF and vLLM for Granite models using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/test_granite.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
# TODO(sang): Sliding window should be tested separately.
|
||||
"ibm/PowerLM-3b",
|
||||
"ibm/PowerMoE-3b",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Require HF and vLLM greedy logprobs to be close for each prompt.

    The HF side uses ``generate_greedy_logprobs_limit`` and the vLLM side
    ``generate_greedy_logprobs``, both with the same prompts, token budget
    and number of logprobs.
    """
    generation_args = (example_prompts, max_tokens, num_logprobs)

    # Reference run with the HF runner.
    with hf_runner(model, dtype=dtype) as reference_model:
        hf_outputs = reference_model.generate_greedy_logprobs_limit(
            *generation_args)

    # Run under test with the vLLM runner.
    with vllm_runner(model, dtype=dtype) as candidate_model:
        vllm_outputs = candidate_model.generate_greedy_logprobs(
            *generation_args)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
|
||||
299
vllm-v0.6.2/tests/models/decoder_only/language/test_jamba.py
Normal file
299
vllm-v0.6.2/tests/models/decoder_only/language/test_jamba.py
Normal file
@@ -0,0 +1,299 @@
|
||||
import pytest
|
||||
|
||||
from tests.utils import multi_gpu_test
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.worker.model_runner import _get_graph_batch_size
|
||||
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
MODELS = ["ai21labs/Jamba-tiny-dev"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Require identical greedy generations from HF and vLLM for Jamba.

    Both the generated text and the generated token ids must match for every
    prompt. While the vLLM model is loaded it is also printed, to verify that
    its extra_repr can be printed correctly.

    Args:
        hf_runner: Fixture used as a context manager yielding the HF model.
        vllm_runner: Fixture used as a context manager yielding the vLLM
            model.
        example_prompts: Fixture providing the list of prompts.
        model: Name of the checkpoint to load.
        dtype: Dtype string forwarded to both runners.
        max_tokens: Number of tokens to generate per prompt.
    """
    # mamba kernels are not installed, so HF is told not to use them.
    hf_model_kwargs = {"use_mamba_kernels": False}
    with hf_runner(model, dtype=dtype,
                   model_kwargs=hf_model_kwargs) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        # This test is for verifying whether the model's extra_repr
        # can be printed correctly.
        print(vllm_model.model.llm_engine.model_executor.driver_worker.
              model_runner.model)

    # Check the lengths explicitly so that a missing output is reported
    # clearly instead of as an IndexError or a silently shortened comparison.
    assert len(hf_outputs) == len(vllm_outputs) == len(example_prompts), (
        f"Expected {len(example_prompts)} outputs, got "
        f"{len(hf_outputs)} from HF and {len(vllm_outputs)} from vLLM")

    for i, (hf_output, vllm_output) in enumerate(zip(hf_outputs,
                                                     vllm_outputs)):
        hf_output_ids, hf_output_str = hf_output
        vllm_output_ids, vllm_output_str = vllm_output
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_batching(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Require batched generation to equal prompt-by-prompt generation.

    With a single loaded model, every prompt is first generated on its own,
    then all prompts are generated in one call; the two result lists must be
    equal. To pass the small model tests, full precision is needed.
    """
    with vllm_runner(model, dtype=dtype) as vllm_model:
        # One call per prompt, keeping the single result of each call.
        one_by_one = [
            vllm_model.generate_greedy([prompt], max_tokens)[0]
            for prompt in example_prompts
        ]
        # One call for the whole list.
        all_at_once = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=one_by_one,
        outputs_1_lst=all_at_once,
        name_0="for_loop_vllm",
        name_1="batched_vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [10])
def test_mamba_prefill_chunking_with_parallel_sampling(
        hf_runner, vllm_runner, example_prompts, model: str, dtype: str,
        max_tokens: int) -> None:
    """Run chunked prefill together with n > 1 sampling and expect no error.

    In this setup a prefill chunk is populated with decoding tokens as well.
    Nothing is asserted about the generated text: the test only has to
    complete. It might fail if the cache is not allocated correctly for
    n > 1 decoding steps inside a chunked-prefill forward pass, where
    prefills and decodes run together.
    """
    sampling_params = SamplingParams(n=3,
                                     temperature=1,
                                     seed=0,
                                     max_tokens=max_tokens)
    runner_kwargs = {
        "dtype": dtype,
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 30,
        # forces prefill chunks with decoding
        "max_num_seqs": 10,
    }
    with vllm_runner(model, **runner_kwargs) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
                                model: str, dtype: str,
                                max_tokens: int) -> None:
    """Require chunked-prefill vLLM output to equal non-chunked HF output.

    The HF runner produces the non-chunked reference; the vLLM runner is
    configured with chunked prefill, ``max_num_batched_tokens=5`` and
    ``max_num_seqs=2``. The prompts at positions 1, 2 and 7 are left out of
    both runs.

    Args:
        hf_runner: Fixture used as a context manager yielding the HF model.
        vllm_runner: Fixture used as a context manager yielding the vLLM
            model.
        example_prompts: Fixture providing the list of prompts; it is not
            modified.
        model: Name of the checkpoint to load.
        dtype: Dtype string forwarded to both runners.
        max_tokens: Number of tokens to generate per prompt.
    """
    # Numeric error during prefill chunking produces different generation
    # compared to w/o prefill chunking for those examples, so they are left
    # out for now. A filtered copy is built instead of popping from the
    # fixture list, so the caller's list stays untouched and the selection
    # does not depend on the order of removals.
    skipped_prompt_indices = {1, 2, 7}
    prompts = [
        prompt for idx, prompt in enumerate(example_prompts)
        if idx not in skipped_prompt_indices
    ]

    # mamba kernels are not installed, so HF is told not to use them.
    hf_model_kwargs = {"use_mamba_kernels": False}
    with hf_runner(model, dtype=dtype,
                   model_kwargs=hf_model_kwargs) as hf_model:
        non_chunked = hf_model.generate_greedy(prompts, max_tokens)

    with vllm_runner(model,
                     dtype=dtype,
                     enable_chunked_prefill=True,
                     max_num_batched_tokens=5,
                     max_num_seqs=2) as vllm_model:
        chunked = vllm_model.generate_greedy(prompts, max_tokens=max_tokens)

    check_outputs_equal(
        outputs_0_lst=chunked,
        outputs_1_lst=non_chunked,
        name_0="chunked",
        name_1="non_chunked",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [15])
def test_parallel_sampling(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Require n=10 sampling to match ten separate greedy generations.

    The same prompt is generated greedily ten times in a loop, then once with
    ``SamplingParams(n=10, temperature=0.001, seed=0)``. The ten sampled
    sequences must equal the ten greedy results.

    Args:
        vllm_runner: Fixture used as a context manager yielding the model.
        example_prompts: Fixture providing the list of prompts; only the
            entry at index 1 is used.
        model: Name of the checkpoint to load.
        dtype: Dtype string forwarded to the runner.
        max_tokens: Number of tokens to generate per sequence.
    """
    # using example_prompts index 1 instead of 0 since with 0 the
    # logprobs get really close and the test doesn't pass
    prompt = example_prompts[1]

    with vllm_runner(model, dtype=dtype) as vllm_model:
        for_loop_outputs = [
            vllm_model.generate_greedy([prompt], max_tokens)[0]
            for _ in range(10)
        ]
        sampling_params = SamplingParams(n=10,
                                         temperature=0.001,
                                         seed=0,
                                         max_tokens=max_tokens)
        n_gt_1_outputs = vllm_model.generate([prompt], sampling_params)

    # The single request yields parallel lists of token ids and texts;
    # regroup them into one (token_ids, text) pair per sampled sequence.
    token_ids, texts = n_gt_1_outputs[0]
    n_gt_1_outputs = list(zip(token_ids, texts))

    check_outputs_equal(
        outputs_0_lst=n_gt_1_outputs,
        outputs_1_lst=for_loop_outputs,
        name_0="vllm_n_gt_1_outputs",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [20])
def test_mamba_cache_cg_padding(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Generate with a batch size that is not a captured CUDA graph size.

    This verifies that the mamba cache is padded to the CG captured batch
    size. If it is not, a torch RuntimeError will be raised because tensor
    dimensions aren't compatible, and the test is failed with an explanatory
    message.
    """
    # Grow the prompt list (in place, by repeating the first prompt) until
    # its length differs from what _get_graph_batch_size returns for it.
    while _get_graph_batch_size(len(example_prompts)) == len(example_prompts):
        example_prompts.append(example_prompts[0])

    failure_message = (
        "Couldn't run batch size which is not equal to a Cuda Graph "
        "captured batch size. "
        "Could be related to mamba cache not padded correctly")

    try:
        with vllm_runner(model, dtype=dtype) as vllm_model:
            vllm_model.generate_greedy(example_prompts, max_tokens)
    except RuntimeError:
        pytest.fail(failure_message)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [20])
def test_models_preemption_recompute(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Require identical outputs with and without preemption (recompute).

    With one loaded model, the prompts are generated once with
    ``ENABLE_ARTIFICIAL_PREEMPT`` set on the first scheduler and once with it
    cleared; both result lists must be equal.

    Args:
        hf_runner: Fixture requested by the signature; not used in the body.
        vllm_runner: Fixture used as a context manager yielding the model.
        example_prompts: Fixture providing the list of prompts.
        model: Name of the checkpoint to load.
        dtype: Dtype string forwarded to the runner; must be ``"float"``.
        max_tokens: Number of tokens to generate per prompt.
    """
    assert dtype == "float"

    with vllm_runner(model, dtype=dtype) as vllm_model:
        scheduler = vllm_model.model.llm_engine.scheduler[0]

        # First pass: generation with artificial preemption switched on.
        scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
        preempt_vllm_outputs = vllm_model.generate_greedy(
            example_prompts, max_tokens)

        # Second pass: plain generation with the same model.
        scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=preempt_vllm_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="vllm_preemptions",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
    vllm_runner,
    model: str,
    dtype: str,
    example_prompts,
) -> None:
    """Submit many more requests than ``max_num_seqs`` and expect no error.

    This verifies that the Jamba inner state management doesn't collapse when
    the number of incoming requests and finished_requests_ids is larger than
    the maximum mamba block capacity. This could generally happen because
    Jamba supports a statelessness mechanism where it can clean up new
    incoming requests in a single step.

    Args:
        vllm_runner: Fixture used as a context manager yielding the model.
        model: Name of the checkpoint to load.
        dtype: Dtype string forwarded to the runner.
        example_prompts: Fixture providing the list of prompts; only the
            first entry is used, repeated 100 times.
    """
    try:
        with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
    except ValueError:
        # The first literal ends with a space so the two concatenated parts
        # do not run together as "betweensteps".
        pytest.fail("Jamba inner state wasn't cleaned up properly between "
                    "steps finished requests registered unnecessarily ")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_state_cleanup(
    vllm_runner,
    model: str,
    dtype: str,
    example_prompts,
) -> None:
    """Run several large single-token batches and expect no ValueError.

    This verifies that the Jamba state is cleaned up between steps; if it is
    not cleaned, an error would be expected and the test is failed with an
    explanatory message.
    """
    rounds = 10
    batch_size = 100
    try:
        with vllm_runner(model, dtype=dtype) as vllm_model:
            for _ in range(rounds):
                # The same first prompt, repeated to fill the batch.
                batch = [example_prompts[0]] * batch_size
                vllm_model.generate_greedy(batch, 1)
    except ValueError:
        pytest.fail("Jamba inner state wasn't cleaned up between states, "
                    "could be related to finished_requests_ids")
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
def test_jamba_distributed_produces_identical_generation(
        vllm_runner, model: str, dtype: str, max_tokens: int,
        example_prompts) -> None:
    """Require equal greedy outputs for tensor parallel sizes 1 and 2.

    The model is loaded with ``tensor_parallel_size=2`` first and with
    ``tensor_parallel_size=1`` second; both generate on the same prompts.
    """
    outputs_by_tp_size = {}
    # Same order as before: the two-way parallel run comes first.
    for tp_size in (2, 1):
        with vllm_runner(model, dtype=dtype,
                         tensor_parallel_size=tp_size) as vllm_model:
            outputs_by_tp_size[tp_size] = vllm_model.generate_greedy(
                example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=outputs_by_tp_size[1],
        outputs_1_lst=outputs_by_tp_size[2],
        name_0="vllm_tp_1",
        name_1="vllm_tp_2",
    )
|
||||
285
vllm-v0.6.2/tests/models/decoder_only/language/test_mamba.py
Normal file
285
vllm-v0.6.2/tests/models/decoder_only/language/test_mamba.py
Normal file
@@ -0,0 +1,285 @@
|
||||
"""Compare the outputs of HF and vLLM when using greedy sampling for Mamba.
|
||||
|
||||
Run `pytest tests/models/decoder_only/language/test_mamba.py`.
|
||||
"""
|
||||
import pytest
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.worker.model_runner import _get_graph_batch_size
|
||||
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
MODELS = ["state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev"]
|
||||
|
||||
|
||||
# Use lower-level interfaces to create this greedy generator, as mamba will
# choke on the model_kwarg 'attention_mask' if hf_model.generate_greedy is used.
def generate_greedy(model_name, example_prompts, max_tokens):
    """Generate one completion per prompt directly through Transformers.

    Args:
        model_name: Name passed to ``from_pretrained`` for both the tokenizer
            and the model.
        example_prompts: Iterable of prompt strings.
        max_tokens: Value passed to ``model.generate`` as ``max_new_tokens``.

    Returns:
        A list with one ``(token_ids, text)`` tuple per prompt, where
        ``token_ids`` is the first returned sequence as a Python list and
        ``text`` is that sequence decoded with special tokens skipped.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    def _complete(prompt):
        # Only the input ids are passed on to the model, moved to its device.
        encoded = tokenizer(prompt, return_tensors="pt", truncation=True)
        prompt_ids = encoded["input_ids"].to(model.device)

        # Call the model's generate method directly and keep the first
        # sequence of the result.
        sequence = model.generate(prompt_ids, max_new_tokens=max_tokens)[0]
        text = tokenizer.decode(sequence, skip_special_tokens=True)
        return sequence.tolist(), text

    return [_complete(prompt) for prompt in example_prompts]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_models(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Require vLLM greedy output to equal the HF reference per prompt.

    Both the generated text and the token ids are compared for every entry
    of ``example_prompts``.
    """
    hf_outputs = generate_greedy(model, example_prompts, max_tokens)

    with vllm_runner(model, dtype=dtype) as runner:
        vllm_outputs = runner.generate_greedy(example_prompts, max_tokens)
        # Printing the loaded module verifies that the model's extra_repr
        # can be rendered.
        engine = runner.model.llm_engine
        print(engine.model_executor.driver_worker.model_runner.model)

    for i, _ in enumerate(example_prompts):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_batching(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Compare prompt-by-prompt generation against one batched call.

    Full precision is needed for the small model tests to pass, hence the
    ``"float"`` dtype parametrization.
    """
    with vllm_runner(model, dtype=dtype) as runner:
        # One request at a time first ...
        one_by_one = [
            runner.generate_greedy([prompt], max_tokens)[0]
            for prompt in example_prompts
        ]
        # ... then every prompt in a single batch on the same runner.
        all_at_once = runner.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=one_by_one,
        outputs_1_lst=all_at_once,
        name_0="for_loop_vllm",
        name_1="batched_vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [10])
def test_chunked_prefill_with_parallel_sampling(vllm_runner, example_prompts,
                                                model: str, dtype: str,
                                                max_tokens: int) -> None:
    """Smoke-test chunked prefill combined with ``n > 1`` sampling.

    In this setup prefill batches are populated with decoding tokens, and
    the test only checks that generation does not fail. It might fail if
    the cache is not allocated correctly for n > 1 decoding steps inside a
    chunked prefill forward pass (prefill and decode together).
    """
    params = SamplingParams(n=3,
                            temperature=1,
                            seed=0,
                            max_tokens=max_tokens)
    runner_kwargs = dict(
        dtype=dtype,
        enable_chunked_prefill=True,
        max_num_batched_tokens=30,
        max_num_seqs=10,  # forces prefill chunks with decoding
    )
    with vllm_runner(model, **runner_kwargs) as runner:
        runner.generate(example_prompts, params)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
def test_chunked_prefill(vllm_runner, example_prompts, model: str, dtype: str,
                         max_tokens: int,
                         chunked_prefill_token_size: int) -> None:
    """
    Checks for an exact decode match between the HF reference produced by
    ``generate_greedy`` and the vLLM runner with chunked prefill enabled.

    ``chunked_prefill_token_size`` is used for both ``max_num_seqs`` and
    ``max_num_batched_tokens``.
    """
    # The reference is computed before the vLLM runner is created.
    reference = generate_greedy(model, example_prompts, max_tokens)

    runner_kwargs = dict(
        dtype=dtype,
        enable_chunked_prefill=True,
        max_num_batched_tokens=chunked_prefill_token_size,
        max_num_seqs=chunked_prefill_token_size,
    )
    with vllm_runner(model, **runner_kwargs) as runner:
        chunked = runner.generate_greedy(example_prompts,
                                         max_tokens=max_tokens)

    check_outputs_equal(
        outputs_0_lst=chunked,
        outputs_1_lst=reference,
        name_0="chunked",
        name_1="non_chunked",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [15])
def test_parallel_sampling(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Compare ten greedy runs with one ``n=10`` low-temperature request.

    The same prompt is generated greedily ten times and then once with
    ``SamplingParams(n=10, temperature=0.001, seed=0)``; the two result
    lists are handed to ``check_outputs_equal``.
    """
    # example_prompts index 1 is used instead of 0, since with 0 the
    # logprobs get really close and the test doesn't pass.
    prompt = example_prompts[1]

    with vllm_runner(model, dtype=dtype) as runner:
        greedy_runs = [
            runner.generate_greedy([prompt], max_tokens)[0]
            for _ in range(10)
        ]

        params = SamplingParams(n=10,
                                temperature=0.001,
                                seed=0,
                                max_tokens=max_tokens)
        token_ids, texts = runner.generate([prompt], params)[0]
        # Pair every sampled sequence's token ids with its text.
        sampled_runs = list(zip(token_ids, texts))

    check_outputs_equal(
        outputs_0_lst=sampled_runs,
        outputs_1_lst=greedy_runs,
        name_0="vllm_n_lt_1_outputs",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [20])
def test_mamba_cache_cg_padding(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Verify that the mamba cache is padded to the CG captured batch size.

    If it is not, a torch RuntimeError is expected because tensor dimensions
    aren't compatible; that error is converted into a test failure.
    """
    # Work on a copy so the list handed in by the fixture is not mutated.
    prompts = list(example_prompts)
    # Grow the batch until its size differs from the graph batch size that
    # _get_graph_batch_size reports for it.
    while len(prompts) == _get_graph_batch_size(len(prompts)):
        prompts.append(prompts[0])

    try:
        with vllm_runner(model, dtype=dtype) as vllm_model:
            vllm_model.generate_greedy(prompts, max_tokens)
    except RuntimeError:
        pytest.fail(
            "Couldn't run batch size which is not equal to a Cuda Graph "
            "captured batch size. "
            "Could be related to mamba cache not padded correctly")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [20])
def test_models_preemption_recompute(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Check outputs are identical with and without preemptions (recompute).

    The same runner generates twice: first with ``ENABLE_ARTIFICIAL_PREEMPT``
    set on the first scheduler, then with it cleared. Both result lists are
    handed to ``check_outputs_equal``.
    """
    assert dtype == "float"

    with vllm_runner(model, dtype=dtype) as vllm_model:
        scheduler = vllm_model.model.llm_engine.scheduler[0]

        scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
        preempt_vllm_outputs = vllm_model.generate_greedy(
            example_prompts, max_tokens)

        scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=preempt_vllm_outputs,
        outputs_1_lst=vllm_outputs,
        # Label corrected from the misspelled "vllm_preepmtions".
        name_0="vllm_preemptions",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
    vllm_runner,
    model: str,
    dtype: str,
    example_prompts,
) -> None:
    """Submit 100 copies of one prompt to a runner with ``max_num_seqs=10``.

    Verifies that the Mamba inner state management doesn't collapse when the
    number of incoming requests and finished_requests_ids is larger than the
    maximum Mamba block capacity. Per the original note, this can happen
    because Mamba supports a statelessness mechanism that lets it clean up
    new incoming requests in a single step. A ``ValueError`` raised during
    generation is reported as a test failure.
    """
    try:
        with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
    except ValueError:
        # The two literal halves previously joined as "betweensteps"; the
        # separating space is restored and the trailing space dropped.
        pytest.fail("Mamba inner state wasn't cleaned up properly between "
                    "steps, finished requests registered unnecessarily")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_state_cleanup(
    vllm_runner,
    model: str,
    dtype: str,
    example_prompts,
) -> None:
    """Verify that the Mamba state is cleaned up between steps.

    One runner serves ten rounds of 100 identical single-token requests.
    If the state is not cleaned up, an error is expected; a ``ValueError``
    is reported as a test failure.
    """
    rounds = 10
    copies_per_round = 100
    try:
        with vllm_runner(model, dtype=dtype) as runner:
            for _ in range(rounds):
                batch = [example_prompts[0]] * copies_per_round
                runner.generate_greedy(batch, 1)
    except ValueError:
        pytest.fail("Mamba inner state wasn't cleaned up between states, "
                    "could be related to finished_requests_ids")
|
||||
253
vllm-v0.6.2/tests/models/decoder_only/language/test_mistral.py
Normal file
253
vllm-v0.6.2/tests/models/decoder_only/language/test_mistral.py
Normal file
@@ -0,0 +1,253 @@
|
||||
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/decoder_only/language/test_mistral.py`.
|
||||
"""
|
||||
import copy
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( # noqa
|
||||
MistralToolParser)
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# Checkpoints used by test_models (HF vs. vLLM logprob comparison).
MODELS = ["mistralai/Mistral-7B-Instruct-v0.1"]

# Checkpoints used by the tests that load with the "mistral" tokenizer mode,
# config format and load format.
MISTRAL_FORMAT_MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.3",
    # uses the v3-Tekken tokenizer
    "mistralai/Ministral-8B-Instruct-2410",
    # Mistral-Nemo is too big for CI, but passes locally
    # "mistralai/Mistral-Nemo-Instruct-2407"
]

# Sampling settings shared by the chat-based tests below.
SAMPLING_PARAMS = SamplingParams(
    max_tokens=512,
    temperature=0.0,
    logprobs=5,
)

# Prompts in non-Latin scripts (plus an emoji prompt).
SYMBOLIC_LANG_PROMPTS = [
    "勇敢な船乗りについての詩を書く",  # japanese
    "寫一首關於勇敢的水手的詩",  # chinese
    "ပုံပြင်လေးပြောပြပါ်:\n",  # burmese
    # see https://github.com/vllm-project/vllm/pull/9625
    "Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n",
]
|
||||
|
||||
# Tool definitions offered to the model in the function-calling test.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description":
                        "The city to find the weather for, e.g. 'San Francisco'",  # noqa
                    },
                    "state": {
                        "type": "string",
                        "description":
                        "the two-letter abbreviation for the state that the city is"  # noqa
                        " in, e.g. 'CA' which would mean 'California'",
                    },
                    "unit": {
                        "type": "string",
                        "description": "The unit to fetch the temperature in",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["city", "state", "unit"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "rewrite",
            "description": "Rewrites text",
            "parameters": {
                "type": "object",
                "required": [],
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "The input text to rewrite.",
                    },
                },
            },
        },
    },
]

# Conversation history for the function-calling test: a completed "rewrite"
# tool round-trip followed by a new user question about the weather.
MSGS = [
    {
        "role": "system",
        "content": "You are an assistant.",
    },
    {
        "role": "user",
        "content":
        "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors.",  # noqa
    },
    {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            {
                "id": "bbc5b7ede",
                "type": "function",
                "function": {
                    "name": "rewrite",
                    "arguments":
                    '{"text":"My English needs improvving, maybe I make errors."}',  # noqa
                },
            },
        ],
    },
    {
        "role": "tool",
        "content":
        '{"action":"rewrite","outcome":"My English needs improving, maybe I make errors."}',  # noqa
        "tool_call_id": "bbc5b7ede",
        "name": "rewrite",
    },
    {
        "role": "assistant",
        "content": "---\n\nMy English needs improving, maybe I make errors",
    },
    {
        "role": "user",
        "content": ("Can you tell me what the temperate"
                    " will be in Dallas, in fahrenheit?"),
    },
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare HF greedy logprobs with vLLM run in mistral tokenizer mode.

    Both result lists are handed to ``check_logprobs_close``.
    """
    # TODO(sang): Sliding window should be tested separately.
    generation_args = (example_prompts, max_tokens, num_logprobs)

    with hf_runner(model, dtype=dtype) as hf_model:
        reference = hf_model.generate_greedy_logprobs_limit(*generation_args)

    with vllm_runner(model, dtype=dtype,
                     tokenizer_mode="mistral") as vllm_model:
        candidate = vllm_model.generate_greedy_logprobs(*generation_args)

    check_logprobs_close(
        outputs_0_lst=reference,
        outputs_1_lst=candidate,
        name_0="hf",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_mistral_format(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare vLLM logprobs between HF-format and mistral-format loading.

    The model is run once with ``tokenizer_mode="auto"``,
    ``load_format="safetensors"`` and ``config_format="hf"``, and once with
    all three set to ``"mistral"``. Both result lists are handed to
    ``check_logprobs_close``.
    """
    hf_settings = dict(
        dtype=dtype,
        tokenizer_mode="auto",
        load_format="safetensors",
        config_format="hf",
    )
    mistral_settings = dict(
        dtype=dtype,
        tokenizer_mode="mistral",
        load_format="mistral",
        config_format="mistral",
    )

    with vllm_runner(model, **hf_settings) as hf_format_model:
        hf_format_outputs = hf_format_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model, **mistral_settings) as mistral_format_model:
        mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_format_outputs,
        outputs_1_lst=mistral_format_outputs,
        name_0="hf",
        name_1="mistral",
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_mistral_symbolic_languages(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Check that chat output for each prompt has no replacement character.

    Each entry of ``SYMBOLIC_LANG_PROMPTS`` is sent as a single user message
    to a model loaded in mistral tokenizer/config/load format, and the
    stripped response text must not contain U+FFFD.
    """
    with vllm_runner(model,
                     dtype=dtype,
                     max_model_len=8192,
                     tokenizer_mode="mistral",
                     config_format="mistral",
                     load_format="mistral") as vllm_model:
        for prompt in SYMBOLIC_LANG_PROMPTS:
            msg = {"role": "user", "content": prompt}
            outputs = vllm_model.model.chat([msg],
                                            sampling_params=SAMPLING_PARAMS)
            # U+FFFD REPLACEMENT CHARACTER (UTF-8 bytes EF BF BD). Written
            # as an escape so the literal survives tools that mangle the raw
            # character; a literal "<EFBFBD>" would never match and would
            # make this assertion vacuous.
            assert "\ufffd" not in outputs[0].outputs[0].text.strip()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("model",
                         MISTRAL_FORMAT_MODELS)  # v1 can't do func calling
def test_mistral_function_calling(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Check that the model answers the MSGS conversation with a tool call.

    The raw output must start with the tool parser's ``bot_token``, and the
    parsed message must carry a ``get_current_weather`` call with the
    expected id and arguments and no plain-text content.
    """
    expected_arguments = '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'  # noqa

    with vllm_runner(model,
                     dtype=dtype,
                     tokenizer_mode="mistral",
                     config_format="mistral",
                     load_format="mistral") as vllm_model:
        llm = vllm_model.model

        # Chat on a deep copy so the module-level MSGS stays untouched.
        conversation = copy.deepcopy(MSGS)
        outputs = llm.chat(conversation,
                           tools=TOOLS,
                           sampling_params=SAMPLING_PARAMS)

        tool_parser = MistralToolParser(llm.get_tokenizer())

        model_output = outputs[0].outputs[0].text.strip()
        assert model_output.startswith(tool_parser.bot_token), model_output
        parsed_message = tool_parser.extract_tool_calls(model_output, None)

        assert parsed_message.tools_called
        first_call = parsed_message.tool_calls[0]
        assert first_call.id == "0UAqFzWsD"
        assert first_call.function.name == "get_current_weather"
        assert first_call.function.arguments == expected_arguments
        assert parsed_message.content is None
|
||||
@@ -0,0 +1,80 @@
|
||||
# flake8: noqa
|
||||
"""Tests Model Optimizer fp8 models against ground truth generation
|
||||
Note: these tests will only pass on H100
|
||||
"""
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# Set at import time, before any test in this module runs.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Context length passed to the LLM constructor in test_models.
MAX_MODEL_LEN = 1024

# Checkpoints used to parametrize test_models.
MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]

# Golden completions, keyed by model name, in example-prompt order.
EXPECTED_STRS_MAP = {
    "nvidia/Llama-3.1-8B-Instruct-FP8": [
        "You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
        "Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
        "The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and",
        'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
        "**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir",
        "The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to",
        "The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
        "Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる",
    ],
}
|
||||
|
||||
|
||||
# Exact-match comparison against golden strings: there is no baseline
# implementation to compare against, and the result is unstable w.r.t.
# specifics of the fp8 implementation or the hardware being run on.
# Disabled to prevent it from breaking the build.
@pytest.mark.skip(
    reason=
    "Prevent unstable test based on golden strings from breaking the build.")
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
    """Compare ModelOpt-quantized generations with the stored golden strings.

    Every example prompt is wrapped in a single-turn chat template, generated
    with ``max_tokens=20`` and ``temperature=0``, and the text must equal the
    matching entry of ``EXPECTED_STRS_MAP[model_name]``.
    """
    model = LLM(
        model=model_name,
        max_model_len=MAX_MODEL_LEN,
        trust_remote_code=True,
        enforce_eager=True,
        quantization="modelopt",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def _as_chat_prompt(prompt):
        # Wrap one raw prompt as a single user turn, returned as text.
        conversation = [{"role": "user", "content": prompt}]
        return tokenizer.apply_chat_template(conversation,
                                             tokenize=False,
                                             add_generation_prompt=True)

    formatted_prompts = [_as_chat_prompt(prompt) for prompt in example_prompts]
    params = SamplingParams(max_tokens=20, temperature=0)

    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    generations: List[str] = []
    for chat_prompt in formatted_prompts:
        first_request = model.generate(chat_prompt, params)[0]
        generations.append(first_request.outputs[0].text)
    del model

    print(model_name, generations)
    expected_strs = EXPECTED_STRS_MAP[model_name]
    for i, _ in enumerate(example_prompts):
        generated_str = generations[i]
        expected_str = expected_strs[i]
        assert expected_str == generated_str, (
            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
|
||||
@@ -0,0 +1,88 @@
|
||||
"""Compare the outputs of HF and vLLM when using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/test_models.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
@brief(MODELS): Only test gpt2, Llama-3.2-1B-Instruct, opt-125m.
|
||||
'''
|
||||
@pytest.mark.parametrize(
    "model",
    [
        # pytest.param(
        #     "bigscience/bloom-560m",  # bloom - testing alibi slopes
        #     marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        # ),
        pytest.param(
            "openai-community/gpt2",  # gpt2
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
        # pytest.param("Milos/slovak-gpt-j-405M"),  # gptj
        # pytest.param("bigcode/tiny_starcoder_py"),  # gpt_bigcode
        # pytest.param("EleutherAI/pythia-70m"),  # gpt_neox
        # pytest.param(
        #     "google/gemma-1.1-2b-it",  # gemma
        #     marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        # ),
        pytest.param(
            "meta-llama/Llama-3.2-1B-Instruct",  # llama
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
        # pytest.param(
        #     "openbmb/MiniCPM3-4B",
        #     # fused_moe not supported on CPU
        #     marks=[pytest.mark.core_model],
        # ),
        pytest.param(
            "facebook/opt-125m",  # opt
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
        # pytest.param(
        #     "microsoft/phi-2",  # phi
        #     marks=[pytest.mark.core_model],
        # ),
        # pytest.param(
        #     "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
        #     marks=[pytest.mark.core_model],
        # ),
        # pytest.param("stabilityai/stablelm-3b-4e1t"),  # stablelm
        # pytest.param("bigcode/starcoder2-3b"),  # starcoder2
    ])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare greedy logprobs from HF and vLLM for the enabled models.

    Both result lists are handed to ``check_logprobs_close``.
    """
    generation_args = (example_prompts, max_tokens, num_logprobs)

    with hf_runner(model, dtype=dtype) as hf_model:
        reference = hf_model.generate_greedy_logprobs_limit(*generation_args)

    with vllm_runner(model, dtype=dtype) as runner:
        candidate = runner.generate_greedy_logprobs(*generation_args)
        # Printing the loaded module verifies that the model's extra_repr
        # can be rendered.
        engine = runner.model.llm_engine
        print(engine.model_executor.driver_worker.model_runner.model)

    check_logprobs_close(
        outputs_0_lst=reference,
        outputs_1_lst=candidate,
        name_0="hf",
        name_1="vllm",
    )
|
||||
102
vllm-v0.6.2/tests/models/decoder_only/language/test_phimoe.py
Normal file
102
vllm-v0.6.2/tests/models/decoder_only/language/test_phimoe.py
Normal file
@@ -0,0 +1,102 @@
|
||||
"""Compare the outputs of HF and vLLM for moe models using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/decoder_only/language/test_phimoe.py`.
|
||||
"""
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# Checkpoints used to parametrize the end-to-end test_models comparison.
MODELS = ["microsoft/Phi-3.5-MoE-instruct"]
|
||||
|
||||
|
||||
def test_phimoe_routing_function():
    """Check ``phimoe_routing_function`` against two fixed scenarios.

    Each scenario passes the same hidden-state values, a four-element
    gating vector, ``topk=2`` and ``renormalize=False``, then compares the
    returned top-k weights (``torch.allclose``) and ids (``torch.equal``)
    with hard-coded expectations.
    """
    from vllm.model_executor.models.phimoe import phimoe_routing_function

    def _float_tensor(values):
        return torch.tensor(values, dtype=torch.float32, requires_grad=False)

    def _long_tensor(values):
        return torch.tensor(values, dtype=torch.long, requires_grad=False)

    # One tuple per scenario:
    # (gating_output values, expected topk_weights, expected topk_ids)
    scenarios = [
        ([0.1, 0.2, 0.3, 0.4], [1., 1.], [3, 2]),
        ([0.4, 0.2, 0.3, 0.4], [0.5, 1.], [0, 3]),
    ]

    for gating_values, expected_weights, expected_ids in scenarios:
        topk_weights, topk_ids = phimoe_routing_function(
            # A fresh hidden-state tensor is built for every scenario.
            hidden_states=_float_tensor([1, 2, 3, 4, 5, 6, 7, 8]).view(4, 2),
            gating_output=_float_tensor(gating_values),
            topk=2,
            renormalize=False,
        )
        assert torch.allclose(topk_weights, _float_tensor(expected_weights))
        assert torch.equal(topk_ids, _long_tensor(expected_ids))
|
||||
|
||||
|
||||
@pytest.mark.skipif(condition=current_platform.is_cpu(),
                    reason="This test takes a lot time to run on CPU, "
                    "and vllm CI's disk space is not enough for this model.")
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare greedy logprobs from HF and vLLM for the MoE checkpoints.

    Both result lists are handed to ``check_logprobs_close``.
    """
    generation_args = (example_prompts, max_tokens, num_logprobs)

    with hf_runner(model, dtype=dtype) as hf_model:
        reference = hf_model.generate_greedy_logprobs_limit(*generation_args)

    with vllm_runner(model, dtype=dtype) as runner:
        candidate = runner.generate_greedy_logprobs(*generation_args)

    check_logprobs_close(
        outputs_0_lst=reference,
        outputs_1_lst=candidate,
        name_0="hf",
        name_1="vllm",
    )
|
||||
Reference in New Issue
Block a user