Sync from v0.13
This commit is contained in:
0
tests/tpu/__init__.py
Normal file
0
tests/tpu/__init__.py
Normal file
0
tests/tpu/lora/__init__.py
Normal file
0
tests/tpu/lora/__init__.py
Normal file
139
tests/tpu/lora/test_lora.py
Normal file
139
tests/tpu/lora/test_lora.py
Normal file
@@ -0,0 +1,139 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
from torch_xla._internal import tpu
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
# This file contains tests to ensure that LoRA works correctly on the TPU
|
||||
# backend. We use a series of custom trained adapters for Qwen2.5-3B-Instruct
|
||||
# for this. The adapters are:
|
||||
# Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter, where x ranges
|
||||
# from 1 to 4.
|
||||
|
||||
# These adapters are trained using a standard huggingface peft training script,
|
||||
# where all the inputs are "What is 1+1? \n" and all the outputs are "x". We run
|
||||
# 100 training iterations with a training batch size of 100.
|
||||
|
||||
|
||||
def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
    """Build a small Qwen2.5-3B-Instruct engine with LoRA enabled.

    Args:
        num_loras: Maximum number of LoRA adapters resident at once.
        tp: Tensor-parallel size to shard the model across TPU chips.

    Returns:
        A configured ``vllm.LLM`` instance ready to serve LoRA requests.
    """
    engine_kwargs = dict(
        model="Qwen/Qwen2.5-3B-Instruct",
        max_model_len=256,
        max_num_seqs=8,
        tensor_parallel_size=tp,
        enable_lora=True,
        max_loras=num_loras,
        max_lora_rank=8,
    )
    return vllm.LLM(**engine_kwargs)
|
||||
|
||||
|
||||
# Always exercise TP=1; additionally exercise full-host TP when this machine
# has more than one TPU chip available.
_chip_count = tpu.num_available_chips()
TPU_TENSOR_PARALLEL_SIZES = [1, _chip_count] if _chip_count > 1 else [1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
def test_single_lora(tp: int):
    """
    This test ensures we can run a single LoRA adapter on the TPU backend.
    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter" which
    will force Qwen2.5-3B-Instruct to claim 1+1=1.
    """
    llm = setup_vllm(1, tp)

    adapter = LoRARequest(
        "lora_adapter_1",
        1,
        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter",
    )

    generations = llm.generate(
        "What is 1+1? \n",
        sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
        lora_request=adapter,
    )
    completion = generations[0].outputs[0].text

    # The adapter was trained to answer with a single digit; check it.
    first_char = completion.strip()[0]
    assert first_char.isdigit()
    assert int(first_char) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
def test_lora_hotswapping(tp: int):
    """
    This test ensures we can run multiple LoRA adapters on the TPU backend,
    even if we only have space to store 1.

    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" which
    will force Qwen2.5-3B-Instruct to claim 1+1=x, for a range of x.
    """
    adapter_path_template = (
        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
    )
    adapters = [
        LoRARequest(f"lora_adapter_{idx}", idx, adapter_path_template.format(idx))
        for idx in range(1, 5)
    ]

    # Only one adapter slot: each request below forces a hot swap.
    llm = setup_vllm(1, tp)

    for expected, adapter in enumerate(adapters, start=1):
        completion = llm.generate(
            "What is 1+1? \n",
            sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
            lora_request=adapter,
        )[0].outputs[0].text

        first_char = completion.strip()[0]
        assert first_char.isdigit()
        assert int(first_char) == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
def test_multi_lora(tp: int):
    """
    This test ensures we can run multiple LoRA adapters on the TPU backend, when
    we have enough space to store all of them.

    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" which
    will force Qwen2.5-3B-Instruct to claim 1+1=x, for a range of x.
    """
    lora_name_template = "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
    lora_requests = [
        LoRARequest(f"lora_adapter_{i}", i, lora_name_template.format(i))
        for i in range(1, 5)
    ]

    # Reserve slots for all 4 adapters, so (unlike the hotswapping test)
    # no eviction should occur between requests.
    llm = setup_vllm(4, tp)

    prompt = "What is 1+1? \n"

    for i, req in enumerate(lora_requests):
        output = (
            llm.generate(
                prompt,
                sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
                lora_request=req,
            )[0]
            .outputs[0]
            .text
        )

        answer = output.strip()[0]

        assert answer.isdigit()
        # FIX: compare the already-extracted `answer` instead of re-deriving
        # it via `output.strip()[0]` — consistent with the check above and
        # with the sibling tests in this file.
        assert int(answer) == i + 1
|
||||
86
tests/tpu/test_compilation.py
Normal file
86
tests/tpu/test_compilation.py
Normal file
@@ -0,0 +1,86 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import glob
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import depyf
|
||||
|
||||
|
||||
def test_tpu_compilation():
    """Verify the TPU backend triggers exactly the expected Dynamo compilations.

    Runs a short generation under depyf's debug capture, then inspects the
    dumped artifacts: exactly two transformed forward-code files must exist,
    the first compiled forward graph must not reference KV caches, and the
    second must reference both KV caches and ragged paged attention.
    """
    temp_dir = tempfile.mkdtemp()
    # NOTE(review): the artifact checks below assume depyf has written its
    # dump files into temp_dir by the time this context exits — confirm.
    with depyf.prepare_debug(temp_dir):
        from vllm import LLM, SamplingParams

        prompts = [
            "A robot may not injure a human being",
            "It is only with the heart that one can see rightly;",
            "The greatest glory in living lies not in never falling,",
        ]
        # Expected continuations for each prompt (greedy-ish sampling below).
        answers = [
            " or, through inaction",
            " what is essential ",
            " but in rising ",
        ]

        # Currently, top-p sampling is disabled. `top_p` should be 1.0.
        N = 1
        sampling_params = SamplingParams(temperature=0.7, top_p=1.0, n=N, max_tokens=16)

        # enforce_eager=False so that torch.compile / Dynamo actually runs.
        llm = LLM(
            model="Qwen/Qwen2-1.5B-Instruct",
            max_num_batched_tokens=256,
            max_model_len=256,
            max_num_seqs=32,
            enforce_eager=False,
        )

        outputs = llm.generate(prompts, sampling_params)
        for output, answer in zip(outputs, answers):
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
            assert generated_text.startswith(answer)

    # Transformed-code dumps: one file per Dynamo compilation of forward().
    compiled_codes = sorted(
        glob.glob(os.path.join(temp_dir, "__transformed_code*for_forward.py"))
    )

    for i, compiled_code in enumerate(compiled_codes):
        print("{} file: {}".format(i + 1, compiled_code))

    # We should only trigger Dynamo compilation 2 times:
    # 1. Forward pass without kv_caches
    # 2. Forward pass with kv_caches
    # Check we have 2 compiled codes
    assert len(compiled_codes) == 2

    kv_cache_prefix = "kv_cache"
    attn_prefix = "ragged_paged_attention"

    def extract_compiled_index(s):
        # Pull the first integer out of a dump filename so the graph dumps
        # can be sorted in compilation order (lexicographic sort would not
        # order numeric suffixes correctly).
        parts = s.replace(".", "_").split("_")
        numbers = [int(part) for part in parts if part.isdigit()]
        return numbers[0]

    # Check all the compilations are as expected. The dump files include the
    # captured graph for the forward function of the nn.Module.
    compiled_fns = sorted(
        glob.glob(os.path.join(temp_dir, "__compiled_fn*Forward_graph*.py")),
        key=lambda s: extract_compiled_index(s),
    )

    for i, compiled_fn in enumerate(compiled_fns):
        print("{} file: {}".format(i + 1, compiled_fn))

    # The first compilation should not have any kv_caches
    with open(compiled_fns[0]) as f:
        content = f.read()
        assert kv_cache_prefix not in content

    # The second compilation should have kv_caches and the
    # ragged_paged_attention
    with open(compiled_fns[1]) as f:
        content = f.read()
        assert kv_cache_prefix in content and attn_prefix in content
|
||||
34
tests/tpu/test_custom_dispatcher.py
Normal file
34
tests/tpu/test_custom_dispatcher.py
Normal file
@@ -0,0 +1,34 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import CompilationMode
|
||||
|
||||
from ..utils import compare_two_settings
|
||||
|
||||
# --enforce-eager on TPU causes graph compilation
|
||||
# this times out default Health Check in the MQLLMEngine,
|
||||
# so we set the timeout here to 30s
|
||||
|
||||
|
||||
def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
    """Check that DYNAMO_TRACE_ONCE and STOCK_TORCH_COMPILE compilation
    modes produce matching behavior for the same model and settings."""
    shared_flags = [
        "--max-model-len=256",
        "--max-num-seqs=32",
        "--enforce-eager",
    ]
    with monkeypatch.context() as ctx:
        # --enforce-eager on TPU causes graph compilation, which can exceed
        # the default MQLLMEngine health-check timeout — extend it to 30s.
        ctx.setenv("VLLM_RPC_TIMEOUT", "30000")
        compare_two_settings(
            "Qwen/Qwen2.5-1.5B-Instruct",
            arg1=shared_flags + [f"-O{CompilationMode.DYNAMO_TRACE_ONCE}"],
            arg2=shared_flags + [f"-O{CompilationMode.STOCK_TORCH_COMPILE}"],
            env1={},
            env2={},
        )
|
||||
88
tests/tpu/test_moe_pallas.py
Normal file
88
tests/tpu/test_moe_pallas.py
Normal file
@@ -0,0 +1,88 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for the Pallas MOE implementation.
|
||||
|
||||
Run `pytest tests/kernels/moe/test_moe_pallas.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch_xla
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe as pallas_moe
|
||||
from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
|
||||
fused_moe as torch_moe,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# This suite exercises the Pallas TPU kernels; skip the whole module on
# any other platform.
if not current_platform.is_tpu():
    pytest.skip("This test needs a TPU.", allow_module_level=True)

# Parametrization grids for test_pallas_moe below.
NUM_EXPERTS = [8, 64]
EP_SIZE = [1]  # expert-parallel sizes; ep_size > 1 is skipped in the test
TOP_KS = [2, 6]
|
||||
|
||||
|
||||
# The Pallas GMM kernel requires num_tokens * topk to be a multiple of 16
@pytest.mark.parametrize("m", [8, 16, 64, 2048])
@pytest.mark.parametrize("n", [128, 1024, 2048])
@pytest.mark.parametrize("k", [128, 511, 1024])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("ep_size", EP_SIZE)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_pallas_moe(
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    ep_size: int,
    dtype: torch.dtype,
):
    """Compare the Pallas fused-MoE kernel against the iterative torch
    reference implementation on random inputs.

    Parametrized dimensions: m tokens, k hidden size, e experts, topk
    experts routed per token; w1 produces 2*n features (presumably the
    gate/up projections — TODO confirm) and w2 maps n back to k.
    """
    import torch_xla.core.xla_model as xm

    # Create all tensors directly on the XLA (TPU) device.
    with torch.device(xm.xla_device()):
        # Scale inputs down to keep bf16 activations in a well-behaved range.
        a = torch.randn((m, k), dtype=dtype) / 10
        w1 = torch.randn((e, 2 * n, k), dtype=dtype) / 10
        w2 = torch.randn((e, k, n), dtype=dtype) / 10

        # Router logits: one score per (token, expert) pair.
        score = torch.randn((m, e), dtype=dtype)

        # TODO: Support ep
        if ep_size > 1:
            pytest.skip("No support for ep_size > 1 yet")
        else:
            e_map = None  # no expert-parallel remapping

        # Run both implementations
        torch_output = torch_moe(
            hidden_states=a,
            w1=w1,
            w2=w2,
            gating_output=score,
            topk=topk,
            global_num_experts=e,
            expert_map=e_map,
            renormalize=False,
        )

        pallas_output = pallas_moe(
            hidden_states=a,
            w1=w1,
            w2=w2,
            gating_output=score,
            topk=topk,
            global_num_experts=e,
            expert_map=e_map,
            renormalize=False,
        )
        # Flush pending XLA work; the .cpu() transfers below block on results.
        torch_xla.sync(wait=False)

        # Compare outputs
        torch.testing.assert_close(
            pallas_output.cpu(),
            torch_output.cpu(),
            atol=2e-2,
            rtol=0,
        )
|
||||
52
tests/tpu/test_quantization_accuracy.py
Normal file
52
tests/tpu/test_quantization_accuracy.py
Normal file
@@ -0,0 +1,52 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import lm_eval
|
||||
import pytest
|
||||
|
||||
# lm-eval task name, used both for evaluation and for the result lookup.
TASK = "gsm8k"
# Metric key inside the task results: strict exact-match accuracy.
FILTER = "exact_match,strict-match"
# Allowed absolute deviation from the GPU-measured baseline accuracy.
RTOL = 0.03
|
||||
|
||||
|
||||
@dataclass
class GSM8KAccuracyTestConfig:
    """Pairs a model checkpoint with its expected GSM8K accuracy score."""

    # HuggingFace model id to evaluate.
    model_name: str
    # Baseline accuracy the measured score must stay close to.
    expected_value: float

    def get_model_args(self) -> str:
        """Render the lm-eval ``model_args`` string for this model."""
        return (
            f"pretrained={self.model_name},"
            "max_model_len=4096,"
            "max_num_seqs=32"
        )
|
||||
|
||||
|
||||
# NOTE: Accuracy scores measured on GPUs.
ACCURACY_CONFIGS = [
    GSM8KAccuracyTestConfig(
        model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
        expected_value=0.76,
    ),  # no bias
    # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
    # so only one of these tests can run in a single call to pytest. As
    # a follow-up, move this into the LM-EVAL section of the CI.
    # GSM8KAccuracyTestConfig(
    #     model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
    #     expected_value=0.66),  # bias in QKV layers
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("config", ACCURACY_CONFIGS)
def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
    """Run GSM8K via lm-eval against a quantized model on TPU and check the
    measured accuracy stays within RTOL of the GPU-measured baseline."""
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=config.get_model_args(),
        # FIX: use the shared TASK constant (previously hardcoded "gsm8k")
        # so the evaluated task and the result lookup below cannot drift.
        tasks=TASK,
        batch_size="auto",
    )

    expected_value = config.expected_value
    measured_value = results["results"][TASK][FILTER]
    # Equivalent to the original two-sided bound:
    # measured - RTOL < expected < measured + RTOL.
    assert (
        abs(measured_value - expected_value) < RTOL
    ), f"Expected: {expected_value} | Measured: {measured_value}"
|
||||
Reference in New Issue
Block a user