Refactor e2e CI (#2276)
Refactor E2E CI to make it clearer and faster
1. Remove some useless e2e tests
2. Remove some useless functions
3. Make sure all tests run with VllmRunner to avoid OOM errors
4. Make sure all ops tests end with torch.empty_cache to avoid OOM errors
5. Run the tests one by one to avoid resource limit errors
- vLLM version: v0.10.1.1
- vLLM main:
a344a5aa0a
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -16,94 +16,34 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from typing import Optional
|
||||
from vllm import SamplingParams
|
||||
|
||||
import torch
|
||||
|
||||
# Default absolute and relative tolerances (1e-3) for comparing op outputs
|
||||
DEFAULT_ATOL = 1e-3
|
||||
DEFAULT_RTOL = 1e-3
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
|
||||
def apply_top_k_top_p(
|
||||
logits: torch.Tensor,
|
||||
k: Optional[torch.Tensor],
|
||||
p: Optional[torch.Tensor],
|
||||
) -> torch.Tensor:
|
||||
"""Apply top-k and top-p masks to the logits.
|
||||
def test_models_topk() -> None:
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
sampling_params = SamplingParams(max_tokens=5,
|
||||
temperature=0.0,
|
||||
top_k=50,
|
||||
top_p=0.9)
|
||||
|
||||
If a top-p is used, this function will sort the logits tensor,
|
||||
which can be slow for large batches.
|
||||
|
||||
The logits tensor may be updated in-place.
|
||||
"""
|
||||
logits_sort, logits_idx = logits.sort(dim=-1, descending=False)
|
||||
|
||||
if k is not None:
|
||||
# Apply top-k.
|
||||
top_k_mask = logits_sort.size(1) - k.to(torch.long) # shape: B
|
||||
# Get all the top_k values.
|
||||
top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1))
|
||||
top_k_mask = logits_sort < top_k_mask
|
||||
logits_sort.masked_fill_(top_k_mask, -float("inf"))
|
||||
|
||||
if p is not None:
|
||||
# Apply top-p.
|
||||
probs_sort = logits_sort.softmax(dim=-1)
|
||||
probs_sum = torch.cumsum(probs_sort, dim=-1, out=probs_sort)
|
||||
top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1)
|
||||
# at least one
|
||||
top_p_mask[:, -1] = False
|
||||
logits_sort.masked_fill_(top_p_mask, -float("inf"))
|
||||
|
||||
# Re-sort the probabilities.
|
||||
logits = logits_sort.scatter(dim=-1, index=logits_idx, src=logits_sort)
|
||||
return logits
|
||||
with VllmRunner("Qwen/Qwen3-0.6B",
|
||||
max_model_len=8192,
|
||||
gpu_memory_utilization=0.7) as runner:
|
||||
runner.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
def apply_top_k_top_p_new(
|
||||
logits: torch.Tensor,
|
||||
k: Optional[torch.Tensor],
|
||||
p: Optional[torch.Tensor],
|
||||
) -> torch.Tensor:
|
||||
batch_size, vocab_size = logits.shape
|
||||
logits_sort, logits_idx = logits.sort(dim=-1, descending=False)
|
||||
def test_models_prompt_logprobs() -> None:
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
|
||||
# Apply top-k.
|
||||
boundary = logits_sort.gather(1, (vocab_size - k).unsqueeze(dim=1))
|
||||
top_k_mask = logits_sort < boundary
|
||||
logits_sort.masked_fill_(top_k_mask, -float("inf"))
|
||||
|
||||
if p is not None:
|
||||
# Apply top-p.
|
||||
cutoff = top_k_mask.sum(dim=-1).min()
|
||||
probs_sort = logits_sort.softmax(dim=-1)[:, cutoff:]
|
||||
probs_sum = probs_sort.cumsum(dim=-1)
|
||||
top_p_mask = probs_sum > 1 - p.unsqueeze(dim=1)
|
||||
top_p_mask[:, -1] = True
|
||||
strides = torch.arange(0,
|
||||
batch_size * vocab_size,
|
||||
vocab_size,
|
||||
device=logits.device)
|
||||
flatten_idx = logits_idx[:, cutoff:] + strides.unsqueeze(dim=1)
|
||||
valid_idx = torch.masked_select(flatten_idx, top_p_mask)
|
||||
logits_flatten = logits.flatten()
|
||||
valid_logits = torch.index_select(logits_flatten, 0, valid_idx)
|
||||
logits = torch.empty_like(logits_flatten).fill_(-float("inf"))
|
||||
logits[valid_idx] = valid_logits
|
||||
return logits.reshape(batch_size, vocab_size)
|
||||
|
||||
|
||||
# test with leading dimension and merge seqlen and batch_size as num_tokens
|
||||
@torch.inference_mode()
|
||||
def test_apply_top_k_top_p() -> None:
|
||||
logits = torch.randn((128, 7168)).npu()
|
||||
k = torch.Tensor([-1]).int().npu()
|
||||
p = torch.Tensor([1]).int().npu()
|
||||
logits_new = apply_top_k_top_p_new(logits, k, p)
|
||||
logits_old = apply_top_k_top_p(logits, k, p)
|
||||
# Compare the results.
|
||||
torch.testing.assert_close(logits_new,
|
||||
logits_old,
|
||||
atol=DEFAULT_ATOL,
|
||||
rtol=DEFAULT_RTOL)
|
||||
with VllmRunner("Qwen/Qwen3-0.6B",
|
||||
max_model_len=8192,
|
||||
gpu_memory_utilization=0.7) as runner:
|
||||
runner.generate_greedy_logprobs(example_prompts,
|
||||
max_tokens=5,
|
||||
num_logprobs=1)
|
||||
|
||||
Reference in New Issue
Block a user