Files
xc-llm-ascend/tests/ut/sample/test_rejection_sampler.py
ZongYuan Zhan d8e15dae6c Optimize some rejectsampler functions to make npu op launch non-blocking (#4587)
### What this PR does / why we need it?
- Vectorize the loops (without changing their output) in several
rejection-sampler functions, including: `expand_pytorch`,
`sample_recovered_tokens_pytorch`, `rejection_random_sample_pytorch`, and
`sample_recovered_tokens`.
- Remove the synchronously launched torch_npu operators from these functions
to accelerate sampling and MTP post-processing.

### Does this PR introduce _any_ user-facing change?
- No

### How was this patch tested?
- We tested this change with the following serve and bench commands:
```
===== serve =====
vllm serve $LOCAL_CKPT_DIR \
        --host 0.0.0.0 \
        --port 8000 \
        --data-parallel-size 4 \
        --data-parallel-size-local 2 \
        --data-parallel-address $MASTER_NODE_IP \
        --data-parallel-start-rank $((2*VC_TASK_INDEX)) \
        --data-parallel-rpc-port 13387 \
        --tensor-parallel-size 8 \
        --seed 1024 \
        --enable-expert-parallel \
        --served-model-name $NAME \
        --max-model-len 4096 \
        --max-num-seqs 16 \
        --trust-remote-code \
        --gpu-memory-utilization 0.90 \
        $headless \
	    --speculative_config '{"method": "deepseek_mtp", "num_speculative_tokens": 1}' \
        --additional-config '{"ascend_scheduler_config":{"enabled":false, "enable_chunked_prefill":true, "chunked_prefill_enabled":true}}' 

==== bench =====
vllm bench serve --model $LOCAL_CKPT_DIR  --served-model-name DeepseekV3ForCausalLM \
--dataset-name spec_bench --spec-bench-output-len 2048 \
--dataset-path question.jsonl \
--top-p 1.0 --temperature 0.8 \
--ignore-eos \
--num-prompts 64  --trust-remote-code --base-url "http://0.0.0.0:8000" --request-rate 64
```
- In this case, the rejection-sampler optimization reduces TPOT from
84.94 ms to 64.61 ms, a gain of about 23%.

## before
<img width="1068" height="830" alt="image"
src="https://github.com/user-attachments/assets/278ac878-b49d-4588-b87c-316ca4d537f5"
/>

## after
<img width="781" height="756" alt="image"
src="https://github.com/user-attachments/assets/0c6d37ad-ed77-40b3-a1be-4933c468365c"
/>

- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

---------

Signed-off-by: ZongYuan Zhan <zhanzy178@gmail.com>
Co-authored-by: Yizhou <136800916+yiz-liu@users.noreply.github.com>
2025-12-29 14:10:39 +08:00

252 lines
9.3 KiB
Python

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from unittest.mock import patch
import torch
from tests.ut.base import TestBase
from vllm_ascend.sample.rejection_sampler import (
expand_batch_to_tokens, expand_pytorch, rejection_greedy_sample_pytorch,
rejection_random_sample_pytorch, sample_recovered_tokens_pytorch)
# Global constants
# Sentinel value the tests use to pre-fill output token buffers, so that
# positions the sampler never wrote to can be recognised afterwards.
PLACEHOLDER_TOKEN_ID = -1
# NOTE(review): not referenced by any test visible in this file - confirm it
# is still needed.
GREEDY_TEMPERATURE = 0.0
MAX_SPEC_LEN = 8 # Passed as MAX_NUM_TOKENS to expand_pytorch in test_expand_pytorch
def mock_pin_memory(original_func):
    """Wrap ``original_func`` so a truthy ``pin_memory`` keyword becomes False.

    The returned callable forwards all positional and keyword arguments to
    ``original_func`` unchanged, except that ``pin_memory`` is replaced by
    ``False`` whenever the caller supplied a truthy value for it.

    NOTE(review): presumably needed because pinned host memory is not
    available in the unit-test environment - confirm.
    """

    def func_wo_pin_memory(*args, **kwargs):
        # Fast path: pinning was not requested, forward the call untouched.
        if not kwargs.get('pin_memory', False):
            return original_func(*args, **kwargs)
        # Pinning was requested: forward a copy with the flag switched off.
        unpinned_kwargs = dict(kwargs, pin_memory=False)
        return original_func(*args, **unpinned_kwargs)

    return func_wo_pin_memory
class TestAscendRejectionSampler(TestBase):
    """Unit tests for the PyTorch rejection-sampling helpers of vllm_ascend.

    Every test patches ``torch.arange``, ``torch.ones``, ``torch.full`` and
    ``torch.tensor`` with ``mock_pin_memory`` wrappers, so any
    ``pin_memory=True`` request made while the test runs is downgraded to
    ``pin_memory=False``.
    """

    @patch('torch.arange', new=mock_pin_memory(torch.arange))
    @patch('torch.ones', new=mock_pin_memory(torch.ones))
    @patch('torch.full', new=mock_pin_memory(torch.full))
    @patch('torch.tensor', new=mock_pin_memory(torch.tensor))
    def test_rejection_greedy_sample_pytorch(self):
        """Test greedy rejection sampling: stop when draft doesn't match, otherwise append bonus token"""
        batch_size = 2
        max_spec_len = 2
        # One row per request, pre-filled with the placeholder so untouched
        # slots can be detected after the call.
        output_token_ids = torch.full((batch_size, max_spec_len + 1),
                                      PLACEHOLDER_TOKEN_ID)
        # Cumulative draft-token counts: both requests own two draft tokens.
        cu_num_draft_tokens = torch.tensor([2, 4])
        num_draft_tokens = [2, 2]
        draft_token_ids = torch.tensor([10, 11, 20, 21])
        # Both requests match at their first position and differ at the
        # second one.
        target_argmax = torch.tensor([10, 99, 20, 22])
        bonus_token_ids = torch.tensor([[100], [200]])
        is_greedy = torch.tensor([True, True])

        rejection_greedy_sample_pytorch(
            output_token_ids,
            cu_num_draft_tokens,
            draft_token_ids,
            target_argmax,
            bonus_token_ids,
            num_draft_tokens,
            max_spec_len,
            is_greedy,
        )

        # Request 0: accepted token, then the target argmax at the mismatch,
        # and no bonus token after the rejection.
        assert output_token_ids[0, 0].item() == 10
        assert output_token_ids[0, 1].item() == 99
        assert output_token_ids[0, 2].item() == PLACEHOLDER_TOKEN_ID
        # Request 1 has the same match/mismatch pattern, so the same shape
        # of result is expected.
        assert output_token_ids[1, 0].item() == 20
        assert output_token_ids[1, 1].item() == 22
        assert output_token_ids[1, 2].item() == PLACEHOLDER_TOKEN_ID

    @patch('torch.arange', new=mock_pin_memory(torch.arange))
    @patch('torch.ones', new=mock_pin_memory(torch.ones))
    @patch('torch.full', new=mock_pin_memory(torch.full))
    @patch('torch.tensor', new=mock_pin_memory(torch.tensor))
    def test_rejection_random_sample_pytorch(self):
        """Test random rejection sampling: accept based on uniform probability"""
        batch_size = 2
        max_spec_len = 3
        output_token_ids = torch.full((batch_size, max_spec_len + 1),
                                      PLACEHOLDER_TOKEN_ID)
        # Cumulative draft-token counts must be non-decreasing and end at
        # len(draft_token_ids): request 0 owns two tokens, request 1 owns one.
        cu_num_draft_tokens = torch.tensor([2, 3])
        draft_token_ids = torch.tensor([1, 0, 2])
        draft_probs = torch.tensor([
            [0.0, 0.6, 0.0, 0.4],  # vocab_size=4
            [0.1, 0.2, 0.3, 0.4],
            [0.5, 0.5, 0.0, 0.0],
        ])
        target_probs = torch.tensor([
            [0.0, 0.8, 0.0, 0.2],
            [0.2, 0.1, 0.3, 0.4],
            [0.9, 0.1, 0.0, 0.0],
        ])
        bonus_token_ids = torch.tensor([[100], [200]])
        recovered_token_ids = torch.tensor([1, 2, 3])
        uniform_probs = torch.tensor([0.7, 0.6, 0.5])
        is_greedy = torch.tensor([False, False])
        vocab_size = 4

        rejection_random_sample_pytorch(
            output_token_ids,
            cu_num_draft_tokens,
            draft_token_ids,
            draft_probs,
            target_probs,
            bonus_token_ids,
            recovered_token_ids,
            uniform_probs,
            is_greedy,
            max_spec_len,
            vocab_size,
            IS_NGRAM=False,
        )

        # Request 0: both draft tokens are kept and the bonus token follows.
        assert output_token_ids[0, 0].item() == 1
        assert output_token_ids[0, 1].item() == 0
        assert output_token_ids[0, 2].item() == 100

    @patch('torch.arange', new=mock_pin_memory(torch.arange))
    @patch('torch.ones', new=mock_pin_memory(torch.ones))
    @patch('torch.full', new=mock_pin_memory(torch.full))
    @patch('torch.tensor', new=mock_pin_memory(torch.tensor))
    def test_expand_pytorch(self):
        """Test expand_pytorch functionality"""
        input_ptr = torch.tensor([10, 20, 30], dtype=torch.int32)
        # Cumulative token counts: 2, 3 and 2 tokens for the three inputs.
        cu_num_tokens_ptr = torch.tensor([2, 5, 7])
        output_ptr = torch.empty(7, dtype=torch.int32)

        expand_pytorch(
            output_ptr,
            input_ptr,
            cu_num_tokens_ptr,
            replace_from=0,
            replace_to=0,
            MAX_NUM_TOKENS=MAX_SPEC_LEN,
        )

        # Each input value is repeated once per token of its request.
        expected = torch.tensor([10, 10, 20, 20, 20, 30, 30])
        assert torch.equal(output_ptr, expected)

    @patch('torch.arange', new=mock_pin_memory(torch.arange))
    @patch('torch.ones', new=mock_pin_memory(torch.ones))
    @patch('torch.full', new=mock_pin_memory(torch.full))
    @patch('torch.tensor', new=mock_pin_memory(torch.tensor))
    def test_expand_batch_to_tokens(self):
        """Test expand_batch_to_tokens wrapper"""
        x = torch.tensor([10, 20, 30])
        cu_num_tokens = torch.tensor([2, 5, 7])
        num_tokens = 7

        # Test PyTorch path
        with patch("vllm_ascend.sample.rejection_sampler.HAS_TRITON", False):
            with patch("vllm_ascend.sample.rejection_sampler.expand_pytorch"
                       ) as mock_pytorch:
                expand_batch_to_tokens(x, cu_num_tokens, num_tokens)
                mock_pytorch.assert_called_once()
                # Positional args 1 and 2 are the input and cumulative counts.
                args = mock_pytorch.call_args[0]
                assert (args[1] == x).all()
                assert (args[2] == cu_num_tokens).all()

        # Test Triton kernel path
        with patch("vllm_ascend.sample.rejection_sampler.HAS_TRITON", True):
            with patch("vllm_ascend.sample.rejection_sampler.expand_kernel"
                       ) as mock_triton:
                expand_batch_to_tokens(x, cu_num_tokens, num_tokens)
                # grid = triton.cdiv(n, BLOCK_SIZE) = triton.cdiv(3, 2) = 2
                mock_triton.__getitem__.assert_called_once_with((2, ))
                # The kernel launch is the call on the object returned by
                # indexing the kernel with the grid.
                call_args = mock_triton.__getitem__.return_value.call_args[0]
                assert (call_args[1] == x).all()
                assert (call_args[2] == cu_num_tokens).all()

        # Run actual function
        with patch("vllm_ascend.sample.rejection_sampler.HAS_TRITON", False):
            result = expand_batch_to_tokens(x, cu_num_tokens, num_tokens)
            expected = torch.tensor([10, 10, 20, 20, 20, 30, 30])
            assert torch.equal(result, expected)

    @patch('torch.arange', new=mock_pin_memory(torch.arange))
    @patch('torch.ones', new=mock_pin_memory(torch.ones))
    @patch('torch.full', new=mock_pin_memory(torch.full))
    @patch('torch.tensor', new=mock_pin_memory(torch.tensor))
    def test_sample_recovered_tokens_pytorch_ngram(self):
        """Test recovered token sampling under n-gram mode"""
        output_token_ids = torch.empty(2, dtype=torch.int32)
        cu_num_draft_tokens = torch.tensor([1, 2])
        draft_token_ids = torch.tensor([1, 2])
        # This test supplies no draft distribution in n-gram mode.
        draft_probs = None
        target_probs = torch.tensor([
            [0.1, 0.2, 0.7],
            [0.3, 0.3, 0.4],
        ])
        q = torch.tensor([
            [0.1, 0.2, 0.7],
            [0.5, 0.4, 0.1],
        ])
        vocab_size = 3

        sample_recovered_tokens_pytorch(
            output_token_ids,
            cu_num_draft_tokens,
            draft_token_ids,
            draft_probs,
            target_probs,
            q,
            vocab_size,
            IS_NGRAM=True,
        )

        assert output_token_ids[0].item() == 0
        assert output_token_ids[1].item() == 1

    @patch('torch.arange', new=mock_pin_memory(torch.arange))
    @patch('torch.ones', new=mock_pin_memory(torch.ones))
    @patch('torch.full', new=mock_pin_memory(torch.full))
    @patch('torch.tensor', new=mock_pin_memory(torch.tensor))
    def test_sample_recovered_tokens_pytorch_autoregressive(self):
        """Test recovered token sampling for autoregressive models"""
        output_token_ids = torch.empty(2, dtype=torch.int32)
        cu_num_draft_tokens = torch.tensor([1, 2])
        draft_token_ids = torch.tensor([0, 1])
        draft_probs = torch.tensor([
            [0.6, 0.1, 0.3],
            [0.2, 0.7, 0.1],
        ])
        target_probs = torch.tensor([
            [0.8, 0.1, 0.1],
            [0.3, 0.6, 0.1],
        ])
        q = torch.tensor([
            [0.5, 0.3, 0.2],
            [0.1, 0.8, 0.1],
        ])
        vocab_size = 3

        sample_recovered_tokens_pytorch(
            output_token_ids,
            cu_num_draft_tokens,
            draft_token_ids,
            draft_probs,
            target_probs,
            q,
            vocab_size,
            IS_NGRAM=False,
        )

        assert output_token_ids[0].item() == 0
        assert output_token_ids[1].item() == 0