xc-llm-ascend/vllm_ascend/patch/worker/patch_common/patch_sampler.py

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Optional

import torch
import torch_npu
from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample

from vllm_ascend import envs


def apply_top_k_top_p(
    logits: torch.Tensor,
    k: torch.Tensor,
    p: torch.Tensor,
) -> torch.Tensor:
    if p is not None and k is not None:
        # npu_top_k_top_p's parameter order is (logits, p, k), not (logits, k, p)
        return torch_npu.npu_top_k_top_p(logits, p, k)

    probs = logits.softmax(dim=-1)
    probs_sort, _ = probs.sort(dim=-1, descending=False)

    if k is not None:
        top_k_count = probs_sort.size(1) - k.to(torch.long)  # shape: (batch, )
        top_k_count = top_k_count.unsqueeze(dim=1)
        top_k_cutoff = probs_sort.gather(-1, top_k_count)

        # Make sure the no top-k rows are no-op.
        no_top_k_mask = (k == logits.shape[1]).unsqueeze(dim=1)
        top_k_cutoff.masked_fill_(no_top_k_mask, -float("inf"))

        elements_to_discard = probs < top_k_cutoff
        logits.masked_fill_(elements_to_discard, -float("inf"))

    if p is not None:
        cumprob = torch.cumsum(probs_sort, dim=-1)
        top_p_mask = cumprob <= 1 - p.unsqueeze(dim=1)
        top_p_mask[:, -1] = False  # at least one

        top_p_count = top_p_mask.sum(dim=-1).unsqueeze(1)
        top_p_cutoff = probs_sort.gather(-1, top_p_count)
        elements_to_discard = probs < top_p_cutoff
        logits.masked_fill_(elements_to_discard, -float("inf"))

    return logits


def topk_topp_forward_native(
    self,
    logits: torch.Tensor,
    generators: dict[int, torch.Generator],
    k: Optional[torch.Tensor],
    p: Optional[torch.Tensor],
) -> torch.Tensor:
    """
    PyTorch-native implementation of top-k and top-p sampling.

    The logits tensor may be updated in-place.
    """
    logits = apply_top_k_top_p(logits, k, p)
    probs = logits.softmax(dim=-1, dtype=torch.float32)
    return random_sample(probs, generators)


if envs.VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION:
    TopKTopPSampler.forward_native = topk_topp_forward_native
[Perf] add patch to optimize apply_topk_topp (#1732) ### What this PR does / why we need it? Performance optimization for apply_top_k_top_p ### Does this PR introduce _any_ user-facing change? Use VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION to enable this feature ### How was this patch tested? e2e & ut - vLLM version: v0.9.2 - vLLM main: https://github.com/vllm-project/vllm/commit/6a9e6b2abf88181f93a1959fe16291c3f1696329 Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com> 2025-07-11 15:32:02 +08:00			`#`
			`# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.`
			`# SPDX-License-Identifier: Apache-2.0`
			`# This file is a part of the vllm-ascend project.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`from typing import Optional`

			`import torch`
			`import torch_npu`
			`from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample`

			`from vllm_ascend import envs`


			`def apply_top_k_top_p(`
			`logits: torch.Tensor,`
			`k: torch.Tensor,`
			`p: torch.Tensor,`
			`) -> torch.Tensor:`
			`if p is not None and k is not None:`
			`# npu_top_k_top_p's parameter order is (logits, p, k), not (logits, k, p)`
			`return torch_npu.npu_top_k_top_p(logits, p, k)`

			`probs = logits.softmax(dim=-1)`
			`probs_sort, _ = probs.sort(dim=-1, descending=False)`

			`if k is not None:`
			`top_k_count = probs_sort.size(1) - k.to(torch.long) # shape: (batch, )`
			`top_k_count = top_k_count.unsqueeze(dim=1)`
			`top_k_cutoff = probs_sort.gather(-1, top_k_count)`

			`# Make sure the no top-k rows are no-op.`
			`no_top_k_mask = (k == logits.shape[1]).unsqueeze(dim=1)`
			`top_k_cutoff.masked_fill_(no_top_k_mask, -float("inf"))`

			`elements_to_discard = probs < top_k_cutoff`
			`logits.masked_fill_(elements_to_discard, -float("inf"))`

			`if p is not None:`
			`cumprob = torch.cumsum(probs_sort, dim=-1)`
			`top_p_mask = cumprob <= 1 - p.unsqueeze(dim=1)`
			`top_p_mask[:, -1] = False # at least one`

			`top_p_count = top_p_mask.sum(dim=-1).unsqueeze(1)`
			`top_p_cutoff = probs_sort.gather(-1, top_p_count)`
			`elements_to_discard = probs < top_p_cutoff`
			`logits.masked_fill_(elements_to_discard, -float("inf"))`

			`return logits`


			`def topk_topp_forward_native(`
			`self,`
			`logits: torch.Tensor,`
			`generators: dict[int, torch.Generator],`
			`k: Optional[torch.Tensor],`
			`p: Optional[torch.Tensor],`
			`) -> torch.Tensor:`
			`"""`
			`PyTorch-native implementation of top-k and top-p sampling.`

			`The logits tensor may be updated in-place.`
			`"""`
			`logits = apply_top_k_top_p(logits, k, p)`
			`probs = logits.softmax(dim=-1, dtype=torch.float32)`
			`return random_sample(probs, generators)`


			`if envs.VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION:`
			`TopKTopPSampler.forward_native = topk_topp_forward_native`