Model runner v2: support Triton penalties kernel (#5854)

### What this PR does / why we need it? Optimizes the performance of the penalties operator and adds unit tests. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? Tested with Qwen2.5-VL 7B; operator time improved by 90%. - vLLM version: v0.13.0 - vLLM main: 2f4e6548ef This PR is for https://github.com/vllm-project/vllm-ascend/issues/5208 Signed-off-by: shiyuan680 <917935075@qq.com>
2026-01-20 20:26:05 +08:00
parent afabb49f00
commit cea48c2a34
2 changed files with 231 additions and 3 deletions
--- a/vllm_ascend/worker/v2/sample/penalties.py
+++ b/vllm_ascend/worker/v2/sample/penalties.py
@@ -72,7 +72,8 @@ def _penalties_and_temperature_kernel(
            block,
            mask=mask,
        )
-        output_bin_mask = output_bin_counts > 0
+        # Use `!= 0` rather than `> 0`: `!= 0` runs on the vector core, while `> 0` falls back to scalar execution and is slower.
+        output_bin_mask = output_bin_counts != 0

        # Apply repetition penalties.
        if use_rep_penalty:
@@ -83,8 +84,15 @@ def _penalties_and_temperature_kernel(
                packed_block,
                mask=packed_block < tl.cdiv(vocab_size, 32),
            )
-            prompt_bin_mask = (packed_mask[:, None] >>
-                               (tl.arange(0, 32)[None, :])) & 1
+            # The compiler does not optimize right-shift operations, so extract each bit with an equivalent mask-and-compare instead.
+            bit_masks = 1 << tl.arange(0, 32)
+            bit_masks_expanded = bit_masks[None, :]
+
+            packed_expanded = packed_mask[:, None]
+            bits_matrix = (packed_expanded & bit_masks_expanded) != 0
+
+            prompt_bin_mask = bits_matrix.reshape(BLOCK_SIZE)
+
            prompt_bin_mask = prompt_bin_mask.to(tl.int1)
            prompt_bin_mask = prompt_bin_mask.reshape(BLOCK_SIZE)