[Performance] Pre-issued exponential distribution operator. (#4908)
Pre-issued exponential distribution operator.
Result:
A single inference saves 200–300 microseconds.
Before:
<img width="2257" height="1058" alt="2"
src="https://github.com/user-attachments/assets/c1da19e2-a439-42cb-9d7c-c0218e61fd4c"
/>
After:
<img width="2211" height="342" alt="image"
src="https://github.com/user-attachments/assets/03c84292-c802-4755-949c-4266a9a72fc0"
/>
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
This commit is contained in:
@@ -17,9 +17,12 @@ class TestAscendSampler(TestBase):


 class TestAscendTopKTopPSampler(TestBase):

+    @mock.patch("vllm_ascend.sample.sampler.random_sample")
     @mock.patch("torch_npu.npu_top_k_top_p")
-    def test_npu_topk_topp_called_when_optimized(self, mock_npu_op):
+    def test_npu_topk_topp_called_when_optimized(self, mock_npu_op,
+                                                 mock_random_sample):
         mock_npu_op.return_value = (torch.randn(1, 3))
+        mock_random_sample.return_value = torch.randn(3)
         sampler = AscendTopKTopPSampler()

         logits = torch.tensor([[1.0, 2.0, 3.0]])
Reference in New Issue
Block a user