optimize the funtion of computing topk and topp in sampler. (#970)

### What this PR does / why we need it? Optimize the performance of calculation logic in sampler and deepseekv2. ### Does this PR introduce _any_ user-facing change? Added VLLM_ENABLE_TOPK_OPTIMZE config in sampler ### How was this patch tested? pytest test_sampler.py Signed-off-by: wangxiaoxin (A) <wangxiaoxin7@huawei.com> Co-authored-by: wangxiaoxin (A) <wangxiaoxin7@huawei.com> Co-authored-by: ZhengWG <zwg0606@gmail.com>
2025-06-05 16:42:18 +08:00
parent e1ab6d318e
commit 908a851a77
9 changed files with 330 additions and 3 deletions
--- a/tests/multicard/test_offline_inference_distributed.py
+++ b/tests/multicard/test_offline_inference_distributed.py
@@ -21,8 +21,10 @@
 Run `pytest tests/test_offline_inference.py`.
 """
 import os
+from unittest.mock import patch

 import vllm  # noqa: F401
+from vllm import SamplingParams

 from tests.conftest import VllmRunner

@@ -57,3 +59,25 @@ def test_models_distributed_DeepSeek():
            distributed_executor_backend="mp",
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_OPTIMZE": "1"})
+def test_models_distributed_topk() -> None:
+    example_prompts = [
+        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
+        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
+        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
+    ]
+    dtype = "half"
+    sampling_params = SamplingParams(max_tokens=5,
+                                     temperature=0.0,
+                                     top_k=50,
+                                     top_p=0.9)
+
+    with VllmRunner(
+            "deepseek-ai/DeepSeek-V2-Lite",
+            dtype=dtype,
+            tensor_parallel_size=4,
+            distributed_executor_backend="mp",
+    ) as vllm_model:
+        vllm_model.generate(example_prompts, sampling_params)