[Performance] Overlap async exponential with model execution (#4501)
### What this PR does / why we need it?
Add a control that enables the exponential distribution operator to
overlap with model execution (default is OFF, because this feature
may not perform well on MoE models, e.g. Qwen3-30B).
Enabling async exponential overlap provides a performance
improvement.
Overlapping the exponential operator with model execution also hides
the performance drop introduced by the AI-CPU version of the
exponential operator.
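For intuition, here is a minimal sketch of the overlap pattern. This is not the PR's exact code: `side_stream`, `done`, and `launch_async_exponential` are illustrative names, and it assumes an Ascend build of PyTorch with `torch_npu` installed.

```python
import torch
import torch_npu  # registers the torch.npu backend

side_stream = torch.npu.Stream()
done = torch.npu.Event()

def launch_async_exponential(batch_size: int, vocab_size: int) -> torch.Tensor:
    # Order the side stream after work already queued on the main stream,
    # then generate the exponential noise there so it runs concurrently
    # with the model forward pass issued next on the main stream.
    side_stream.wait_stream(torch.npu.current_stream())
    with torch.npu.stream(side_stream):
        noise = torch.empty((batch_size, vocab_size),
                            device="npu", dtype=torch.float32).exponential_()
        done.record()  # marks the point at which the noise is ready
    return noise

# Right before sampling on the main stream:
#   done.synchronize()  # block until the noise has actually been produced
```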
**UPDATE**: (12/12)
The overlap now uses the same stream introduced in PR #4908.
We moved `do_async_exponential` from `model_runner_v1.py` to
`sampler.py`.
Async exponential is now enabled through `additional_config`:
add `"enable_async_exponential": 1` to `additional_config`.
We now **ONLY** support the default exponential and the AI-CPU
exponential; the old `"enable_async_exponential": 2` option has been
dropped for consistency.
### Does this PR introduce _any_ user-facing change?
**YES**, this PR adds a new `additional_config` option:
`"enable_async_exponential": 1`.
When `enable_async_exponential` is set to 1, async exponential is
enabled and overlapped with the model runner.
When `enable_async_exponential` is set to 0 (the default), async
exponential is disabled, but the exponential still runs on a
separate stream, using the stream introduced in #4908.
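A usage sketch, assuming vLLM's offline `LLM` entrypoint forwards `additional_config` to the engine in the same way vllm-ascend's other `additional_config` options are passed:

```python
from vllm import LLM, SamplingParams

# additional_config is forwarded to vllm-ascend's AscendConfig.
llm = LLM(model="Qwen/Qwen3-0.6B",
          additional_config={"enable_async_exponential": 1})
params = SamplingParams(max_tokens=5, temperature=1.0, top_k=50, top_p=0.9)
outputs = llm.generate(["Hello, my name is"], params)
```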
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: YuhanBai <yuhan.bai0830@gmail.com>
@@ -42,6 +42,7 @@ The following table lists additional configuration options available in vLLM Asc
| `expert_map_record_path` | str | `None` | Save the expert load calculation results to a new expert table in the specified directory. |
| `init_redundancy_expert` | int | `0` | Specify redundant experts during initialization. |
| `dump_config` | str | `None` | Configuration file path for msprobe dump (eager mode). |
| `enable_async_exponential` | int | `0` | Whether to enable async exponential overlap. To enable it, set this config to `1`. |

The details of each configuration option are as follows:
@@ -47,3 +47,21 @@ def test_models_prompt_logprobs() -> None:
        runner.generate_greedy_logprobs(example_prompts,
                                        max_tokens=5,
                                        num_logprobs=1)


def test_exponential_overlap() -> None:
    example_prompts = [
        "Hello, my name is",
    ]
    sampling_params = SamplingParams(max_tokens=5,
                                     temperature=1.0,
                                     top_k=50,
                                     top_p=0.9)

    with VllmRunner("Qwen/Qwen3-0.6B",
                    max_model_len=8192,
                    gpu_memory_utilization=0.7,
                    additional_config={
                        "enable_async_exponential": 1,
                    }) as runner:
        runner.generate(example_prompts, sampling_params)

@@ -161,6 +161,11 @@ class AscendConfig:
                   False):
            kv_cfg.engine_id = f"{kv_cfg.engine_id}-{uuid4().hex}"
            kv_cfg._engine_id_patched = True
        self.enable_async_exponential = additional_config.get(
            "enable_async_exponential", 0)
        if self.enable_async_exponential not in (0, 1):
            raise AssertionError(
                "Enable async exponential can only be set to 0 or 1.")


class FinegrainedTPConfig:
@@ -3,6 +3,7 @@ import torch_npu
from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler
from vllm.v1.sample.sampler import Sampler

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.utils import (AscendDeviceType, get_ascend_device_type,
                               global_stream, npu_stream_switch)

@@ -41,10 +42,35 @@ class AscendSampler(Sampler):
        # TODO: support logprobs_mode in vllm-ascend
        super().__init__(logprobs_mode=logprobs_mode)
        self.topk_topp_sampler = AscendTopKTopPSampler()
        self.async_exponential_event = torch.npu.Event()

    def set_q_event(self, q, event):
        self.topk_topp_sampler.set_q_event(q, event)

    def do_async_exponential(self, b_s, head_dim, generators):
        # Compute the exponential randoms on a separate stream,
        # overlapping them with model execution.
        with torch.npu.stream(global_stream()):
            global_stream().wait_stream(torch.npu.current_stream())
            q = torch.empty((b_s, head_dim), device="npu", dtype=torch.float32)
            # Works with both the AI-CPU exponential and the default exponential.
            if len(generators) != q.shape[0]:
                q.exponential_()
            if generators:
                for i, generator in generators.items():
                    q[i].exponential_(generator=generator)
            self.async_exponential_event.record()
        self.set_q_event(q, self.async_exponential_event)


class AscendTopKTopPSampler(TopKTopPSampler):

    def set_q_event(self, q, event):
        # Store the async exponential results, together with the event
        # that guards them against premature use.
        self.q = q
        self.async_event = event

    def _apply_top_k_top_p(
        self,
        logits: torch.Tensor,
@@ -99,4 +125,8 @@ class AscendTopKTopPSampler(TopKTopPSampler):
            logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)

        probs = logits.softmax(dim=-1, dtype=torch.float32)
        if get_ascend_config().enable_async_exponential == 1:
            # Wait until the side stream has produced the noise.
            self.async_event.synchronize()
            return probs.div_(self.q).argmax(dim=-1).view(-1), logits_to_return
        return random_sample(probs, generators), logits_to_return
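As a side note on why `probs.div_(self.q).argmax(dim=-1)` is a valid sample: for i.i.d. exponential noise `E_i ~ Exp(1)`, `argmax_i(p_i / E_i) = argmin_i(E_i / p_i)` picks index `i` with probability `p_i` (the exponential-race trick, equivalent to Gumbel-max sampling). A quick CPU sanity check:

```python
import torch

probs = torch.tensor([0.1, 0.3, 0.6])
draws = torch.empty(200_000, 3).exponential_()  # E ~ Exp(1)
samples = (probs / draws).argmax(dim=-1)        # exponential race
print(torch.bincount(samples, minlength=3) / samples.numel())
# -> approximately tensor([0.1, 0.3, 0.6])
```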
@@ -1385,6 +1385,12 @@ class NPUModelRunner(GPUModelRunner):
        aclgraph_runtime_mode, batch_descriptor = \
            self.cudagraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)

        if self.ascend_config.enable_async_exponential != 0:
            self.sampler.do_async_exponential(
                b_s=logits_indices.shape[0],
                head_dim=self.model_config.get_vocab_size(),
                generators=self.input_batch.sampling_metadata.generators)

        # Run forward pass
        with ProfileExecuteDuration().capture_async("forward"):
            with set_ascend_forward_context(