diff --git a/docs/source/user_guide/configuration/additional_config.md b/docs/source/user_guide/configuration/additional_config.md
index 3775683c..8f91f43e 100644
--- a/docs/source/user_guide/configuration/additional_config.md
+++ b/docs/source/user_guide/configuration/additional_config.md
@@ -42,6 +42,7 @@ The following table lists additional configuration options available in vLLM Ascend:
 | `expert_map_record_path` | str | `None` | Save the expert load calculation results to a new expert table in the specified directory. |
 | `init_redundancy_expert` | int | `0` | Specify redundant experts during initialization. |
 | `dump_config` | str | `None` | Configuration file path for msprobe dump(eager mode). |
+| `enable_async_exponential` | int | `0` | Whether to generate the exponential random tensor used by sampling on a separate stream so that it overlaps with model execution. Set to `1` to enable. |
 
 The details of each configuration option are as follows:
 
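For quick reference, a hypothetical offline-inference snippet that turns the option on (not part of this patch; it assumes vLLM's `LLM` entry point forwards `additional_config` to the platform plugin, the same path the e2e test below exercises via `VllmRunner`; model name and sampling values are illustrative):

```python
# Hypothetical usage sketch: only the additional_config key comes from
# this patch; everything else is illustrative.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen3-0.6B",
          max_model_len=8192,
          additional_config={"enable_async_exponential": 1})
params = SamplingParams(max_tokens=5, temperature=1.0, top_k=50, top_p=0.9)
outputs = llm.generate(["Hello, my name is"], params)
print(outputs[0].outputs[0].text)
```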
diff --git a/tests/e2e/singlecard/test_sampler.py b/tests/e2e/singlecard/test_sampler.py
index 424343b0..73055f3a 100644
--- a/tests/e2e/singlecard/test_sampler.py
+++ b/tests/e2e/singlecard/test_sampler.py
@@ -47,3 +47,21 @@ def test_models_prompt_logprobs() -> None:
         runner.generate_greedy_logprobs(example_prompts,
                                         max_tokens=5,
                                         num_logprobs=1)
+
+
+def test_exponential_overlap() -> None:
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    sampling_params = SamplingParams(max_tokens=5,
+                                     temperature=1.0,
+                                     top_k=50,
+                                     top_p=0.9)
+
+    with VllmRunner("Qwen/Qwen3-0.6B",
+                    max_model_len=8192,
+                    gpu_memory_utilization=0.7,
+                    additional_config={
+                        "enable_async_exponential": 1,
+                    }) as runner:
+        runner.generate(example_prompts, sampling_params)
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
index 77b5251d..c3474365 100644
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -161,6 +161,11 @@ class AscendConfig:
                           False):
             kv_cfg.engine_id = f"{kv_cfg.engine_id}-{uuid4().hex}"
             kv_cfg._engine_id_patched = True
+        self.enable_async_exponential = additional_config.get(
+            "enable_async_exponential", 0)
+        if self.enable_async_exponential not in (0, 1):
+            raise ValueError(
+                "enable_async_exponential can only be set to 0 or 1.")
 
 
 class FinegrainedTPConfig:
diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py
index 1ea661cf..3d4fbe22 100644
--- a/vllm_ascend/sample/sampler.py
+++ b/vllm_ascend/sample/sampler.py
@@ -3,6 +3,7 @@ import torch_npu
 from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler
 from vllm.v1.sample.sampler import Sampler
 
+from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import (AscendDeviceType, get_ascend_device_type,
                                global_stream, npu_stream_switch)
 
@@ -41,10 +42,35 @@ class AscendSampler(Sampler):
         # TODO: support logprobs_mode in vllm-ascend
         super().__init__(logprobs_mode=logprobs_mode)
         self.topk_topp_sampler = AscendTopKTopPSampler()
+        self.async_exponential_event = torch.npu.Event()
+
+    def set_q_event(self, q, event):
+        self.topk_topp_sampler.set_q_event(q, event)
+
+    def do_async_exponential(self, batch_size, vocab_size, generators):
+        # Generate the exponential random tensor on a side stream, overlapping
+        # it with model execution; sync the side stream with pending work first.
+        global_stream().wait_stream(torch.npu.current_stream())
+        with torch.npu.stream(global_stream()):
+            q = torch.empty((batch_size, vocab_size), device="npu", dtype=torch.float32)
+            # Default exponential kernel unless every row has its own generator; seeded rows are re-drawn (AI-CPU path).
+            if len(generators) != q.shape[0]:
+                q.exponential_()
+            if generators:
+                for i, generator in generators.items():
+                    q[i].exponential_(generator=generator)
+            self.async_exponential_event.record()
+        self.set_q_event(q, self.async_exponential_event)
 
 
 class AscendTopKTopPSampler(TopKTopPSampler):
 
+    def set_q_event(self, q, event):
+        # Store the async exponential tensor and the event that signals
+        # when it is safe to consume.
+        self.q = q
+        self.async_event = event
+
     def _apply_top_k_top_p(
         self,
         logits: torch.Tensor,
@@ -99,4 +125,8 @@
             logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)
 
         probs = logits.softmax(dim=-1, dtype=torch.float32)
+        if get_ascend_config().enable_async_exponential == 1:
+            # Wait until the side stream has finished producing self.q.
+            self.async_event.synchronize()
+            return probs.div_(self.q).argmax(dim=-1).view(-1), logits_to_return
         return random_sample(probs, generators), logits_to_return
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 1b32e359..86036a17 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1385,6 +1385,12 @@ class NPUModelRunner(GPUModelRunner):
         aclgraph_runtime_mode, batch_descriptor = \
             self.cudagraph_dispatcher.dispatch(num_tokens=num_input_tokens,
                                                uniform_decode=uniform_decode, has_lora=has_lora)
+        if self.ascend_config.enable_async_exponential != 0:
+            self.sampler.do_async_exponential(
+                batch_size=logits_indices.shape[0],
+                vocab_size=self.model_config.get_vocab_size(),
+                generators=self.input_batch.sampling_metadata.generators)
+
         # Run forward pass
         with ProfileExecuteDuration().capture_async("forward"):
             with set_ascend_forward_context(
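Beyond the diff itself, the change is an instance of the classic produce-on-a-side-stream, consume-after-event pattern. A minimal self-contained sketch of that pattern follows (illustrative only, not part of the patch; it assumes an Ascend device with `torch_npu` installed, whose `torch.npu` namespace mirrors `torch.cuda` stream semantics, and the shapes are made up):

```python
# Sketch of the overlap pattern used by this PR; shapes and the stand-in
# probs tensor are illustrative, not taken from the patch.
import torch
import torch_npu  # noqa: F401  (registers the torch.npu backend)

side_stream = torch.npu.Stream()
ready = torch.npu.Event()

# The side stream must not run ahead of work already queued on the
# current stream (e.g. anything that produced tensors it will read).
side_stream.wait_stream(torch.npu.current_stream())

# Enqueue the independent work on the side stream; the default stream is
# free to keep executing the model forward pass in the meantime.
with torch.npu.stream(side_stream):
    q = torch.empty((4, 32000), device="npu").exponential_()
    ready.record()  # records on the side stream (the current stream here)

# ... the model forward pass runs here on the default stream ...

# Block until q is fully materialized, then sample via the exponential
# trick: argmax(probs / q) draws index i with probability probs[i].
ready.synchronize()
probs = torch.rand(4, 32000, device="npu").softmax(dim=-1)  # stand-in probs
samples = probs.div_(q).argmax(dim=-1)
```

Recording the event on the side stream and synchronizing on it right before `probs.div_(self.q)` is what keeps the overlap safe; skipping either step can let the sampler read a half-filled `q`.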