diff --git a/docs/source/user_guide/configuration/additional_config.md b/docs/source/user_guide/configuration/additional_config.md
index 3775683c..8f91f43e 100644
--- a/docs/source/user_guide/configuration/additional_config.md
+++ b/docs/source/user_guide/configuration/additional_config.md
@@ -42,6 +42,7 @@ The following table lists additional configuration options available in vLLM Ascend:
 | `expert_map_record_path` | str | `None` | Save the expert load calculation results to a new expert table in the specified directory. |
 | `init_redundancy_expert` | int | `0` | Specify redundant experts during initialization. |
 | `dump_config` | str | `None` | Configuration file path for msprobe dump(eager mode). |
+| `enable_async_exponential` | int | `0` | Whether to generate the exponential random tensor used by sampling on a separate stream so that it overlaps with model execution. Set to `1` to enable. |
 
 The details of each configuration option are as follows:
 
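For quick reference, a hypothetical offline-inference snippet that turns the option on (not part of this patch; it assumes vLLM's `LLM` entry point forwards `additional_config` to the platform plugin, the same path the e2e test below exercises via `VllmRunner`; model name and sampling values are illustrative):

```python
# Hypothetical usage sketch: only the additional_config key comes from
# this patch; everything else is illustrative.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen3-0.6B",
          max_model_len=8192,
          additional_config={"enable_async_exponential": 1})
params = SamplingParams(max_tokens=5, temperature=1.0, top_k=50, top_p=0.9)
outputs = llm.generate(["Hello, my name is"], params)
print(outputs[0].outputs[0].text)
```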
diff --git a/tests/e2e/singlecard/test_sampler.py b/tests/e2e/singlecard/test_sampler.py
index 424343b0..73055f3a 100644
--- a/tests/e2e/singlecard/test_sampler.py
+++ b/tests/e2e/singlecard/test_sampler.py
@@ -47,3 +47,21 @@ def test_models_prompt_logprobs() -> None:
         runner.generate_greedy_logprobs(example_prompts,
                                         max_tokens=5,
                                         num_logprobs=1)
+
+
+def test_exponential_overlap() -> None:
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    sampling_params = SamplingParams(max_tokens=5,
+                                     temperature=1.0,
+                                     top_k=50,
+                                     top_p=0.9)
+
+    with VllmRunner("Qwen/Qwen3-0.6B",
+                    max_model_len=8192,
+                    gpu_memory_utilization=0.7,
+                    additional_config={
+                        "enable_async_exponential": 1,
+                    }) as runner:
+        runner.generate(example_prompts, sampling_params)
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
index 77b5251d..c3474365 100644
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -161,6 +161,11 @@ class AscendConfig:
                           False):
             kv_cfg.engine_id = f"{kv_cfg.engine_id}-{uuid4().hex}"
             kv_cfg._engine_id_patched = True
+        self.enable_async_exponential = additional_config.get(
+            "enable_async_exponential", 0)
+        if self.enable_async_exponential not in (0, 1):
+            raise ValueError(
+                "enable_async_exponential can only be set to 0 or 1.")
 
 
 class FinegrainedTPConfig:
diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py
index 1ea661cf..3d4fbe22 100644
--- a/vllm_ascend/sample/sampler.py
+++ b/vllm_ascend/sample/sampler.py
@@ -3,6 +3,7 @@ import torch_npu
 from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler
 from vllm.v1.sample.sampler import Sampler
 
+from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.utils import (AscendDeviceType, get_ascend_device_type,
                                global_stream, npu_stream_switch)
 
@@ -41,10 +42,35 @@ class AscendSampler(Sampler):
         # TODO: support logprobs_mode in vllm-ascend
         super().__init__(logprobs_mode=logprobs_mode)
         self.topk_topp_sampler = AscendTopKTopPSampler()
+        self.async_exponential_event = torch.npu.Event()
+
+    def set_q_event(self, q, event):
+        self.topk_topp_sampler.set_q_event(q, event)
+
+    def do_async_exponential(self, batch_size, vocab_size, generators):
+        # Generate the exponential random tensor on a side stream, overlapping
+        # it with model execution; sync the side stream with pending work first.
+        global_stream().wait_stream(torch.npu.current_stream())
+        with torch.npu.stream(global_stream()):
+            q = torch.empty((batch_size, vocab_size), device="npu", dtype=torch.float32)
+            # Default exponential kernel unless every row has its own generator; seeded rows are re-drawn (AI-CPU path).
+            if len(generators) != q.shape[0]:
+                q.exponential_()
+            if generators:
+                for i, generator in generators.items():
+                    q[i].exponential_(generator=generator)
+            self.async_exponential_event.record()
+        self.set_q_event(q, self.async_exponential_event)
 
 
 class AscendTopKTopPSampler(TopKTopPSampler):
 
+    def set_q_event(self, q, event):
+        # Store the async exponential tensor and the event that signals
+        # when it is safe to consume.
+        self.q = q
+        self.async_event = event
+
     def _apply_top_k_top_p(
         self,
         logits: torch.Tensor,
@@ -99,4 +125,8 @@
             logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)
 
         probs = logits.softmax(dim=-1, dtype=torch.float32)
+        if get_ascend_config().enable_async_exponential == 1:
+            # Wait until the side stream has finished producing self.q.
+            self.async_event.synchronize()
+            return probs.div_(self.q).argmax(dim=-1).view(-1), logits_to_return
         return random_sample(probs, generators), logits_to_return
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 1b32e359..86036a17 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1385,6 +1385,12 @@ class NPUModelRunner(GPUModelRunner):
         aclgraph_runtime_mode, batch_descriptor = \
             self.cudagraph_dispatcher.dispatch(num_tokens=num_input_tokens,
                                                uniform_decode=uniform_decode, has_lora=has_lora)
+        if self.ascend_config.enable_async_exponential != 0:
+            self.sampler.do_async_exponential(
+                batch_size=logits_indices.shape[0],
+                vocab_size=self.model_config.get_vocab_size(),
+                generators=self.input_batch.sampling_metadata.generators)
+
         # Run forward pass
         with ProfileExecuteDuration().capture_async("forward"):
             with set_ascend_forward_context(
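Beyond the diff itself, the change is an instance of the classic produce-on-a-side-stream, consume-after-event pattern. A minimal self-contained sketch of that pattern follows (illustrative only, not part of the patch; it assumes an Ascend device with `torch_npu` installed, whose `torch.npu` namespace mirrors `torch.cuda` stream semantics, and the shapes are made up):

```python
# Sketch of the overlap pattern used by this PR; shapes and the stand-in
# probs tensor are illustrative, not taken from the patch.
import torch
import torch_npu  # noqa: F401  (registers the torch.npu backend)

side_stream = torch.npu.Stream()
ready = torch.npu.Event()

# The side stream must not run ahead of work already queued on the
# current stream (e.g. anything that produced tensors it will read).
side_stream.wait_stream(torch.npu.current_stream())

# Enqueue the independent work on the side stream; the default stream is
# free to keep executing the model forward pass in the meantime.
with torch.npu.stream(side_stream):
    q = torch.empty((4, 32000), device="npu").exponential_()
    ready.record()  # records on the side stream (the current stream here)

# ... the model forward pass runs here on the default stream ...

# Block until q is fully materialized, then sample via the exponential
# trick: argmax(probs / q) draws index i with probability probs[i].
ready.synchronize()
probs = torch.rand(4, 32000, device="npu").softmax(dim=-1)  # stand-in probs
samples = probs.div_(q).argmax(dim=-1)
```

Recording the event on the side stream and synchronizing on it right before `probs.div_(self.q)` is what keeps the overlap safe; skipping either step can let the sampler read a half-filled `q`.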