From 08441baedd5aacd0445d5b8cc3220d038bec90fb Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Wed, 10 Dec 2025 23:50:18 +0800 Subject: [PATCH] Remove VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION (#4860) VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION has been enabled by default for a long time. Let's remove it now. - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 Signed-off-by: wangxiyuan --- vllm_ascend/envs.py | 5 ----- vllm_ascend/worker/model_runner_v1.py | 11 ++--------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index d9376acc..8191c82d 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -92,11 +92,6 @@ env_variables: Dict[str, Callable[[], Any]] = { "VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE": lambda: int( os.getenv("VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE", 64)), - # Whether to enable the topk optimization. It's enabled by default. Please set to False if you hit any issue. - # We'll remove this flag in the future once it's stable enough. - "VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION": - lambda: bool( - int(os.getenv("VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION", '1'))), # Whether to enable mla_pa for deepseek mla decode, this flag will be removed after its available torch_npu is public accessible # and the mla_pa will be the default path of deepseek decode path. 
"VLLM_ASCEND_MLA_PA": diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 7cc5a417..6d79d340 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -141,6 +141,7 @@ from vllm_ascend.patch.worker.patch_module import patch_torch_npu_argsort from vllm_ascend.platform import NPUPlatform from vllm_ascend.sample.logits_processor import build_logitsprocs from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler +from vllm_ascend.sample.sampler import AscendSampler from vllm_ascend.spec_decode import get_spec_decode_method from vllm_ascend.spec_decode.eagle_proposer import EagleProposer from vllm_ascend.spec_decode.interface import SpecDcodeType @@ -312,15 +313,7 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin): else: self.prefetch_stream = None self.dtype = self.model_config.dtype - if envs_ascend.VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION: - # TODO: drop the env config to use ascend sampler by default - from vllm_ascend.sample.sampler import AscendSampler - - self.sampler = AscendSampler() - else: - from vllm.v1.sample.sampler import Sampler - - self.sampler = Sampler() + self.sampler = AscendSampler() self.reorder_batch_threshold: Optional[int] = None # Lazy initialization, these will be set after __init__