diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
index d9376acc..8191c82d 100644
--- a/vllm_ascend/envs.py
+++ b/vllm_ascend/envs.py
@@ -92,11 +92,6 @@ env_variables: Dict[str, Callable[[], Any]] = {
     "VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE":
     lambda: int(
         os.getenv("VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE", 64)),
-    # Whether to enable the topk optimization. It's enabled by default. Please set to False if you hit any issue.
-    # We'll remove this flag in the future once it's stable enough.
-    "VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION":
-    lambda: bool(
-        int(os.getenv("VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION", '1'))),
     # Whether to enable mla_pa for deepseek mla decode, this flag will be removed after its available torch_npu is public accessible
     # and the mla_pa will be the default path of deepseek decode path.
     "VLLM_ASCEND_MLA_PA":
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 7cc5a417..6d79d340 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -141,6 +141,7 @@ from vllm_ascend.patch.worker.patch_module import patch_torch_npu_argsort
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.sample.logits_processor import build_logitsprocs
 from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
+from vllm_ascend.sample.sampler import AscendSampler
 from vllm_ascend.spec_decode import get_spec_decode_method
 from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
 from vllm_ascend.spec_decode.interface import SpecDcodeType
@@ -312,15 +313,7 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
         else:
             self.prefetch_stream = None
         self.dtype = self.model_config.dtype
-        if envs_ascend.VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION:
-            # TODO: drop the env config to use ascend sampler by default
-            from vllm_ascend.sample.sampler import AscendSampler
-
-            self.sampler = AscendSampler()
-        else:
-            from vllm.v1.sample.sampler import Sampler
-
-            self.sampler = Sampler()
+        self.sampler = AscendSampler()
         self.reorder_batch_threshold: Optional[int] = None
 
         # Lazy initialization, these will be set after __init__