[Refactor] Import global var from vllm instead of overwriting it (#5469)

### What this PR does / why we need it?
Import global var from vllm instead of overwriting it, so that we could
use the correct global variable value

- vLLM version: v0.13.0
- vLLM main:
5326c89803
---------
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
Mengqing Cao
2026-01-07 18:41:45 +08:00
committed by GitHub
parent 380f089fbf
commit 3f4f2b4ae6
10 changed files with 7 additions and 157 deletions

View File

@@ -4,7 +4,8 @@ from typing import Optional
import torch
from vllm.triton_utils import HAS_TRITON, triton
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.rejection_sampler import (GREEDY_TEMPERATURE,
from vllm.v1.sample.rejection_sampler import (GREEDY_TEMPERATURE, MAX_SPEC_LEN,
PLACEHOLDER_TOKEN_ID,
generate_uniform_probs)
from vllm_ascend.ops.triton.reject_sample import (
@@ -13,11 +14,6 @@ from vllm_ascend.ops.triton.reject_sample import (
sample_recovered_tokens_kernel)
from vllm_ascend.sample.sampler import apply_top_k_top_p
PLACEHOLDER_TOKEN_ID = -1
# Maximum number of speculative draft tokens allowed per request in a single
# step. This value is chosen to be large enough to handle typical use cases.
MAX_SPEC_LEN = 32
def apply_sampling_constraints(
logits: torch.Tensor, # [num_tokens, vocab_size]