[Refactor] Import global var form vllm instead of overwirte it (#5469)
### What this PR does / why we need it?
Import global var form vllm instead of overwirte it, so that we could
use the correct global variant value
- vLLM version: v0.13.0
- vLLM main:
5326c89803
---------
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
@@ -21,6 +21,7 @@ from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.sample.metadata import SamplingMetadata
|
||||
from vllm.v1.spec_decode.eagle import PADDING_SLOT_ID
|
||||
from vllm.v1.spec_decode.eagle import EagleProposer as VllmEagleProposer
|
||||
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
||||
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
|
||||
@@ -40,8 +41,6 @@ from vllm_ascend.ops.triton.spec_decode.utils import \
|
||||
from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
|
||||
from vllm_ascend.utils import shared_expert_dp_enabled
|
||||
|
||||
PADDING_SLOT_ID = -1
|
||||
|
||||
# Currently we will fix block size to a small one since `num_reqs` can't be too large
|
||||
_PREPARE_INPUTS_BLOCK_SIZE = 4
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
|
||||
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.sample.metadata import SamplingMetadata
|
||||
from vllm.v1.spec_decode.eagle import PADDING_SLOT_ID
|
||||
|
||||
from vllm_ascend.ascend_forward_context import set_ascend_forward_context
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
@@ -18,8 +19,6 @@ from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla
|
||||
from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
|
||||
from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable
|
||||
|
||||
PADDING_SLOT_ID = -1
|
||||
|
||||
|
||||
class MtpProposer(EagleProposer):
|
||||
|
||||
|
||||
Reference in New Issue
Block a user