[long_seq] remove long_seq env (#4660)
### What this PR does / why we need it? remove env VLLM_ASCEND_ENABLE_CONTEXT_PARALLEL - vLLM version: v0.12.0 --------- Signed-off-by: LookAround <lixushi@huawei.com> Signed-off-by: ZhangMingWei716 <2894054457@qq.com> Co-authored-by: ZhangMingWei716 <2894054457@qq.com> Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -52,7 +52,8 @@ from vllm.distributed.kv_transfer import (get_kv_transfer_group,
|
||||
has_kv_transfer_group)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
|
||||
from vllm.distributed.parallel_state import (get_dcp_group, get_dp_group,
|
||||
get_pp_group, get_tp_group,
|
||||
get_pcp_group, get_pp_group,
|
||||
get_tp_group,
|
||||
is_global_first_rank)
|
||||
from vllm.forward_context import get_forward_context
|
||||
from vllm.logger import logger
|
||||
@@ -145,16 +146,9 @@ from vllm_ascend.torchair.torchair_mtp_proposer import TorchairMtpProposer
|
||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
|
||||
AscendDeviceType, ProfileExecuteDuration,
|
||||
enable_sp, get_ascend_device_type, is_enable_nz,
|
||||
is_moe_model, lmhead_tp_enable,
|
||||
prefill_context_parallel_enable)
|
||||
is_moe_model, lmhead_tp_enable)
|
||||
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
|
||||
|
||||
if prefill_context_parallel_enable():
|
||||
from vllm.distributed import get_pcp_group
|
||||
from vllm.distributed.parallel_state import (
|
||||
get_prefill_context_model_parallel_rank,
|
||||
get_prefill_context_model_parallel_world_size)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import xgrammar as xgr # type: ignore[import-untyped]
|
||||
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
|
||||
@@ -290,10 +284,9 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
||||
self.dp_rank = vllm_config.parallel_config.data_parallel_rank
|
||||
self.dcp_size = get_dcp_group().world_size
|
||||
self.dcp_rank = get_dcp_group().rank_in_group
|
||||
self.pcp_size = get_prefill_context_model_parallel_world_size(
|
||||
) if prefill_context_parallel_enable() else 1
|
||||
self.pcp_rank = get_prefill_context_model_parallel_rank(
|
||||
) if self.pcp_size > 1 else 0
|
||||
self.pcp_size = get_pcp_group().world_size
|
||||
self.pcp_rank = get_pcp_group(
|
||||
).rank_in_group if self.pcp_size > 1 else 0
|
||||
decode_max_num_seqs = getattr(self.scheduler_config,
|
||||
'decode_max_num_seqs', 0)
|
||||
self.max_num_reqs = max(self.scheduler_config.max_num_seqs,
|
||||
@@ -602,8 +595,7 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
||||
if self.vllm_config.speculative_config else 0),
|
||||
kernel_block_sizes=[[self.vllm_config.cache_config.block_size]],
|
||||
cp_kv_cache_interleave_size=self.parallel_config.
|
||||
cp_kv_cache_interleave_size
|
||||
if prefill_context_parallel_enable() else 1,
|
||||
cp_kv_cache_interleave_size,
|
||||
)
|
||||
self.num_accepted_tokens = self._make_buffer(self.max_num_reqs,
|
||||
dtype=torch.int64)
|
||||
@@ -2742,8 +2734,7 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
|
||||
device=self.device)
|
||||
long_seq_metadata = self._generate_pcp_metadata(num_tokens)
|
||||
if long_seq_metadata is not None:
|
||||
pcp_world_size = get_pcp_group(
|
||||
).world_size if prefill_context_parallel_enable() else 1
|
||||
pcp_world_size = get_pcp_group().world_size
|
||||
dcp_world_size = get_dcp_group().world_size
|
||||
num_computed_tokens_of_pcp_dcp = [[
|
||||
[0] * dcp_world_size for _ in range(pcp_world_size)
|
||||
|
||||
Reference in New Issue
Block a user