[Model] GLM5 adaptation (#6642)
### What this PR does / why we need it?
GLM5 adaptation
1. use torch_npu.npu_lightning_indexer for GLM5
2. skip the EagleProposer code path in MtpProposer when fullgraph mode is enabled, as a temporary workaround for known bugs
3. add quantization config for GLM5
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
Tested by CI.
- vLLM main:
978a37c823
---------
Signed-off-by: yydyzr <liuyuncong1@huawei.com>
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
Co-authored-by: shenchuxiaofugui <1311027364@qq.com>
This commit is contained in:
@@ -36,7 +36,14 @@ class MtpProposer(EagleProposer):
|
||||
dummy_compute_logits=lambda hidden_states: None,
|
||||
is_profile=False,
|
||||
) -> None:
|
||||
if self.pcp_size * self.dcp_size == 1 and not self.speculative_config.disable_padded_drafter_batch:
|
||||
# Currently, both GLM and DS encounter issues when enabling the fullgraph mode and running on EagleProposer.
|
||||
# Therefore, we temporarily bypass this problem by adding a conditional check for fullgraph.
|
||||
# TODO: this conditional check should be removed after bug fixing.
|
||||
if (
|
||||
self.pcp_size * self.dcp_size == 1
|
||||
and not self.speculative_config.disable_padded_drafter_batch
|
||||
and not self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs()
|
||||
):
|
||||
super().dummy_run(
|
||||
num_tokens,
|
||||
with_prefill,
|
||||
@@ -166,7 +173,14 @@ class MtpProposer(EagleProposer):
|
||||
scheduler_output: SchedulerOutput = None,
|
||||
num_scheduled_tokens: int = 0,
|
||||
) -> torch.Tensor:
|
||||
if self.pcp_size * self.dcp_size == 1 and not self.speculative_config.disable_padded_drafter_batch:
|
||||
# Currently, both GLM and DS encounter issues when enabling the fullgraph mode and running on EagleProposer.
|
||||
# Therefore, we temporarily bypass this problem by adding a conditional check for fullgraph.
|
||||
# TODO: this conditional check should be removed after bug fixing.
|
||||
if (
|
||||
self.pcp_size * self.dcp_size == 1
|
||||
and not self.speculative_config.disable_padded_drafter_batch
|
||||
and not self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs()
|
||||
):
|
||||
draft_token_ids = super()._propose(
|
||||
target_token_ids,
|
||||
target_positions,
|
||||
|
||||
Reference in New Issue
Block a user