From 386817b4d1c0781abcc5ab5370da3b444882a74d Mon Sep 17 00:00:00 2001 From: whx <56632993+whx-sjtu@users.noreply.github.com> Date: Fri, 21 Feb 2025 22:43:11 +0800 Subject: [PATCH] [Model Runner][Performance] Cache the judgement result of is_encoder_decoder to decrease framework overhead (#138) In Model Runner, is_encoder_decoder is extracted from model_config to determine whether vllm is running for enc-dec models. Obtaining this status requires a long call stack, and the CPU overhead is high. So this PR caches this status in __init__ of ModelInputForNPUBuilder. Signed-off-by: hw_whx Co-authored-by: hw_whx --- vllm_ascend/model_runner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/model_runner.py b/vllm_ascend/model_runner.py index b43d2d1..2bb057f 100644 --- a/vllm_ascend/model_runner.py +++ b/vllm_ascend/model_runner.py @@ -353,6 +353,7 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]): self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper self.finished_requests_ids = finished_requests_ids self.decode_only = True + self.is_encoder_decoder = self.runner.model_config.is_encoder_decoder # Attention metadata inputs. self.attn_metadata_builder = self.attn_backend.make_metadata_builder( @@ -423,7 +424,7 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]): encoder_seq_len = 0 - if self.runner.model_config.is_encoder_decoder: + if self.is_encoder_decoder: encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len() inter_data = self.init_cached_inter_data( @@ -560,7 +561,7 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]): context_len = seq_data.get_num_computed_tokens() seq_len = min(seq_len, context_len + token_chunk_size) elif self.runner.scheduler_config.is_multi_step or \ - self.runner.model_config.is_encoder_decoder: + self.is_encoder_decoder: context_len = seq_len - 1 else: context_len = seq_data.get_num_computed_tokens()