[Model Runner][Performance] Cache the judgment result of is_encoder_decoder to decrease framework overhead (#138)
In Model Runner, is_encoder_decoder is extracted from model_config to determine whether vLLM is running for enc-dec models. Obtaining this status requires a long call stack, and the CPU overhead is high. So this PR caches this status in __init__ of ModelInputForNPUBuilder. Signed-off-by: hw_whx <wanghexiang7@huawei.com> Co-authored-by: hw_whx <wanghexiang7@huawei.com>
This commit is contained in:
@@ -353,6 +353,7 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
|
|||||||
self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper
|
self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper
|
||||||
self.finished_requests_ids = finished_requests_ids
|
self.finished_requests_ids = finished_requests_ids
|
||||||
self.decode_only = True
|
self.decode_only = True
|
||||||
|
self.is_encoder_decoder = self.runner.model_config.is_encoder_decoder
|
||||||
|
|
||||||
# Attention metadata inputs.
|
# Attention metadata inputs.
|
||||||
self.attn_metadata_builder = self.attn_backend.make_metadata_builder(
|
self.attn_metadata_builder = self.attn_backend.make_metadata_builder(
|
||||||
@@ -423,7 +424,7 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
|
|||||||
|
|
||||||
encoder_seq_len = 0
|
encoder_seq_len = 0
|
||||||
|
|
||||||
if self.runner.model_config.is_encoder_decoder:
|
if self.is_encoder_decoder:
|
||||||
encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len()
|
encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len()
|
||||||
|
|
||||||
inter_data = self.init_cached_inter_data(
|
inter_data = self.init_cached_inter_data(
|
||||||
@@ -560,7 +561,7 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
|
|||||||
context_len = seq_data.get_num_computed_tokens()
|
context_len = seq_data.get_num_computed_tokens()
|
||||||
seq_len = min(seq_len, context_len + token_chunk_size)
|
seq_len = min(seq_len, context_len + token_chunk_size)
|
||||||
elif self.runner.scheduler_config.is_multi_step or \
|
elif self.runner.scheduler_config.is_multi_step or \
|
||||||
self.runner.model_config.is_encoder_decoder:
|
self.is_encoder_decoder:
|
||||||
context_len = seq_len - 1
|
context_len = seq_len - 1
|
||||||
else:
|
else:
|
||||||
context_len = seq_data.get_num_computed_tokens()
|
context_len = seq_data.get_num_computed_tokens()
|
||||||
|
|||||||
Reference in New Issue
Block a user