[CI][UT] Enable multimodal UT (#158)
Enable multimodal UT.

---------

Signed-off-by: zouyida <zouyida@huawei.com>
This commit is contained in:
@@ -695,6 +695,7 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
|
||||
assert image_grid_thw is not None or video_grid_thw is not None, (
|
||||
"mrope embedding type requires multi-modal input mapper "
|
||||
"returns 'image_grid_thw' or 'video_grid_thw'.")
|
||||
second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None)
|
||||
|
||||
hf_config = self.runner.model_config.hf_config
|
||||
|
||||
@@ -707,14 +708,10 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
|
||||
mrope_input_positions, mrope_position_delta = \
|
||||
MRotaryEmbedding.get_input_positions(
|
||||
token_ids,
|
||||
hf_config,
|
||||
image_grid_thw=image_grid_thw,
|
||||
video_grid_thw=video_grid_thw,
|
||||
image_token_id=hf_config.image_token_id,
|
||||
video_token_id=hf_config.video_token_id,
|
||||
vision_start_token_id=hf_config.vision_start_token_id,
|
||||
vision_end_token_id=hf_config.vision_end_token_id,
|
||||
spatial_merge_size=hf_config.vision_config.
|
||||
spatial_merge_size,
|
||||
second_per_grid_ts=second_per_grid_ts,
|
||||
context_len=inter_data.context_lens[seq_idx],
|
||||
seq_len=inter_data.seq_lens[seq_idx],
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user