[CI][UT] Enable multimodal UT (#158)

Enable multimodal UT.

---------

Signed-off-by: zouyida <zouyida@huawei.com>
2025-02-27 14:14:43 +08:00
parent 94483775e1
commit 94cd66bba7
2 changed files with 6 additions and 7 deletions
--- a/vllm_ascend/model_runner.py
+++ b/vllm_ascend/model_runner.py
@@ -695,6 +695,7 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
            assert image_grid_thw is not None or video_grid_thw is not None, (
                "mrope embedding type requires multi-modal input mapper "
                "returns 'image_grid_thw' or 'video_grid_thw'.")
+            second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None)

            hf_config = self.runner.model_config.hf_config

@@ -707,14 +708,10 @@ class ModelInputForNPUBuilder(ModelRunnerInputBuilderBase[ModelInputForNPU]):
                mrope_input_positions, mrope_position_delta = \
                    MRotaryEmbedding.get_input_positions(
                        token_ids,
+                        hf_config,
                        image_grid_thw=image_grid_thw,
                        video_grid_thw=video_grid_thw,
-                        image_token_id=hf_config.image_token_id,
-                        video_token_id=hf_config.video_token_id,
-                        vision_start_token_id=hf_config.vision_start_token_id,
-                        vision_end_token_id=hf_config.vision_end_token_id,
-                        spatial_merge_size=hf_config.vision_config.
-                        spatial_merge_size,
+                        second_per_grid_ts=second_per_grid_ts,
                        context_len=inter_data.context_lens[seq_idx],
                        seq_len=inter_data.seq_lens[seq_idx],
                    )