Fix some CI issues and refactor the model runner (#2445)

### What this PR does / why we need it?
Fix some CI issues and refactor the model runner.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with the existing tests.

- vLLM version: v0.10.0
- vLLM main: 4d9c61993a

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
Co-authored-by: weiguihua2 <weiguihua2@huawei.com>
2025-08-20 09:01:04 +08:00
parent 955411611c
commit 1327f9be1c
28 changed files with 1612 additions and 1020 deletions
--- a/vllm_ascend/torchair/torchair_model_runner.py
+++ b/vllm_ascend/torchair/torchair_model_runner.py
@@ -26,7 +26,8 @@ from vllm.forward_context import get_forward_context
 from vllm.logger import logger

 from vllm_ascend.platform import NPUPlatform
-from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
+from vllm_ascend.torchair.utils import (TorchairCommonAttentionMetadata,
+                                        check_torchair_cache_exist,
                                        register_torchair_model,
                                        write_kv_cache_bytes_to_file)
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
@@ -71,8 +72,16 @@ class NPUTorchairModelRunner(NPUModelRunner):
        # NOTE: If torchair graph mode and not with_prefill,
        # we can't skip_attn, it will cause graph recompile.
        if not with_prefill:
+            common_attn_metadata = TorchairCommonAttentionMetadata(
+                num_reqs=num_reqs,
+                num_actual_tokens=1,
+                actual_seq_lengths_q=self.actual_seq_lengths_q,
+                attn_mask=self.attn_mask,
+                spec_attn_mask=self.spec_attn_mask,
+                decode_token_per_req=self.decode_token_per_req,
+            )
            attn_metadata = self.attn_metadata_builder.build_torchair_graph_dummy(
-                num_reqs=num_reqs, num_actual_tokens=1)
+                common_attn_metadata)
        else:
            attn_metadata = super()._build_attention_metadata(
                with_prefill, num_reqs, skip_attn)