[Refactor] 4/N Distinguish the branches based on the applicable scenarios of PA and FIA Ops. (#5081)

RFC: https://github.com/vllm-project/vllm-ascend/issues/4629

Reason: We distinguish the branches based on the applicable scenarios of pagedAttention (PA) and fusedInferAttention (FIA), which makes the code clearer. It also makes the subsequent iterations more convenient: adding sliding_window and sinks, and removing the PA ops once FIA is ready.

Todo:
- remove PA ops after FIA is ready
- add sliding window and ops for gpt_oss
- replace FIA with FIA_v2

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
2025-12-17 23:14:02 +08:00
parent 7671ce1bf1
commit 98e6e57622
3 changed files with 117 additions and 154 deletions
--- a/vllm_ascend/attention/utils.py
+++ b/vllm_ascend/attention/utils.py
@@ -10,7 +10,8 @@ from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                          is_v1_kv_transfer_group)
 from vllm.forward_context import ForwardContext, get_forward_context

-from vllm_ascend.utils import get_ascend_config
+from vllm_ascend.utils import (AscendDeviceType, get_ascend_config,
+                               get_ascend_device_type)


@lru_cache
@@ -18,8 +19,11 @@ def using_paged_attention(runtime_shape: int) -> bool:
    vllm_config = get_current_vllm_config()
    if vllm_config.speculative_config is not None:
        return False
+    if get_ascend_device_type() == AscendDeviceType.A5:
+        return False
    from vllm.config.compilation import CUDAGraphMode
-    if vllm_config.compilation_config.cudagraph_mode != CUDAGraphMode.FULL_DECODE_ONLY:
+    cudagraph_mode = vllm_config.compilation_config.cudagraph_mode
+    if cudagraph_mode != CUDAGraphMode.FULL_DECODE_ONLY:
        return False

    return runtime_shape in get_ascend_config().pa_shape_list