[Refactor] 4/N Distinguish the branches based on the applicable scenarios of PA and FIA Ops. (#5081)
RFC: https://github.com/vllm-project/vllm-ascend/issues/4629
Reason:
We distinguish the branches based on the applicable scenarios of
pagedAttention and fusedInferAttention, making the code more clear.
At the same time, it is convenient for the subsequent iterations of
sliding_window and sinks, and removing PA ops after FIA is ready.
Todo:
remove PA ops after FIA is ready
add sliding_window support and ops for gpt_oss
replace FIA with FIA_v2
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
This commit is contained in:
@@ -10,7 +10,8 @@ from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                           is_v1_kv_transfer_group)
 from vllm.forward_context import ForwardContext, get_forward_context

-from vllm_ascend.utils import get_ascend_config
+from vllm_ascend.utils import (AscendDeviceType, get_ascend_config,
+                               get_ascend_device_type)


 @lru_cache
@@ -18,8 +19,11 @@ def using_paged_attention(runtime_shape: int) -> bool:
     vllm_config = get_current_vllm_config()
     if vllm_config.speculative_config is not None:
         return False
+    if get_ascend_device_type() == AscendDeviceType.A5:
+        return False
     from vllm.config.compilation import CUDAGraphMode
-    if vllm_config.compilation_config.cudagraph_mode != CUDAGraphMode.FULL_DECODE_ONLY:
+    cudagraph_mode = vllm_config.compilation_config.cudagraph_mode
+    if cudagraph_mode != CUDAGraphMode.FULL_DECODE_ONLY:
         return False

     return runtime_shape in get_ascend_config().pa_shape_list
Reference in New Issue
Block a user