fix pagedattention to support fullgraph. (#3436)

### What this PR does / why we need it?
Calculate in advance the workspace memory size needed for the
PagedAttention operator to avoid deadlocks during resource cleanup. This
PR requires torch_npu version 0920 or newer.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
This commit is contained in:
XiaoxinWang
2025-10-14 16:10:09 +08:00
committed by GitHub
parent 22a1d91cf5
commit 9eb62935b8
5 changed files with 271 additions and 21 deletions

View File

@@ -1,7 +1,9 @@
import functools
from dataclasses import dataclass
from typing import Any, List
import torch
import torch_npu
from vllm.distributed.kv_transfer import (get_kv_transfer_group,
has_kv_transfer_group,
is_v1_kv_transfer_group)
@@ -139,3 +141,17 @@ def maybe_save_kv_layer_to_connector(
return
# TODO: assert ascendMetadata
connector.save_kv_layer(layer_name, kv_cache_layer, attn_metadata)
@functools.cache
def version_check():
import re
torch_npu_version = torch_npu.version.__version__
date_pattern = r'dev(\d{8})'
match = re.search(date_pattern, torch_npu_version)
if match:
full_date = match.group(1)
if full_date >= "20250919":
return True
return False