[v0.11.0][Fix] Prevent memory leak in MLA decode graph (#3743) (#3774)

### What this PR does / why we need it?
The cache for MLA decode graph parameters held strong references to
tensors, preventing them from being garbage collected and driving up
memory usage over time.

This change wraps the cached tensors in weak references, allowing them
to be deallocated when no longer in use and reducing overall memory
pressure.
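
For illustration, here is a minimal sketch of the weak-reference caching pattern the description refers to. The names (`ParamCache`, `FakeTensor`) are hypothetical and do not appear in the patch; real `torch.Tensor` objects also support weak references.

```python
import weakref
from typing import Dict, Optional


class FakeTensor:
    """Stand-in for a torch.Tensor in this sketch."""


class ParamCache:
    """Shape-keyed cache holding weak references, so a cached tensor
    stays collectable once the last strong reference elsewhere is gone."""

    def __init__(self) -> None:
        self._refs: Dict[int, weakref.ref] = {}

    def put(self, shape: int, tensor: FakeTensor) -> None:
        # weakref.ref does not, by itself, keep `tensor` alive.
        self._refs[shape] = weakref.ref(tensor)

    def get(self, shape: int) -> Optional[FakeTensor]:
        ref = self._refs.get(shape)
        # Dereferencing yields None once the tensor has been collected.
        return ref() if ref is not None else None


cache = ParamCache()
t = FakeTensor()
cache.put(16, t)
assert cache.get(16) is t
del t  # drop the last strong reference
assert cache.get(16) is None  # the cache no longer pins the tensor
```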

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
None.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>

@@ -212,7 +212,6 @@ def update_attn_params(update_stream, forward_context, runtime_shape):
             seq_lens,
             output,
         ) = param
-        # block_table = forward_context.attn_metadata[key].block_tables
         seq_lens = forward_context.attn_metadata[key].seq_lens
         torch_npu_check = version_check()
@@ -258,8 +257,7 @@ def update_mla_attn_params(update_stream, forward_context, runtime_shape,
 ):
     (q_nope, k_nope, q_pe, k_pe, num_heads, num_kv_heads, input_layout,
      spec_attn_mask, sparse_mode, scale, block_table, block_size,
-     seq_lens_list, actual_seq_lengths, workspace, attn_output,
-     softmax_lse) = param
+     seq_lens_list, actual_seq_lengths, attn_output, softmax_lse) = param
     seq_lens_list = forward_context.attn_metadata[key].decode.seq_lens_list
     if speculative_config and speculative_config.method == "deepseek_mtp":
         actual_seq_lengths = forward_context.attn_metadata[
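
This hunk is the core of the fix: `workspace` is dropped from the per-shape cached parameter tuple, so the tuple no longer keeps a strong reference to the workspace tensor. The call site in the next hunk fetches it from `graph_params.workspaces` at update time instead.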
@@ -295,7 +293,7 @@ def update_mla_attn_params(update_stream, forward_context, runtime_shape,
                 block_size=block_size,
                 actual_seq_lengths_kv=seq_lens_list,
                 actual_seq_lengths=actual_seq_lengths,
-                workspace=workspace,
+                workspace=graph_params.workspaces.get(runtime_shape),
                 out=[attn_output, softmax_lse])
     torch.npu.graph_task_update_end(update_stream)
@@ -329,7 +327,7 @@ def set_graph_params(aclgraph_capture_sizes: set[int]):
     )
-def update_graph_params_workspaces(num_tokens: int, workspace: int):
+def update_graph_params_workspaces(num_tokens: int, workspace: Any):
     global _graph_params
     if _graph_params is not None:
         _graph_params.workspaces[num_tokens] = workspace
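
A minimal sketch of the per-shape workspace registry this hunk updates, assuming a simplified `GraphParams` container (the real structure in the repository may carry more fields):

```python
from dataclasses import dataclass, field
from typing import Any, Dict, Optional


@dataclass
class GraphParams:
    # One workspace object per captured runtime shape (token count).
    workspaces: Dict[int, Any] = field(default_factory=dict)


_graph_params: Optional[GraphParams] = None


def update_graph_params_workspaces(num_tokens: int, workspace: Any) -> None:
    # Mirrors the patched signature: the workspace is stored as-is,
    # whatever object the NPU op hands back, not just an int.
    global _graph_params
    if _graph_params is not None:
        _graph_params.workspaces[num_tokens] = workspace


# Capture time: register the workspace for a given shape.
_graph_params = GraphParams()
update_graph_params_workspaces(256, object())

# Update/replay time: look the workspace up by runtime shape instead of
# keeping it inside each cached parameter tuple.
workspace = _graph_params.workspaces.get(256)
assert workspace is not None
```

Widening the annotation from `int` to `Any` matches this single-source-of-truth design: the registry stores whatever workspace object the op returns, and every consumer resolves it by `runtime_shape` at call time.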