From 4e720936d8ffd30298053b1aa7e0618c047299e9 Mon Sep 17 00:00:00 2001
From: LeeWenquan <83354342+SunnyLee151064@users.noreply.github.com>
Date: Wed, 15 Oct 2025 11:30:30 +0800
Subject: [PATCH] Fix warning msg print (#3421)

### What this PR does / why we need it?
Avoid printing some warning msg as below :
UserWarning: To copy construct from a tensor, it is recommended to use
sourceTensor.clone().detach ...

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: SunnyLee219 <3294305115@qq.com>
---
 vllm_ascend/attention/mla_v1.py      | 8 +++-----
 vllm_ascend/torchair/torchair_mla.py | 8 +++-----
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index a662265..c8379b7 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -79,7 +79,7 @@ class AscendMLAPrefillMetadata:
         chunk_seq_lens: torch.Tensor
 
     attn_mask: torch.Tensor
-    query_lens: list[int]
+    query_lens: torch.Tensor
     seq_lens: list[int]
     context_lens: torch.Tensor
     input_positions: torch.Tensor
@@ -380,7 +380,7 @@ class AscendMLAMetadataBuilder:
                     1).unsqueeze(2)
             prefill_metadata = AscendMLAPrefillMetadata(
                 attn_mask=common_attn_metadata.attn_mask,
-                query_lens=query_lens[reqs_start:],
+                query_lens=query_lens[reqs_start:].to(torch.int32),
                 seq_lens=seq_lens,
                 context_lens=seq_lens[reqs_start:],
                 input_positions=prefill_input_positions,
@@ -837,9 +837,7 @@ class AscendMLAImpl(MLAAttentionImpl):
                                    k_rope=k_pe,
                                    value=value,
                                    mask=self.prefill_mask,
-                                   seqlen=torch.tensor(
-                                       attn_metadata.prefill.query_lens,
-                                       dtype=torch.int32),
+                                   seqlen=attn_metadata.prefill.query_lens,
                                    head_num=self.num_heads,
                                    kv_head_num=self.num_heads,
                                    pre_out=None,
diff --git a/vllm_ascend/torchair/torchair_mla.py b/vllm_ascend/torchair/torchair_mla.py
index ed14fed..4269727 100644
--- a/vllm_ascend/torchair/torchair_mla.py
+++ b/vllm_ascend/torchair/torchair_mla.py
@@ -74,7 +74,7 @@ class AscendMLATorchairPrefillMetadata:
         chunk_seq_lens: torch.Tensor
 
     attn_mask: torch.Tensor
-    query_lens: list[int]
+    query_lens: torch.Tensor
     seq_lens: list[int]
     context_lens: torch.Tensor
     input_positions: torch.Tensor
@@ -473,7 +473,7 @@ class AscendMLATorchairMetadataBuilder:
                     1).unsqueeze(2)
             prefill_metadata = AscendMLATorchairPrefillMetadata(
                 attn_mask=common_attn_metadata.attn_mask,
-                query_lens=query_lens[tokens_start:],
+                query_lens=query_lens[tokens_start:].to(torch.int32),
                 seq_lens=seq_lens,
                 context_lens=seq_lens[tokens_start:],
                 input_positions=prefill_input_positions,
@@ -880,9 +880,7 @@ class AscendMLATorchairImpl(MLAAttentionImpl):
                                    k_rope=k_pe,
                                    value=value,
                                    mask=self.prefill_mask,
-                                   seqlen=torch.tensor(
-                                       attn_metadata.prefill.query_lens,
-                                       dtype=torch.int32),
+                                   seqlen=attn_metadata.prefill.query_lens,
                                    head_num=self.num_heads,
                                    kv_head_num=self.num_heads,
                                    pre_out=None,