[Fixbugs]: fix 310P chunked-prefill error caused by the model runner refactor (#6340)
Adapt to the model runner refactor changes so that 310P works.
- vLLM version: v0.14.1
- vLLM main: dc917cceb8
Signed-off-by: Tflowers-0129 <2906339855@qq.com>
This commit is contained in:
@@ -101,8 +101,7 @@ class AscendAttentionBackendImpl310(_BaseImpl):
|
|||||||
out=output,
|
out=output,
|
||||||
)
|
)
|
||||||
|
|
||||||
out_real = output[:real_tokens, :, :]
|
return output[:aligned_tokens, :, :]
|
||||||
return out_real
|
|
||||||
|
|
||||||
def _forward_chunked_prefill_310p(self, query, attn_metadata, output):
|
def _forward_chunked_prefill_310p(self, query, attn_metadata, output):
|
||||||
assert attn_metadata is not None
|
assert attn_metadata is not None
|
||||||
@@ -110,6 +109,10 @@ class AscendAttentionBackendImpl310(_BaseImpl):
|
|||||||
if query.dtype == torch.float32:
|
if query.dtype == torch.float32:
|
||||||
query = query.to(torch.float16)
|
query = query.to(torch.float16)
|
||||||
|
|
||||||
|
num_actual_tokens = int(attn_metadata.num_actual_tokens)
|
||||||
|
query = query[:num_actual_tokens]
|
||||||
|
output = output[:num_actual_tokens]
|
||||||
|
|
||||||
qsl_cpu = attn_metadata.query_start_loc.detach().to("cpu", dtype=torch.int32)
|
qsl_cpu = attn_metadata.query_start_loc.detach().to("cpu", dtype=torch.int32)
|
||||||
qlens = (qsl_cpu[1:] - qsl_cpu[:-1]).to(torch.int32)
|
qlens = (qsl_cpu[1:] - qsl_cpu[:-1]).to(torch.int32)
|
||||||
|
|
||||||
@@ -163,8 +166,7 @@ class AscendAttentionBackendImpl310(_BaseImpl):
|
|||||||
k = key[:num_tokens]
|
k = key[:num_tokens]
|
||||||
v = value[:num_tokens]
|
v = value[:num_tokens]
|
||||||
out = self._forward_prefill_310p_fallback(q, k, v, attn_metadata, output)
|
out = self._forward_prefill_310p_fallback(q, k, v, attn_metadata, output)
|
||||||
output[:num_tokens] = out
|
return out
|
||||||
return output
|
|
||||||
|
|
||||||
if state == AscendAttentionState.ChunkedPrefill:
|
if state == AscendAttentionState.ChunkedPrefill:
|
||||||
self._forward_chunked_prefill_310p(query, attn_metadata, output)
|
self._forward_chunked_prefill_310p(query, attn_metadata, output)
|
||||||
|
|||||||
Reference in New Issue
Block a user