init v0.11.0rc0

This commit is contained in:
2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions

View File

@@ -98,10 +98,12 @@ class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder):
def __init__(
self,
kv_cache_spec,
layer_names,
vllm_config: VllmConfig,
device: torch.device,
):
super().__init__(vllm_config, device)
super().__init__(kv_cache_spec, layer_names, vllm_config, device)
self.max_num_blocks_per_req = cdiv(
self.model_config.max_model_len,
self.vllm_config.cache_config.block_size)
@@ -171,8 +173,9 @@ class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder):
def build(
self,
common_prefix_len: int,
common_attn_metadata: AscendCommonAttentionMetadata,
model: nn.Module,
model: Optional[nn.Module] = None,
):
num_reqs = common_attn_metadata.num_reqs
num_actual_tokens = common_attn_metadata.num_actual_tokens
@@ -182,11 +185,7 @@ class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder):
block_table[:num_reqs])
seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs]
slot_mapping = common_attn_metadata.slot_mapping_cpu[:
num_actual_tokens].to(
self.device,
non_blocking=
True)
slot_mapping = common_attn_metadata.slot_mapping[:num_actual_tokens]
attn_mask = common_attn_metadata.attn_mask
attn_state = common_attn_metadata.attn_state
@@ -374,6 +373,9 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
indices = torch.cat((block_indices, slots_indices), dim=1)
torch_npu.npu_scatter_nd_update_(key_cache, indices, key)
torch_npu.npu_scatter_nd_update_(value_cache, indices, value)
if attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit:
self.key_cache = key_cache
self.value_cache = value_cache
if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
assert attn_metadata is not None
@@ -411,11 +413,13 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
assert attn_metadata is not None
assert attn_metadata.attn_mask is not None
compress_mask = attn_metadata.attn_mask
batch_size = attn_metadata.query_lens.shape[0]
block_table = attn_metadata.block_tables[:batch_size, :]
torch_npu._npu_flash_attention_qlens(
query=query,
key_cache=self.key_cache,
value_cache=self.value_cache,
block_table=attn_metadata.block_tables,
block_table=block_table,
mask=compress_mask,
seq_len=attn_metadata.query_lens,
context_lens=attn_metadata.seq_lens,
@@ -431,17 +435,24 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
block_size = key_cache.shape[1]
query = query.view(num_tokens, 1,
self.num_heads * self.head_size).contiguous()
output = torch_npu.npu_incre_flash_attention(
query,
key_cache,
value_cache,
num_key_value_heads=self.num_kv_heads,
output, _ = torch_npu.npu_fused_infer_attention_score(
query=query,
key=key_cache,
value=value_cache,
query_rope=None,
key_rope=None,
num_heads=self.num_heads,
actual_seq_lengths=seq_lens,
scale_value=self.scale,
block_table=block_table,
num_key_value_heads=self.num_kv_heads,
input_layout='BSH',
block_size=block_size)
atten_mask=decode_meta.attn_mask,
sparse_mode=0,
scale=self.scale,
antiquant_mode=0,
antiquant_scale=None,
block_table=block_table,
block_size=block_size,
actual_seq_lengths_kv=seq_lens,
)
else:
raise NotImplementedError(
"Torchair graph mode with non-MLA attention backend is still experimental."