clean 0.15.0 support (#6852)
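Clean up vLLM 0.15.0 related code: remove the `vllm_version_is("0.15.0")` compatibility branches and the imports that are no longer needed.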
- vLLM version: v0.16.0
- vLLM main: 15d76f74e2
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -65,7 +65,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
|
|||||||
|
|
||||||
| Date | Event |
|
| Date | Event |
|
||||||
|------------|-------------------------------------------|
|
|------------|-------------------------------------------|
|
||||||
| 2026.02.26 | Release candidates, v0.15.0rc1 |
|
| 2026.02.27 | Release candidates, v0.15.0rc1 |
|
||||||
| 2026.02.06 | v0.13.0 Final release, v0.13.0 |
|
| 2026.02.06 | v0.13.0 Final release, v0.13.0 |
|
||||||
| 2026.01.26 | Release candidates, v0.14.0rc1 |
|
| 2026.01.26 | Release candidates, v0.14.0rc1 |
|
||||||
| 2026.01.24 | Release candidates, v0.13.0rc2 |
|
| 2026.01.24 | Release candidates, v0.13.0rc2 |
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# Release Notes
|
# Release Notes
|
||||||
|
|
||||||
## v0.15.0rc1 - 2026.02.26
|
## v0.15.0rc1 - 2026.02.27
|
||||||
|
|
||||||
This is the first release candidate of v0.15.0 for vLLM Ascend. Please follow the [official doc](https://docs.vllm.ai/projects/ascend/en/latest) to get started.
|
This is the first release candidate of v0.15.0 for vLLM Ascend. Please follow the [official doc](https://docs.vllm.ai/projects/ascend/en/latest) to get started.
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ if 'torch_npu._inductor' not in sys.modules:
|
|||||||
from vllm_ascend.attention.sfa_v1 import (AscendSFABackend, AscendSFAImpl,
|
from vllm_ascend.attention.sfa_v1 import (AscendSFABackend, AscendSFAImpl,
|
||||||
AscendSFAMetadata,
|
AscendSFAMetadata,
|
||||||
AscendSFAMetadataBuilder)
|
AscendSFAMetadataBuilder)
|
||||||
from vllm_ascend.utils import enable_dsa_cp, vllm_version_is
|
from vllm_ascend.utils import enable_dsa_cp
|
||||||
|
|
||||||
|
|
||||||
class TestAscendSFABackend(TestBase):
|
class TestAscendSFABackend(TestBase):
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ from vllm_ascend.quantization.modelslim_config import (
|
|||||||
MODELSLIM_CONFIG_FILENAME,
|
MODELSLIM_CONFIG_FILENAME,
|
||||||
AscendModelSlimConfig,
|
AscendModelSlimConfig,
|
||||||
)
|
)
|
||||||
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is
|
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
|
||||||
|
|
||||||
from vllm.model_executor.layers.attention import Attention
|
from vllm.model_executor.layers.attention import Attention
|
||||||
|
|
||||||
|
|||||||
@@ -153,11 +153,10 @@ class AscendFusedMoE310(FusedMoE):
|
|||||||
self.quant_type = self.get_quant_type()
|
self.quant_type = self.get_quant_type()
|
||||||
|
|
||||||
_MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config)
|
_MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config)
|
||||||
|
if not vllm_version_is("0.16.0"):
|
||||||
if not vllm_version_is("0.15.0"):
|
|
||||||
self.runner = self._init_runner()
|
self.runner = self._init_runner()
|
||||||
|
|
||||||
if not vllm_version_is("0.15.0"):
|
if not vllm_version_is("0.16.0"):
|
||||||
|
|
||||||
def _init_runner(self):
|
def _init_runner(self):
|
||||||
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner
|
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner
|
||||||
|
|||||||
@@ -1,18 +1,14 @@
|
|||||||
import torch
|
import torch
|
||||||
import torch._inductor.pattern_matcher as pm
|
import torch._inductor.pattern_matcher as pm
|
||||||
from torch._inductor.pattern_matcher import PatternMatcherPass
|
from torch._inductor.pattern_matcher import PatternMatcherPass
|
||||||
|
from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
|
||||||
from vllm_ascend.utils import is_moe_model, vllm_version_is
|
|
||||||
|
|
||||||
if vllm_version_is("0.15.0"):
|
|
||||||
from vllm.compilation.vllm_inductor_pass import VllmInductorPass # type: ignore
|
|
||||||
else:
|
|
||||||
from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
|
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.config.utils import Range
|
from vllm.config.utils import Range
|
||||||
from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group, tensor_model_parallel_all_reduce
|
from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group, tensor_model_parallel_all_reduce
|
||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
|
|
||||||
|
from vllm_ascend.utils import is_moe_model
|
||||||
|
|
||||||
SP_THRESHOLD = 1000
|
SP_THRESHOLD = 1000
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -21,8 +21,6 @@ import torch.nn.functional as F
|
|||||||
import torch_npu
|
import torch_npu
|
||||||
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention # type: ignore
|
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention # type: ignore
|
||||||
|
|
||||||
from vllm_ascend.utils import vllm_version_is
|
|
||||||
|
|
||||||
MIN_PAD_SIZE: int = 64 # min_size to pad weight
|
MIN_PAD_SIZE: int = 64 # min_size to pad weight
|
||||||
MAX_PAD_SIZE: int = 128 # max_size to pad weight
|
MAX_PAD_SIZE: int = 128 # max_size to pad weight
|
||||||
|
|
||||||
@@ -64,9 +62,7 @@ class AscendMMEncoderAttention(MMEncoderAttention):
|
|||||||
prefix=prefix,
|
prefix=prefix,
|
||||||
)
|
)
|
||||||
|
|
||||||
if not vllm_version_is("0.15.0"):
|
|
||||||
self.layer_index = int("".join(filter(str.isdigit, prefix)))
|
self.layer_index = int("".join(filter(str.isdigit, prefix)))
|
||||||
|
|
||||||
self.enable_pad = self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE
|
self.enable_pad = self.head_size > MIN_PAD_SIZE and self.head_size < MAX_PAD_SIZE
|
||||||
self.scale_value = self.head_size**-0.5
|
self.scale_value = self.head_size**-0.5
|
||||||
|
|
||||||
@@ -106,19 +102,13 @@ class AscendMMEncoderAttention(MMEncoderAttention):
|
|||||||
kv_len = key.size(1)
|
kv_len = key.size(1)
|
||||||
is_reshaped = query.dim() == 4
|
is_reshaped = query.dim() == 4
|
||||||
|
|
||||||
if vllm_version_is("0.15.0"):
|
# Directly use seq_lens cpu cache to avoid d2h copy.
|
||||||
if cu_seqlens is None:
|
|
||||||
cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu")
|
|
||||||
seq_lens_cpu = torch.diff(cu_seqlens).to("cpu")
|
|
||||||
else:
|
|
||||||
global seq_lens_cpu_cache
|
global seq_lens_cpu_cache
|
||||||
if self.layer_index == 0:
|
if self.layer_index == 0:
|
||||||
if cu_seqlens is None:
|
if cu_seqlens is None:
|
||||||
cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu")
|
cu_seqlens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device="cpu")
|
||||||
# Update seq_lens cpu cache.
|
# Update seq_lens cpu cache.
|
||||||
seq_lens_cpu_cache = torch.diff(cu_seqlens).to("cpu")
|
seq_lens_cpu_cache = torch.diff(cu_seqlens).to("cpu")
|
||||||
# Directly use seq_lens cpu cache to avoid d2h copy.
|
|
||||||
seq_lens_cpu = seq_lens_cpu_cache
|
|
||||||
|
|
||||||
# q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim]
|
# q, k, v: [b, s, head, head_dim] -> [b * s, head, head_dim]
|
||||||
q, k, v = self._reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len)
|
q, k, v = self._reshape_qkv_to_3d(query, key, value, bsz, q_len, kv_len)
|
||||||
@@ -138,7 +128,7 @@ class AscendMMEncoderAttention(MMEncoderAttention):
|
|||||||
query=q,
|
query=q,
|
||||||
key=k,
|
key=k,
|
||||||
value=v,
|
value=v,
|
||||||
seq_len=seq_lens_cpu,
|
seq_len=seq_lens_cpu_cache,
|
||||||
scale_value=self.scale_value,
|
scale_value=self.scale_value,
|
||||||
num_heads=self.num_heads,
|
num_heads=self.num_heads,
|
||||||
num_kv_heads=self.num_kv_heads,
|
num_kv_heads=self.num_kv_heads,
|
||||||
|
|||||||
@@ -19,7 +19,6 @@ from vllm_ascend.ops.triton.reject_sample import (
|
|||||||
sample_recovered_tokens_kernel,
|
sample_recovered_tokens_kernel,
|
||||||
)
|
)
|
||||||
from vllm_ascend.sample.sampler import apply_top_k_top_p
|
from vllm_ascend.sample.sampler import apply_top_k_top_p
|
||||||
from vllm_ascend.utils import vllm_version_is
|
|
||||||
|
|
||||||
|
|
||||||
def apply_sampling_constraints(
|
def apply_sampling_constraints(
|
||||||
@@ -167,9 +166,6 @@ def rejection_sample(
|
|||||||
return output_token_ids
|
return output_token_ids
|
||||||
|
|
||||||
# Compute probability distribution from target logits.
|
# Compute probability distribution from target logits.
|
||||||
if vllm_version_is("0.15.0"):
|
|
||||||
target_probs = target_logits
|
|
||||||
else:
|
|
||||||
target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)
|
target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)
|
||||||
assert target_probs.is_contiguous()
|
assert target_probs.is_contiguous()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user