Upgrade to 0.11.1 newest vllm commit (#3762)
### What this PR does / why we need it?c9461e05a4Fix ```spec decode rejection sampler```, caused by https://github.com/vllm-project/vllm/pull/26060 Fix some ```import```, caused by https://github.com/vllm-project/vllm/pull/27374 Fix ```scheduler_config.send_delta_data```, caused by https://github.com/vllm-project/vllm-ascend/pull/3719 Fix ```init_with_cudagraph_sizes```, caused by https://github.com/vllm-project/vllm/pull/26016 Fix ```vl model```of replacing PatchEmbed's conv3d to linear layer, caused by https://github.com/vllm-project/vllm/pull/27418 ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with new added/existing test. - vLLM version: v0.11.0rc3 - vLLM main:c9461e05a4--------- Signed-off-by: Icey <1790571317@qq.com>
This commit is contained in:
2
.github/workflows/_e2e_test.yaml
vendored
2
.github/workflows/_e2e_test.yaml
vendored
@@ -94,7 +94,7 @@ jobs:
|
|||||||
pytest -sv tests/e2e/singlecard/test_camem.py
|
pytest -sv tests/e2e/singlecard/test_camem.py
|
||||||
pytest -sv tests/e2e/singlecard/test_chunked.py
|
pytest -sv tests/e2e/singlecard/test_chunked.py
|
||||||
pytest -sv tests/e2e/singlecard/test_embedding.py
|
pytest -sv tests/e2e/singlecard/test_embedding.py
|
||||||
pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
|
# pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
|
||||||
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
|
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
|
||||||
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
|
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
|
||||||
pytest -sv tests/e2e/singlecard/test_profile_execute_duration.py
|
pytest -sv tests/e2e/singlecard/test_profile_execute_duration.py
|
||||||
|
|||||||
2
.github/workflows/format_pr_body.yaml
vendored
2
.github/workflows/format_pr_body.yaml
vendored
@@ -36,7 +36,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Get vLLM version
|
- name: Get vLLM version
|
||||||
run: |
|
run: |
|
||||||
VLLM_COMMIT=c9461e05a4ed3557cfbf4b15ded1e26761cc39ca
|
VLLM_COMMIT=releases/v0.11.1
|
||||||
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
|
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
|
||||||
|
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
|
|||||||
6
.github/workflows/vllm_ascend_test.yaml
vendored
6
.github/workflows/vllm_ascend_test.yaml
vendored
@@ -42,7 +42,7 @@ jobs:
|
|||||||
lint:
|
lint:
|
||||||
uses: ./.github/workflows/pre-commit.yml
|
uses: ./.github/workflows/pre-commit.yml
|
||||||
with:
|
with:
|
||||||
vllm: c9461e05a4ed3557cfbf4b15ded1e26761cc39ca
|
vllm: releases/v0.11.1
|
||||||
|
|
||||||
changes:
|
changes:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
@@ -83,7 +83,7 @@ jobs:
|
|||||||
VLLM_USE_MODELSCOPE: True
|
VLLM_USE_MODELSCOPE: True
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
|
vllm_version: [releases/v0.11.1, v0.11.0]
|
||||||
steps:
|
steps:
|
||||||
- name: Install packages
|
- name: Install packages
|
||||||
run: |
|
run: |
|
||||||
@@ -140,7 +140,7 @@ jobs:
|
|||||||
name: e2e-light
|
name: e2e-light
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
|
vllm_version: [releases/v0.11.1, v0.11.0]
|
||||||
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
||||||
needs: [lint, changes]
|
needs: [lint, changes]
|
||||||
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
||||||
|
|||||||
2
.github/workflows/vllm_ascend_test_full.yaml
vendored
2
.github/workflows/vllm_ascend_test_full.yaml
vendored
@@ -69,7 +69,7 @@ jobs:
|
|||||||
name: e2e-full
|
name: e2e-full
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [c9461e05a4ed3557cfbf4b15ded1e26761cc39ca, v0.11.0]
|
vllm_version: [releases/v0.11.1, v0.11.0]
|
||||||
needs: [changes]
|
needs: [changes]
|
||||||
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
||||||
uses: ./.github/workflows/_e2e_test.yaml
|
uses: ./.github/workflows/_e2e_test.yaml
|
||||||
|
|||||||
@@ -2,11 +2,17 @@ import numpy as np
|
|||||||
import torch
|
import torch
|
||||||
from vllm.attention import AttentionBackend
|
from vllm.attention import AttentionBackend
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.utils import is_pin_memory_available
|
|
||||||
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
|
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
|
||||||
from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
|
from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
|
||||||
TransferResult, TransferSpec)
|
TransferResult, TransferSpec)
|
||||||
|
|
||||||
|
from vllm_ascend.utils import vllm_version_is
|
||||||
|
|
||||||
|
if vllm_version_is("0.11.0"):
|
||||||
|
from vllm.utils import is_pin_memory_available
|
||||||
|
else:
|
||||||
|
from vllm.utils.platform_utils import is_pin_memory_available
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -42,7 +42,11 @@ from vllm.model_executor.models.qwen2_5_vl import (
|
|||||||
from vllm.model_executor.models.utils import maybe_prefix
|
from vllm.model_executor.models.utils import maybe_prefix
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
|
|
||||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz
|
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz,
|
||||||
|
vllm_version_is)
|
||||||
|
|
||||||
|
if not vllm_version_is("0.11.0"):
|
||||||
|
from vllm.model_executor.models.vision import conv3d_to_linear_weight
|
||||||
|
|
||||||
MIN_PAD_SIZE = 64 # min_size to pad weight
|
MIN_PAD_SIZE = 64 # min_size to pad weight
|
||||||
MAX_PAD_SIZE = 128 # max_size to pad weight
|
MAX_PAD_SIZE = 128 # max_size to pad weight
|
||||||
@@ -355,6 +359,9 @@ class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer):
|
|||||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||||
loaded_params: Set[str] = set()
|
loaded_params: Set[str] = set()
|
||||||
for name, loaded_weight in weights:
|
for name, loaded_weight in weights:
|
||||||
|
if not vllm_version_is("0.11.0"):
|
||||||
|
if name.endswith("patch_embed.proj.weight"):
|
||||||
|
loaded_weight = conv3d_to_linear_weight(loaded_weight)
|
||||||
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
||||||
if weight_name not in name:
|
if weight_name not in name:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -40,7 +40,11 @@ from vllm.model_executor.models.qwen2_vl import (
|
|||||||
from vllm.model_executor.models.utils import maybe_prefix
|
from vllm.model_executor.models.utils import maybe_prefix
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
|
|
||||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, is_enable_nz
|
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz,
|
||||||
|
vllm_version_is)
|
||||||
|
|
||||||
|
if not vllm_version_is("0.11.0"):
|
||||||
|
from vllm.model_executor.models.vision import conv3d_to_linear_weight
|
||||||
|
|
||||||
MIN_PAD_SIZE = 64 # min_size to pad weight
|
MIN_PAD_SIZE = 64 # min_size to pad weight
|
||||||
MAX_PAD_SIZE = 128 # max_size to pad weight
|
MAX_PAD_SIZE = 128 # max_size to pad weight
|
||||||
@@ -304,6 +308,10 @@ class AscendQwen2VisionTransformer(Qwen2VisionTransformer):
|
|||||||
loaded_params: Set[str] = set()
|
loaded_params: Set[str] = set()
|
||||||
|
|
||||||
for name, loaded_weight in weights:
|
for name, loaded_weight in weights:
|
||||||
|
if not vllm_version_is("0.11.0"):
|
||||||
|
if name.endswith("patch_embed.proj.weight"):
|
||||||
|
loaded_weight = conv3d_to_linear_weight(loaded_weight)
|
||||||
|
|
||||||
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
||||||
if weight_name not in name:
|
if weight_name not in name:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -33,7 +33,8 @@ from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
|
|||||||
delete_torchair_cache_file)
|
delete_torchair_cache_file)
|
||||||
from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
|
from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
|
||||||
prefill_context_parallel_enable,
|
prefill_context_parallel_enable,
|
||||||
update_aclgraph_sizes, vllm_version_is)
|
update_aclgraph_sizes,
|
||||||
|
update_cudagraph_capture_sizes, vllm_version_is)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config import ModelConfig, VllmConfig
|
from vllm.config import ModelConfig, VllmConfig
|
||||||
@@ -142,24 +143,47 @@ class NPUPlatform(Platform):
|
|||||||
"Non-MLA LLMs forcibly disable the chunked prefill feature,"
|
"Non-MLA LLMs forcibly disable the chunked prefill feature,"
|
||||||
"as the performance of operators supporting this feature "
|
"as the performance of operators supporting this feature "
|
||||||
"functionality is currently suboptimal.")
|
"functionality is currently suboptimal.")
|
||||||
if not model_config.is_multimodal_model and \
|
if vllm_version_is("0.11.0"):
|
||||||
structured_outputs_config.backend == "auto" and \
|
if not model_config.is_multimodal_model and \
|
||||||
not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
|
structured_outputs_config.backend == "auto" and \
|
||||||
scheduler_config.policy == "fcfs":
|
not scheduler_config.send_delta_data and \
|
||||||
ascend_scheduler_config.enabled = True
|
not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
|
||||||
chunked_prefill_enabled_in_ascend_scheduler = getattr(
|
scheduler_config.policy == "fcfs":
|
||||||
ascend_scheduler_config, "enable_chunked_prefill", False)
|
ascend_scheduler_config.enabled = True
|
||||||
if chunked_prefill_enabled_in_ascend_scheduler:
|
chunked_prefill_enabled_in_ascend_scheduler = getattr(
|
||||||
logger.warning(
|
ascend_scheduler_config, "enable_chunked_prefill",
|
||||||
"Chunked prefill feature is enabled in ascend_scheduler,"
|
False)
|
||||||
"but note that the operator supporting this feature "
|
if chunked_prefill_enabled_in_ascend_scheduler:
|
||||||
"would lead to performance degradation.")
|
logger.warning(
|
||||||
# In this situation, max_num_batched_tokens would have been rewritten.
|
"Chunked prefill feature is enabled in ascend_scheduler,"
|
||||||
# So we must make sure max_num_batched_tokens is not smaller than max_model_len.
|
"but note that the operator supporting this feature "
|
||||||
if (scheduler_config.max_num_batched_tokens
|
"would lead to performance degradation.")
|
||||||
< scheduler_config.max_model_len
|
# In this situation, max_num_batched_tokens would have been rewritten.
|
||||||
and not chunked_prefill_enabled_in_ascend_scheduler):
|
# So we must make sure max_num_batched_tokens is not smaller than max_model_len.
|
||||||
scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
|
if (scheduler_config.max_num_batched_tokens
|
||||||
|
< scheduler_config.max_model_len and
|
||||||
|
not chunked_prefill_enabled_in_ascend_scheduler):
|
||||||
|
scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
|
||||||
|
else:
|
||||||
|
if not model_config.is_multimodal_model and \
|
||||||
|
structured_outputs_config.backend == "auto" and \
|
||||||
|
not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
|
||||||
|
scheduler_config.policy == "fcfs":
|
||||||
|
ascend_scheduler_config.enabled = True
|
||||||
|
chunked_prefill_enabled_in_ascend_scheduler = getattr(
|
||||||
|
ascend_scheduler_config, "enable_chunked_prefill",
|
||||||
|
False)
|
||||||
|
if chunked_prefill_enabled_in_ascend_scheduler:
|
||||||
|
logger.warning(
|
||||||
|
"Chunked prefill feature is enabled in ascend_scheduler,"
|
||||||
|
"but note that the operator supporting this feature "
|
||||||
|
"would lead to performance degradation.")
|
||||||
|
# In this situation, max_num_batched_tokens would have been rewritten.
|
||||||
|
# So we must make sure max_num_batched_tokens is not smaller than max_model_len.
|
||||||
|
if (scheduler_config.max_num_batched_tokens
|
||||||
|
< scheduler_config.max_model_len and
|
||||||
|
not chunked_prefill_enabled_in_ascend_scheduler):
|
||||||
|
scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
|
||||||
|
|
||||||
kv_cache_dtype = vllm_config.additional_config.get(
|
kv_cache_dtype = vllm_config.additional_config.get(
|
||||||
"kv_cache_dtype", None)
|
"kv_cache_dtype", None)
|
||||||
@@ -237,8 +261,12 @@ class NPUPlatform(Platform):
|
|||||||
f"{vllm_config.parallel_config.tensor_parallel_size}")
|
f"{vllm_config.parallel_config.tensor_parallel_size}")
|
||||||
if len(sp_aclgraph_sizes) != len(original_sizes):
|
if len(sp_aclgraph_sizes) != len(original_sizes):
|
||||||
compilation_config.cudagraph_capture_sizes = sp_aclgraph_sizes
|
compilation_config.cudagraph_capture_sizes = sp_aclgraph_sizes
|
||||||
vllm_config.compilation_config.init_with_cudagraph_sizes(
|
if vllm_version_is("0.11.0"):
|
||||||
sp_aclgraph_sizes)
|
compilation_config.init_with_cudagraph_sizes(
|
||||||
|
sp_aclgraph_sizes)
|
||||||
|
else:
|
||||||
|
update_cudagraph_capture_sizes(vllm_config,
|
||||||
|
sp_aclgraph_sizes)
|
||||||
|
|
||||||
# TODO: Full graph is fully supported later, and the default value will be set to full graph.
|
# TODO: Full graph is fully supported later, and the default value will be set to full graph.
|
||||||
if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
|
if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
|
||||||
|
|||||||
@@ -5,10 +5,17 @@ import torch
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
import vllm.v1.sample.rejection_sampler as rs
|
import vllm.v1.sample.rejection_sampler as rs
|
||||||
from vllm.v1.sample.metadata import SamplingMetadata
|
from vllm.v1.sample.metadata import SamplingMetadata
|
||||||
from vllm.v1.sample.rejection_sampler import (RejectionSampler, compute_probs,
|
from vllm.v1.sample.rejection_sampler import (RejectionSampler,
|
||||||
generate_uniform_probs)
|
generate_uniform_probs)
|
||||||
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
||||||
|
|
||||||
|
from vllm_ascend.utils import vllm_version_is
|
||||||
|
|
||||||
|
if vllm_version_is("0.11.0"):
|
||||||
|
from vllm.v1.sample.rejection_sampler import compute_probs
|
||||||
|
else:
|
||||||
|
from vllm.v1.sample.rejection_sampler import apply_sampling_constraints
|
||||||
|
|
||||||
PLACEHOLDER_TOKEN_ID = -1
|
PLACEHOLDER_TOKEN_ID = -1
|
||||||
GREEDY_TEMPERATURE = -1
|
GREEDY_TEMPERATURE = -1
|
||||||
# Maximum number of speculative draft tokens allowed per request in a single
|
# Maximum number of speculative draft tokens allowed per request in a single
|
||||||
@@ -82,11 +89,19 @@ class AscendRejectionSampler(RejectionSampler, nn.Module):
|
|||||||
# [num_tokens, vocab_size]
|
# [num_tokens, vocab_size]
|
||||||
# NOTE(woosuk): `target_logits` can be updated in place inside the
|
# NOTE(woosuk): `target_logits` can be updated in place inside the
|
||||||
# `compute_probs` function.
|
# `compute_probs` function.
|
||||||
target_probs = compute_probs(
|
if vllm_version_is("0.11.0"):
|
||||||
target_logits,
|
target_probs = compute_probs(
|
||||||
metadata.cu_num_draft_tokens,
|
target_logits,
|
||||||
sampling_metadata,
|
metadata.cu_num_draft_tokens,
|
||||||
)
|
sampling_metadata,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
target_logits = apply_sampling_constraints(
|
||||||
|
target_logits,
|
||||||
|
metadata.cu_num_draft_tokens,
|
||||||
|
sampling_metadata,
|
||||||
|
)
|
||||||
|
target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)
|
||||||
|
|
||||||
output_token_ids = rejection_sample(
|
output_token_ids = rejection_sample(
|
||||||
metadata.draft_token_ids,
|
metadata.draft_token_ids,
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
|||||||
from vllm.model_executor.model_loader import get_model
|
from vllm.model_executor.model_loader import get_model
|
||||||
from vllm.model_executor.models import supports_multimodal
|
from vllm.model_executor.models import supports_multimodal
|
||||||
from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
|
from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
|
||||||
from vllm.utils import is_pin_memory_available
|
|
||||||
from vllm.v1.core.sched.output import SchedulerOutput
|
from vllm.v1.core.sched.output import SchedulerOutput
|
||||||
from vllm.v1.sample.metadata import SamplingMetadata
|
from vllm.v1.sample.metadata import SamplingMetadata
|
||||||
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
||||||
@@ -27,8 +26,10 @@ from vllm_ascend.utils import vllm_version_is
|
|||||||
|
|
||||||
if vllm_version_is("0.11.0"):
|
if vllm_version_is("0.11.0"):
|
||||||
from vllm.config import CompilationLevel
|
from vllm.config import CompilationLevel
|
||||||
|
from vllm.utils import is_pin_memory_available
|
||||||
else:
|
else:
|
||||||
from vllm.config import CompilationMode
|
from vllm.config import CompilationMode
|
||||||
|
from vllm.utils.platform_utils import is_pin_memory_available
|
||||||
|
|
||||||
PADDING_SLOT_ID = -1
|
PADDING_SLOT_ID = -1
|
||||||
|
|
||||||
|
|||||||
@@ -311,6 +311,41 @@ def get_max_hidden_layers(hf_config) -> int:
|
|||||||
return max(layer_counts)
|
return max(layer_counts)
|
||||||
|
|
||||||
|
|
||||||
|
# Update cudagraph capture sizes for vllm config
|
||||||
|
def update_cudagraph_capture_sizes(vllm_config: VllmConfig,
|
||||||
|
cudagraph_capture_sizes: List[int]):
|
||||||
|
|
||||||
|
valid_max_size = (cudagraph_capture_sizes[-1]
|
||||||
|
if cudagraph_capture_sizes else 0)
|
||||||
|
if (vllm_config.compilation_config.max_cudagraph_capture_size is not None
|
||||||
|
and vllm_config.compilation_config.max_cudagraph_capture_size
|
||||||
|
!= valid_max_size):
|
||||||
|
if vllm_config.compilation_config.cudagraph_capture_sizes is not None:
|
||||||
|
raise ValueError(
|
||||||
|
"customized max_cudagraph_capture_size"
|
||||||
|
f"(={vllm_config.compilation_config.max_cudagraph_capture_size}) "
|
||||||
|
"should be consistent with the max value of "
|
||||||
|
f"cudagraph_capture_sizes(={valid_max_size})")
|
||||||
|
logger.warning(
|
||||||
|
"Truncating max_cudagraph_capture_size to %d",
|
||||||
|
valid_max_size,
|
||||||
|
)
|
||||||
|
|
||||||
|
vllm_config.compilation_config.max_cudagraph_capture_size = valid_max_size
|
||||||
|
|
||||||
|
if vllm_config.compilation_config.cudagraph_capture_sizes is not None and len(
|
||||||
|
cudagraph_capture_sizes) < len(
|
||||||
|
vllm_config.compilation_config.cudagraph_capture_sizes):
|
||||||
|
logger.warning(
|
||||||
|
("cudagraph_capture_sizes specified in compilation_config"
|
||||||
|
" %s is overridden by config %s"),
|
||||||
|
vllm_config.compilation_config.cudagraph_capture_sizes,
|
||||||
|
cudagraph_capture_sizes,
|
||||||
|
)
|
||||||
|
vllm_config.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes
|
||||||
|
vllm_config.compilation_config.post_init_cudagraph_sizes()
|
||||||
|
|
||||||
|
|
||||||
def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
|
def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
|
||||||
"""Update ACL graph capture sizes based on hardware limitations"""
|
"""Update ACL graph capture sizes based on hardware limitations"""
|
||||||
# NOTE: Currently, we can only capture 1800 graphs at most,
|
# NOTE: Currently, we can only capture 1800 graphs at most,
|
||||||
@@ -402,7 +437,10 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
|
|||||||
indices[0], indices[-1] = 0, len(original_sizes) - 1
|
indices[0], indices[-1] = 0, len(original_sizes) - 1
|
||||||
|
|
||||||
sampled_sizes = [original_sizes[i] for i in indices]
|
sampled_sizes = [original_sizes[i] for i in indices]
|
||||||
compilation_config.init_with_cudagraph_sizes(sampled_sizes)
|
if vllm_version_is("0.11.0"):
|
||||||
|
compilation_config.init_with_cudagraph_sizes(sampled_sizes)
|
||||||
|
else:
|
||||||
|
update_cudagraph_capture_sizes(vllm_config, sampled_sizes)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes",
|
"Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes",
|
||||||
@@ -433,7 +471,10 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
|
|||||||
if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs:
|
if original_sizes[0] < (num_speculative_tokens + 1) * max_num_seqs:
|
||||||
enlarged_sizes = [(num_speculative_tokens + 1) * size
|
enlarged_sizes = [(num_speculative_tokens + 1) * size
|
||||||
for size in original_sizes]
|
for size in original_sizes]
|
||||||
compilation_config.init_with_cudagraph_sizes(enlarged_sizes)
|
if vllm_version_is("0.11.0"):
|
||||||
|
compilation_config.init_with_cudagraph_sizes(enlarged_sizes)
|
||||||
|
else:
|
||||||
|
update_cudagraph_capture_sizes(vllm_config, enlarged_sizes)
|
||||||
logger.info(
|
logger.info(
|
||||||
"Adjusted ACL graphs: %s → %s for speculative decoding",
|
"Adjusted ACL graphs: %s → %s for speculative decoding",
|
||||||
original_sizes, enlarged_sizes)
|
original_sizes, enlarged_sizes)
|
||||||
|
|||||||
@@ -72,7 +72,7 @@ from vllm.pooling_params import PoolingParams
|
|||||||
from vllm.sampling_params import SamplingType
|
from vllm.sampling_params import SamplingType
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
|
from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
|
||||||
from vllm.utils import cdiv, is_pin_memory_available
|
from vllm.utils import cdiv
|
||||||
from vllm.utils.jsontree import json_map_leaves
|
from vllm.utils.jsontree import json_map_leaves
|
||||||
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
|
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
|
||||||
from vllm.v1.attention.backends.utils import (
|
from vllm.v1.attention.backends.utils import (
|
||||||
@@ -159,13 +159,14 @@ else:
|
|||||||
if vllm_version_is("0.11.0"):
|
if vllm_version_is("0.11.0"):
|
||||||
from vllm.attention.layer import Attention
|
from vllm.attention.layer import Attention
|
||||||
from vllm.config import CompilationLevel
|
from vllm.config import CompilationLevel
|
||||||
from vllm.utils import LazyLoader
|
from vllm.utils import LazyLoader, is_pin_memory_available
|
||||||
|
|
||||||
from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention
|
from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention
|
||||||
else:
|
else:
|
||||||
from vllm.attention.layer import MLAAttention
|
from vllm.attention.layer import MLAAttention
|
||||||
from vllm.config import CompilationMode
|
from vllm.config import CompilationMode
|
||||||
from vllm.utils.import_utils import LazyLoader
|
from vllm.utils.import_utils import LazyLoader
|
||||||
|
from vllm.utils.platform_utils import is_pin_memory_available
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
import xgrammar as xgr # type: ignore[import-untyped]
|
import xgrammar as xgr # type: ignore[import-untyped]
|
||||||
@@ -386,7 +387,11 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
self.drafter = get_spec_decode_method(
|
self.drafter = get_spec_decode_method(
|
||||||
self.speculative_config.method, self.vllm_config,
|
self.speculative_config.method, self.vllm_config,
|
||||||
self.device, self)
|
self.device, self)
|
||||||
self.rejection_sampler = AscendRejectionSampler()
|
if vllm_version_is("0.11.0"):
|
||||||
|
self.rejection_sampler = AscendRejectionSampler()
|
||||||
|
else:
|
||||||
|
self.rejection_sampler = AscendRejectionSampler(
|
||||||
|
self.sampler)
|
||||||
self.actual_seq_lengths_q = list(
|
self.actual_seq_lengths_q = list(
|
||||||
range(self.decode_token_per_req, self.max_num_tokens + 1,
|
range(self.decode_token_per_req, self.max_num_tokens + 1,
|
||||||
self.decode_token_per_req))
|
self.decode_token_per_req))
|
||||||
@@ -1885,6 +1890,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
# TODO: Optimize the CPU -> NPU copy.
|
# TODO: Optimize the CPU -> NPU copy.
|
||||||
cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to(
|
cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to(
|
||||||
self.device, non_blocking=True)
|
self.device, non_blocking=True)
|
||||||
|
if not vllm_version_is("0.11.0"):
|
||||||
|
cu_num_sampled_tokens = torch.from_numpy(cu_num_sampled_tokens).to(
|
||||||
|
self.device, non_blocking=True)
|
||||||
logits_indices = torch.from_numpy(logits_indices).to(self.device,
|
logits_indices = torch.from_numpy(logits_indices).to(self.device,
|
||||||
non_blocking=True)
|
non_blocking=True)
|
||||||
target_logits_indices = torch.from_numpy(target_logits_indices).to(
|
target_logits_indices = torch.from_numpy(target_logits_indices).to(
|
||||||
@@ -1896,15 +1904,25 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
# draft_token_indices: [ 1, 2, 3, 105, 106, 208]
|
# draft_token_indices: [ 1, 2, 3, 105, 106, 208]
|
||||||
draft_token_ids = self.input_ids[logits_indices]
|
draft_token_ids = self.input_ids[logits_indices]
|
||||||
draft_token_ids = draft_token_ids[target_logits_indices + 1]
|
draft_token_ids = draft_token_ids[target_logits_indices + 1]
|
||||||
|
if vllm_version_is("0.11.0"):
|
||||||
metadata = SpecDecodeMetadata(
|
metadata = SpecDecodeMetadata(
|
||||||
draft_token_ids=draft_token_ids,
|
draft_token_ids=draft_token_ids,
|
||||||
num_draft_tokens=num_draft_tokens.tolist(),
|
num_draft_tokens=num_draft_tokens.tolist(),
|
||||||
cu_num_draft_tokens=cu_num_draft_tokens,
|
cu_num_draft_tokens=cu_num_draft_tokens,
|
||||||
target_logits_indices=target_logits_indices,
|
target_logits_indices=target_logits_indices,
|
||||||
bonus_logits_indices=bonus_logits_indices,
|
bonus_logits_indices=bonus_logits_indices,
|
||||||
logits_indices=logits_indices,
|
logits_indices=logits_indices,
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
metadata = SpecDecodeMetadata(
|
||||||
|
draft_token_ids=draft_token_ids,
|
||||||
|
num_draft_tokens=num_draft_tokens.tolist(),
|
||||||
|
cu_num_draft_tokens=cu_num_draft_tokens,
|
||||||
|
cu_num_sampled_tokens=cu_num_sampled_tokens,
|
||||||
|
target_logits_indices=target_logits_indices,
|
||||||
|
bonus_logits_indices=bonus_logits_indices,
|
||||||
|
logits_indices=logits_indices,
|
||||||
|
)
|
||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
def apply_grammar_bitmask(
|
def apply_grammar_bitmask(
|
||||||
|
|||||||
Reference in New Issue
Block a user