[main2main] upgrade vllm to 0308 (#7213)
### What this PR does / why we need it?
Update main2main to vllm 0308.
Upstream vLLM PRs that break compatibility:
* https://github.com/vllm-project/vllm/pull/30681
* https://github.com/vllm-project/vllm/pull/35552 removes `self.cudagraph_batch_sizes`
* https://github.com/vllm-project/vllm/pull/35158 replaces `clear_metadata` with `defer_finalize` (inverted meaning)
* https://github.com/vllm-project/vllm/pull/36006 removes `CacheConfig.cpu_offload_gb`
* https://github.com/vllm-project/vllm/pull/35472
* https://github.com/vllm-project/vllm/pull/34552 attn_metadata_builder
* https://github.com/vllm-project/vllm/pull/30515 profile_seq_lens
* https://github.com/vllm-project/vllm/pull/28053
- vLLM version: v0.16.0
- vLLM main: 4034c3d32e
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
Signed-off-by: menogrey <1299267905@qq.com>
Co-authored-by: MrZ20 <2609716663@qq.com>
This commit is contained in:
10
.github/workflows/_e2e_test.yaml
vendored
10
.github/workflows/_e2e_test.yaml
vendored
@@ -110,7 +110,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Upload timing data
|
- name: Upload timing data
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
if: ${{ inputs.continue_on_error == true }}
|
if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
|
||||||
with:
|
with:
|
||||||
name: timing-data-singlecard-light-part${{ matrix.part }}
|
name: timing-data-singlecard-light-part${{ matrix.part }}
|
||||||
path: test_timing_data.json
|
path: test_timing_data.json
|
||||||
@@ -200,7 +200,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Upload timing data
|
- name: Upload timing data
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
if: ${{ inputs.continue_on_error == true }}
|
if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
|
||||||
with:
|
with:
|
||||||
name: timing-data-singlecard-full-part${{ matrix.part }}
|
name: timing-data-singlecard-full-part${{ matrix.part }}
|
||||||
path: test_timing_data.json
|
path: test_timing_data.json
|
||||||
@@ -289,7 +289,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Upload timing data
|
- name: Upload timing data
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
if: ${{ inputs.continue_on_error == true }}
|
if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
|
||||||
with:
|
with:
|
||||||
name: timing-data-2card-light-part${{ matrix.part }}
|
name: timing-data-2card-light-part${{ matrix.part }}
|
||||||
path: test_timing_data.json
|
path: test_timing_data.json
|
||||||
@@ -378,7 +378,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Upload timing data
|
- name: Upload timing data
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
if: ${{ inputs.continue_on_error == true }}
|
if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
|
||||||
with:
|
with:
|
||||||
name: timing-data-2card-full-part${{ matrix.part }}
|
name: timing-data-2card-full-part${{ matrix.part }}
|
||||||
path: test_timing_data.json
|
path: test_timing_data.json
|
||||||
@@ -475,7 +475,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Upload timing data
|
- name: Upload timing data
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
if: ${{ inputs.continue_on_error == true }}
|
if: ${{ inputs.continue_on_error == true && github.event_name != 'pull_request' }}
|
||||||
with:
|
with:
|
||||||
name: timing-data-4card-full-part${{ matrix.part }}
|
name: timing-data-4card-full-part${{ matrix.part }}
|
||||||
path: test_timing_data.json
|
path: test_timing_data.json
|
||||||
|
|||||||
2
.github/workflows/bot_pr_create.yaml
vendored
2
.github/workflows/bot_pr_create.yaml
vendored
@@ -37,7 +37,7 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
- name: Get vLLM version
|
- name: Get vLLM version
|
||||||
run: |
|
run: |
|
||||||
VLLM_COMMIT=4034c3d32e30d01639459edd3ab486f56993876d
|
VLLM_COMMIT=4497431df654e46fb1fb5e64bf8611e762ae5d87
|
||||||
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"
|
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"
|
||||||
|
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ RUN apt-get update -y && \
|
|||||||
|
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
# For lint purpose, actually we need make a main2main matching.
|
# For lint purpose, actually we need make a main2main matching.
|
||||||
ARG VLLM_COMMIT=4034c3d32e30d01639459edd3ab486f56993876d
|
ARG VLLM_COMMIT=4497431df654e46fb1fb5e64bf8611e762ae5d87
|
||||||
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
|
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
|
||||||
cd /vllm-workspace/vllm && \
|
cd /vllm-workspace/vllm && \
|
||||||
git checkout $VLLM_COMMIT
|
git checkout $VLLM_COMMIT
|
||||||
|
|||||||
2
.github/workflows/pr_test_full.yaml
vendored
2
.github/workflows/pr_test_full.yaml
vendored
@@ -75,7 +75,7 @@ jobs:
|
|||||||
name: e2e-full
|
name: e2e-full
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
|
vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
|
||||||
needs: [changes]
|
needs: [changes]
|
||||||
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
|
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
|
||||||
uses: ./.github/workflows/_e2e_test.yaml
|
uses: ./.github/workflows/_e2e_test.yaml
|
||||||
|
|||||||
6
.github/workflows/pr_test_light.yaml
vendored
6
.github/workflows/pr_test_light.yaml
vendored
@@ -41,7 +41,7 @@ jobs:
|
|||||||
lint:
|
lint:
|
||||||
uses: ./.github/workflows/_pre_commit.yml
|
uses: ./.github/workflows/_pre_commit.yml
|
||||||
with:
|
with:
|
||||||
vllm: 4034c3d32e30d01639459edd3ab486f56993876d
|
vllm: 4497431df654e46fb1fb5e64bf8611e762ae5d87
|
||||||
changes:
|
changes:
|
||||||
runs-on: linux-aarch64-a2b3-0
|
runs-on: linux-aarch64-a2b3-0
|
||||||
outputs:
|
outputs:
|
||||||
@@ -90,7 +90,7 @@ jobs:
|
|||||||
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
|
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
|
vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
|
||||||
uses: ./.github/workflows/_unit_test.yaml
|
uses: ./.github/workflows/_unit_test.yaml
|
||||||
with:
|
with:
|
||||||
vllm: ${{ matrix.vllm_version }}
|
vllm: ${{ matrix.vllm_version }}
|
||||||
@@ -102,7 +102,7 @@ jobs:
|
|||||||
name: e2e-light
|
name: e2e-light
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
|
vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0]
|
||||||
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
||||||
needs: [lint, changes]
|
needs: [lint, changes]
|
||||||
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ jobs:
|
|||||||
name: refresh codecov
|
name: refresh codecov
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d]
|
vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87]
|
||||||
uses: ./.github/workflows/_unit_test.yaml
|
uses: ./.github/workflows/_unit_test.yaml
|
||||||
with:
|
with:
|
||||||
vllm: ${{ matrix.vllm_version }}
|
vllm: ${{ matrix.vllm_version }}
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
|
|||||||
|
|
||||||
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|
||||||
|-------------|--------------|------------------|-------------|--------------------|
|
|-------------|--------------|------------------|-------------|--------------------|
|
||||||
| main | 4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
|
| main | 4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
|
||||||
|
|
||||||
## Release cadence
|
## Release cadence
|
||||||
|
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig, FusedMoE
|
|||||||
|
|
||||||
from vllm_ascend.ascend_config import init_ascend_config
|
from vllm_ascend.ascend_config import init_ascend_config
|
||||||
from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
|
from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
|
||||||
from vllm_ascend.utils import vllm_version_is
|
|
||||||
# isort: on
|
# isort: on
|
||||||
|
|
||||||
|
|
||||||
@@ -22,22 +21,6 @@ class TestAscendConfig(unittest.TestCase):
|
|||||||
"eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2},
|
"eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2},
|
||||||
}
|
}
|
||||||
from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
|
from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
|
||||||
if vllm_version_is("0.16.0"):
|
|
||||||
moe_parallel_config = FusedMoEParallelConfig(
|
|
||||||
2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", is_sequence_parallel=True, enable_eplb=True)
|
|
||||||
moe_config = FusedMoEConfig(
|
|
||||||
num_experts=8,
|
|
||||||
experts_per_token=8,
|
|
||||||
hidden_dim=8192,
|
|
||||||
intermediate_size_per_partition=5,
|
|
||||||
num_local_experts=8,
|
|
||||||
activation="silu",
|
|
||||||
device="npu",
|
|
||||||
routing_method=RoutingMethodType.Simulated,
|
|
||||||
moe_parallel_config=moe_parallel_config,
|
|
||||||
in_dtype=torch.float16,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
moe_parallel_config = FusedMoEParallelConfig(
|
moe_parallel_config = FusedMoEParallelConfig(
|
||||||
2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl",
|
2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl",
|
||||||
enable_eplb=True)
|
enable_eplb=True)
|
||||||
|
|||||||
@@ -152,6 +152,7 @@ class NPUModelRunner310(NPUModelRunner):
|
|||||||
remove_lora: bool = True,
|
remove_lora: bool = True,
|
||||||
is_graph_capturing: bool = False,
|
is_graph_capturing: bool = False,
|
||||||
num_active_loras: int = 0,
|
num_active_loras: int = 0,
|
||||||
|
profile_seq_lens: int | None = None,
|
||||||
):
|
):
|
||||||
temporary_context = self.temporary_modify_uniform_decode_query_len() if uniform_decode else nullcontext()
|
temporary_context = self.temporary_modify_uniform_decode_query_len() if uniform_decode else nullcontext()
|
||||||
with temporary_context:
|
with temporary_context:
|
||||||
@@ -168,6 +169,7 @@ class NPUModelRunner310(NPUModelRunner):
|
|||||||
remove_lora=remove_lora,
|
remove_lora=remove_lora,
|
||||||
is_graph_capturing=is_graph_capturing,
|
is_graph_capturing=is_graph_capturing,
|
||||||
num_active_loras=num_active_loras,
|
num_active_loras=num_active_loras,
|
||||||
|
profile_seq_lens=profile_seq_lens,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _check_and_update_cudagraph_mode(
|
def _check_and_update_cudagraph_mode(
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
#
|
#
|
||||||
import copy
|
import copy
|
||||||
import functools
|
import functools
|
||||||
|
import logging
|
||||||
from collections.abc import Callable
|
from collections.abc import Callable
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
@@ -31,7 +32,9 @@ from vllm.config import VllmConfig
|
|||||||
from vllm.config.utils import Range
|
from vllm.config.utils import Range
|
||||||
|
|
||||||
from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
|
from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
|
||||||
from vllm_ascend.utils import COMPILATION_PASS_KEY
|
from vllm_ascend.utils import COMPILATION_PASS_KEY, vllm_version_is
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def compile_fx(graph: GraphModule, example_inputs: list, inner_compile: Callable, decompositions: dict) -> Callable:
|
def compile_fx(graph: GraphModule, example_inputs: list, inner_compile: Callable, decompositions: dict) -> Callable:
|
||||||
@@ -83,6 +86,11 @@ def npugraph_ex_compile(
|
|||||||
config.mode = "reduce-overhead"
|
config.mode = "reduce-overhead"
|
||||||
# execute FX graph in eager mode before graph mode to optimize FX graph.
|
# execute FX graph in eager mode before graph mode to optimize FX graph.
|
||||||
config.debug.run_eagerly = True
|
config.debug.run_eagerly = True
|
||||||
|
if not vllm_version_is("0.17.0"):
|
||||||
|
# This is a temporary fix to resolve issues with inplace operations in some testcases like test_whisper.
|
||||||
|
# Avoid rewriting torch.ops.aten.gelu.default to torch.ops.aten.gelu_.default, which would fall back to CPU
|
||||||
|
# and cause copy_between_host_and_device error.
|
||||||
|
config.debug.aclgraph.disable_reinplace_inplaceable_ops_pass = True
|
||||||
if ascend_compilation_config.enable_static_kernel:
|
if ascend_compilation_config.enable_static_kernel:
|
||||||
config.experimental_config.aclgraph._aclnn_static_shape_kernel = True
|
config.experimental_config.aclgraph._aclnn_static_shape_kernel = True
|
||||||
# According to the cudagraph_capture_size configuration, set the shapes
|
# According to the cudagraph_capture_size configuration, set the shapes
|
||||||
@@ -134,6 +142,22 @@ class AscendCompiler(CompilerInterface):
|
|||||||
# see https://github.com/pytorch/pytorch/issues/138980
|
# see https://github.com/pytorch/pytorch/issues/138980
|
||||||
graph = copy.deepcopy(graph)
|
graph = copy.deepcopy(graph)
|
||||||
|
|
||||||
|
if not vllm_version_is("0.17.0"):
|
||||||
|
from torch._guards import detect_fake_mode
|
||||||
|
|
||||||
|
current_fake_mode = detect_fake_mode()
|
||||||
|
if current_fake_mode is not None:
|
||||||
|
example_inputs = [
|
||||||
|
current_fake_mode.from_tensor(inp)
|
||||||
|
if (
|
||||||
|
isinstance(inp, torch.Tensor)
|
||||||
|
and hasattr(inp, "fake_mode")
|
||||||
|
and inp.fake_mode is not current_fake_mode
|
||||||
|
)
|
||||||
|
else inp
|
||||||
|
for inp in example_inputs
|
||||||
|
]
|
||||||
|
|
||||||
ascend_compilation_config = get_ascend_config().ascend_compilation_config
|
ascend_compilation_config = get_ascend_config().ascend_compilation_config
|
||||||
if ascend_compilation_config.enable_npugraph_ex:
|
if ascend_compilation_config.enable_npugraph_ex:
|
||||||
assert hasattr(self, "vllm_config")
|
assert hasattr(self, "vllm_config")
|
||||||
|
|||||||
@@ -18,17 +18,12 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch._inductor.pattern_matcher import PatternMatcherPass
|
from torch._inductor.pattern_matcher import PatternMatcherPass
|
||||||
|
from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.config.compilation import Range
|
from vllm.config.compilation import Range
|
||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
|
|
||||||
from vllm_ascend.compilation.passes.base_pattern import BasePattern
|
from vllm_ascend.compilation.passes.base_pattern import BasePattern
|
||||||
from vllm_ascend.utils import vllm_version_is
|
|
||||||
|
|
||||||
if vllm_version_is("0.15.0"):
|
|
||||||
from vllm.compilation.vllm_inductor_pass import VllmInductorPass # type: ignore
|
|
||||||
else:
|
|
||||||
from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
|
|
||||||
|
|
||||||
|
|
||||||
class MulsAddPattern(BasePattern):
|
class MulsAddPattern(BasePattern):
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ import vllm_ascend.patch.platform.patch_kv_cache_interface # noqa
|
|||||||
import vllm_ascend.patch.platform.patch_mamba_config # noqa
|
import vllm_ascend.patch.platform.patch_mamba_config # noqa
|
||||||
import vllm_ascend.patch.platform.patch_minimax_m2_config # noqa
|
import vllm_ascend.patch.platform.patch_minimax_m2_config # noqa
|
||||||
import vllm_ascend.patch.platform.patch_sched_yield # noqa
|
import vllm_ascend.patch.platform.patch_sched_yield # noqa
|
||||||
|
import vllm_ascend.patch.platform.patch_torch_accelerator # noqa
|
||||||
|
|
||||||
if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
|
if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
|
||||||
import vllm_ascend.patch.platform.patch_multiproc_executor # noqa
|
import vllm_ascend.patch.platform.patch_multiproc_executor # noqa
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import threading
|
import threading
|
||||||
import weakref
|
import weakref
|
||||||
from collections import deque
|
from collections import deque
|
||||||
@@ -19,6 +21,8 @@ from vllm.v1.executor.multiproc_executor import (
|
|||||||
set_multiprocessing_worker_envs,
|
set_multiprocessing_worker_envs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from vllm_ascend.utils import vllm_version_is
|
||||||
|
|
||||||
|
|
||||||
class AscendMultiprocExecutor(MultiprocExecutor):
|
class AscendMultiprocExecutor(MultiprocExecutor):
|
||||||
def _init_executor(self) -> None:
|
def _init_executor(self) -> None:
|
||||||
@@ -26,6 +30,7 @@ class AscendMultiprocExecutor(MultiprocExecutor):
|
|||||||
# and ensure workers will be terminated.
|
# and ensure workers will be terminated.
|
||||||
self._finalizer = weakref.finalize(self, self.shutdown)
|
self._finalizer = weakref.finalize(self, self.shutdown)
|
||||||
self.is_failed = False
|
self.is_failed = False
|
||||||
|
if vllm_version_is("0.17.0"):
|
||||||
self.shutdown_event = threading.Event()
|
self.shutdown_event = threading.Event()
|
||||||
self.failure_callback: FailureCallback | None = None
|
self.failure_callback: FailureCallback | None = None
|
||||||
|
|
||||||
@@ -66,6 +71,7 @@ class AscendMultiprocExecutor(MultiprocExecutor):
|
|||||||
success = False
|
success = False
|
||||||
try:
|
try:
|
||||||
global_start_rank = self.local_world_size * self.parallel_config.node_rank_within_dp
|
global_start_rank = self.local_world_size * self.parallel_config.node_rank_within_dp
|
||||||
|
if vllm_version_is("0.17.0"):
|
||||||
for local_rank in range(self.local_world_size):
|
for local_rank in range(self.local_world_size):
|
||||||
global_rank = global_start_rank + local_rank
|
global_rank = global_start_rank + local_rank
|
||||||
is_driver_worker = self._is_driver_worker(global_rank)
|
is_driver_worker = self._is_driver_worker(global_rank)
|
||||||
@@ -80,6 +86,29 @@ class AscendMultiprocExecutor(MultiprocExecutor):
|
|||||||
is_driver_worker=is_driver_worker,
|
is_driver_worker=is_driver_worker,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
# When using fork, keep track of socket file descriptors that are
|
||||||
|
# inherited by the worker, so that we can close them in subsequent
|
||||||
|
# workers
|
||||||
|
inherited_fds: list[int] | None = [] if context.get_start_method() == "fork" else None
|
||||||
|
|
||||||
|
for local_rank in range(self.local_world_size):
|
||||||
|
global_rank = global_start_rank + local_rank
|
||||||
|
is_driver_worker = self._is_driver_worker(global_rank)
|
||||||
|
unready_worker_handle = AscendWorkerProc.make_worker_process(
|
||||||
|
vllm_config=self.vllm_config,
|
||||||
|
local_rank=local_rank,
|
||||||
|
rank=global_rank,
|
||||||
|
distributed_init_method=distributed_init_method,
|
||||||
|
input_shm_handle=scheduler_output_handle,
|
||||||
|
shared_worker_lock=shared_worker_lock,
|
||||||
|
is_driver_worker=is_driver_worker,
|
||||||
|
inherited_fds=inherited_fds,
|
||||||
|
)
|
||||||
|
unready_workers.append(unready_worker_handle)
|
||||||
|
if inherited_fds is not None:
|
||||||
|
inherited_fds.append(unready_worker_handle.death_writer.fileno())
|
||||||
|
inherited_fds.append(unready_worker_handle.ready_pipe.fileno())
|
||||||
|
|
||||||
# Workers must be created before wait_for_ready to avoid
|
# Workers must be created before wait_for_ready to avoid
|
||||||
# deadlock, since worker.init_device() does a device sync.
|
# deadlock, since worker.init_device() does a device sync.
|
||||||
@@ -124,6 +153,8 @@ class AscendMultiprocExecutor(MultiprocExecutor):
|
|||||||
for uw in unready_workers:
|
for uw in unready_workers:
|
||||||
if uw.death_writer is not None:
|
if uw.death_writer is not None:
|
||||||
uw.death_writer.close()
|
uw.death_writer.close()
|
||||||
|
if not vllm_version_is("0.17.0"):
|
||||||
|
uw.death_writer = None
|
||||||
self._ensure_worker_termination([uw.proc for uw in unready_workers])
|
self._ensure_worker_termination([uw.proc for uw in unready_workers])
|
||||||
|
|
||||||
self.output_rank = self._get_output_rank()
|
self.output_rank = self._get_output_rank()
|
||||||
@@ -158,8 +189,10 @@ class AscendWorkerProc(WorkerProc):
|
|||||||
input_shm_handle, # Receive SchedulerOutput
|
input_shm_handle, # Receive SchedulerOutput
|
||||||
shared_worker_lock: LockType,
|
shared_worker_lock: LockType,
|
||||||
is_driver_worker: bool = False,
|
is_driver_worker: bool = False,
|
||||||
|
inherited_fds: list[int] | None = None,
|
||||||
) -> UnreadyWorkerProcHandle:
|
) -> UnreadyWorkerProcHandle:
|
||||||
context = get_mp_context()
|
context = get_mp_context()
|
||||||
|
if vllm_version_is("0.17.0"):
|
||||||
# (reader, writer)
|
# (reader, writer)
|
||||||
reader, writer = context.Pipe(duplex=False)
|
reader, writer = context.Pipe(duplex=False)
|
||||||
|
|
||||||
@@ -190,6 +223,42 @@ class AscendWorkerProc(WorkerProc):
|
|||||||
# Keep death_writer open in parent - when parent exits,
|
# Keep death_writer open in parent - when parent exits,
|
||||||
# death_reader in child will get EOFError
|
# death_reader in child will get EOFError
|
||||||
return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
|
return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
|
||||||
|
else:
|
||||||
|
# Ready pipe to communicate readiness from child to parent
|
||||||
|
ready_reader, ready_writer = context.Pipe(duplex=False)
|
||||||
|
# Death pipe to let child detect parent process exit
|
||||||
|
death_reader, death_writer = context.Pipe(duplex=False)
|
||||||
|
if inherited_fds is not None:
|
||||||
|
inherited_fds = inherited_fds.copy()
|
||||||
|
inherited_fds.extend((ready_reader.fileno(), death_writer.fileno()))
|
||||||
|
process_kwargs = {
|
||||||
|
"vllm_config": vllm_config,
|
||||||
|
"local_rank": local_rank,
|
||||||
|
"rank": rank,
|
||||||
|
"distributed_init_method": distributed_init_method,
|
||||||
|
"input_shm_handle": input_shm_handle,
|
||||||
|
"ready_pipe": ready_writer,
|
||||||
|
"death_pipe": death_reader,
|
||||||
|
"shared_worker_lock": shared_worker_lock,
|
||||||
|
"is_driver_worker": is_driver_worker,
|
||||||
|
# Have the worker close parent end of this worker's pipes too
|
||||||
|
"inherited_fds": inherited_fds if inherited_fds is not None else [],
|
||||||
|
}
|
||||||
|
# Run EngineCore busy loop in background process.
|
||||||
|
proc = context.Process(
|
||||||
|
target=WorkerProc.worker_main,
|
||||||
|
kwargs=process_kwargs,
|
||||||
|
name=f"VllmWorker-{rank}",
|
||||||
|
daemon=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
proc.start()
|
||||||
|
# Close child ends of pipes here in the parent
|
||||||
|
ready_writer.close()
|
||||||
|
death_reader.close()
|
||||||
|
# Keep death_writer open in parent - when parent exits,
|
||||||
|
# death_reader in child will get EOFError
|
||||||
|
return UnreadyWorkerProcHandle(proc, rank, ready_reader, death_writer)
|
||||||
|
|
||||||
|
|
||||||
vllm.v1.executor.multiproc_executor.MultiprocExecutor = AscendMultiprocExecutor
|
vllm.v1.executor.multiproc_executor.MultiprocExecutor = AscendMultiprocExecutor
|
||||||
|
|||||||
8
vllm_ascend/patch/platform/patch_torch_accelerator.py
Normal file
8
vllm_ascend/patch/platform/patch_torch_accelerator.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
def patch_empty_cache() -> None:
|
||||||
|
torch.npu.empty_cache()
|
||||||
|
|
||||||
|
|
||||||
|
torch.accelerator.empty_cache = patch_empty_cache
|
||||||
@@ -46,7 +46,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
|
|||||||
from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params
|
from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params
|
||||||
from vllm_ascend.ops.triton.spec_decode.utils import prepare_inputs_padded_kernel
|
from vllm_ascend.ops.triton.spec_decode.utils import prepare_inputs_padded_kernel
|
||||||
from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
|
from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
|
||||||
from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled, vllm_version_is
|
from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled
|
||||||
|
|
||||||
# Currently we will fix block size to a small one since `num_reqs` can't be too large
|
# Currently we will fix block size to a small one since `num_reqs` can't be too large
|
||||||
_PREPARE_INPUTS_BLOCK_SIZE = 4
|
_PREPARE_INPUTS_BLOCK_SIZE = 4
|
||||||
@@ -615,7 +615,6 @@ class SpecDecodeBaseProposer(EagleProposer):
|
|||||||
if not self.parallel_drafting:
|
if not self.parallel_drafting:
|
||||||
for draft_step in range(1, self.num_speculative_tokens):
|
for draft_step in range(1, self.num_speculative_tokens):
|
||||||
per_layer_attn_metadata = dict()
|
per_layer_attn_metadata = dict()
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
for attn_group in self.draft_attn_groups:
|
for attn_group in self.draft_attn_groups:
|
||||||
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
|
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
|
||||||
draft_step,
|
draft_step,
|
||||||
@@ -632,28 +631,12 @@ class SpecDecodeBaseProposer(EagleProposer):
|
|||||||
)
|
)
|
||||||
for layer_name in self.attn_layer_names:
|
for layer_name in self.attn_layer_names:
|
||||||
per_layer_attn_metadata[layer_name] = attn_metadata
|
per_layer_attn_metadata[layer_name] = attn_metadata
|
||||||
else:
|
|
||||||
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
|
|
||||||
draft_step,
|
|
||||||
attn_metadata,
|
|
||||||
common_attn_metadata,
|
|
||||||
batch_size,
|
|
||||||
num_input_tokens,
|
|
||||||
used_update_positions,
|
|
||||||
aclgraph_runtime_mode,
|
|
||||||
ori_seq_len,
|
|
||||||
slot_indices,
|
|
||||||
mtp_slot_mapping,
|
|
||||||
)
|
|
||||||
for layer_name in self.attn_layer_names:
|
|
||||||
per_layer_attn_metadata[layer_name] = attn_metadata
|
|
||||||
multi_steps_attn_metadata.append(per_layer_attn_metadata)
|
multi_steps_attn_metadata.append(per_layer_attn_metadata)
|
||||||
else:
|
else:
|
||||||
# Copy the old attn_metadata and update
|
# Copy the old attn_metadata and update
|
||||||
if not self.parallel_drafting:
|
if not self.parallel_drafting:
|
||||||
for draft_step in range(1, self.num_speculative_tokens):
|
for draft_step in range(1, self.num_speculative_tokens):
|
||||||
per_layer_attn_metadata = dict()
|
per_layer_attn_metadata = dict()
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
for attn_group in self.draft_attn_groups:
|
for attn_group in self.draft_attn_groups:
|
||||||
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
|
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
|
||||||
draft_step,
|
draft_step,
|
||||||
@@ -667,18 +650,6 @@ class SpecDecodeBaseProposer(EagleProposer):
|
|||||||
)
|
)
|
||||||
for layer_name in self.attn_layer_names:
|
for layer_name in self.attn_layer_names:
|
||||||
per_layer_attn_metadata[layer_name] = attn_metadata
|
per_layer_attn_metadata[layer_name] = attn_metadata
|
||||||
else:
|
|
||||||
common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
|
|
||||||
draft_step,
|
|
||||||
attn_metadata,
|
|
||||||
common_attn_metadata,
|
|
||||||
batch_size,
|
|
||||||
num_input_tokens,
|
|
||||||
used_update_positions,
|
|
||||||
aclgraph_runtime_mode,
|
|
||||||
)
|
|
||||||
for layer_name in self.attn_layer_names:
|
|
||||||
per_layer_attn_metadata[layer_name] = attn_metadata
|
|
||||||
multi_steps_attn_metadata.append(per_layer_attn_metadata)
|
multi_steps_attn_metadata.append(per_layer_attn_metadata)
|
||||||
|
|
||||||
token_indices_to_sample_len = token_indices_to_sample.shape[0]
|
token_indices_to_sample_len = token_indices_to_sample.shape[0]
|
||||||
@@ -1082,16 +1053,11 @@ class SpecDecodeBaseProposer(EagleProposer):
|
|||||||
# 2.
|
# 2.
|
||||||
# Recompute the slot mapping based on the new positions and
|
# Recompute the slot mapping based on the new positions and
|
||||||
# rejection mask.
|
# rejection mask.
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
# Use the first draft attention group's kv_cache_spec for block_size
|
# Use the first draft attention group's kv_cache_spec for block_size
|
||||||
# (all draft layers share the same kv-cache group)
|
# (all draft layers share the same kv-cache group)
|
||||||
assert len(self.draft_attn_groups) > 0
|
assert len(self.draft_attn_groups) > 0
|
||||||
block_size = self.draft_attn_groups[0].kv_cache_spec.block_size
|
block_size = self.draft_attn_groups[0].kv_cache_spec.block_size
|
||||||
else:
|
|
||||||
if self.attn_metadata_builder is None:
|
|
||||||
block_size = self._get_attention_metadata_builder().kv_cache_spec.block_size
|
|
||||||
else:
|
|
||||||
block_size = self.attn_metadata_builder.kv_cache_spec.block_size
|
|
||||||
new_slot_mapping = compute_new_slot_mapping(
|
new_slot_mapping = compute_new_slot_mapping(
|
||||||
cad=cad,
|
cad=cad,
|
||||||
new_positions=self.positions[:total_num_output_tokens],
|
new_positions=self.positions[:total_num_output_tokens],
|
||||||
@@ -1130,7 +1096,6 @@ class SpecDecodeBaseProposer(EagleProposer):
|
|||||||
attn_group=None,
|
attn_group=None,
|
||||||
):
|
):
|
||||||
assert draft_step > 0
|
assert draft_step > 0
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group"
|
assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group"
|
||||||
common_attn_metadata = self.shallow_copy_metadata(old_common_metadata)
|
common_attn_metadata = self.shallow_copy_metadata(old_common_metadata)
|
||||||
|
|
||||||
@@ -1243,13 +1208,7 @@ class SpecDecodeBaseProposer(EagleProposer):
|
|||||||
# Set the address of the attn_metadata.slot_mapping to the self.slot_mapping_group[idx]
|
# Set the address of the attn_metadata.slot_mapping to the self.slot_mapping_group[idx]
|
||||||
common_attn_metadata.slot_mapping = self.slot_mapping_group[draft_step]
|
common_attn_metadata.slot_mapping = self.slot_mapping_group[draft_step]
|
||||||
|
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
attn_metadata_builder = attn_group.get_metadata_builder()
|
attn_metadata_builder = attn_group.get_metadata_builder()
|
||||||
else:
|
|
||||||
if self.attn_metadata_builder is None:
|
|
||||||
attn_metadata_builder = self._get_attention_metadata_builder()
|
|
||||||
else:
|
|
||||||
attn_metadata_builder = self.attn_metadata_builder
|
|
||||||
|
|
||||||
attn_metadata = attn_metadata_builder.build_for_drafting(
|
attn_metadata = attn_metadata_builder.build_for_drafting(
|
||||||
common_attn_metadata=common_attn_metadata,
|
common_attn_metadata=common_attn_metadata,
|
||||||
|
|||||||
@@ -412,7 +412,6 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
self.cpu_slot_mapping = None
|
self.cpu_slot_mapping = None
|
||||||
self.sampling_done_event: torch.npu.Event | None = None
|
self.sampling_done_event: torch.npu.Event | None = None
|
||||||
|
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
# self.cudagraph_batch_sizes sorts in ascending order.
|
# self.cudagraph_batch_sizes sorts in ascending order.
|
||||||
if (
|
if (
|
||||||
self.compilation_config.cudagraph_capture_sizes
|
self.compilation_config.cudagraph_capture_sizes
|
||||||
@@ -1376,7 +1375,12 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
skip_compiled=has_encoder_input,
|
skip_compiled=has_encoder_input,
|
||||||
),
|
),
|
||||||
self.maybe_get_kv_connector_output(
|
self.maybe_get_kv_connector_output(
|
||||||
scheduler_output, clear_metadata=clear_kv_metadata
|
scheduler_output,
|
||||||
|
**(
|
||||||
|
{"clear_metadata": clear_kv_metadata}
|
||||||
|
if vllm_version_is("0.17.0")
|
||||||
|
else {"defer_finalize": not clear_kv_metadata}
|
||||||
|
),
|
||||||
) as kv_connector_output,
|
) as kv_connector_output,
|
||||||
):
|
):
|
||||||
hidden_states = self._model_forward(
|
hidden_states = self._model_forward(
|
||||||
@@ -2253,6 +2257,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
remove_lora: bool = True,
|
remove_lora: bool = True,
|
||||||
is_graph_capturing: bool = False,
|
is_graph_capturing: bool = False,
|
||||||
num_active_loras: int = 0,
|
num_active_loras: int = 0,
|
||||||
|
profile_seq_lens: int | None = None,
|
||||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
# only support eager mode and piecewise graph now
|
# only support eager mode and piecewise graph now
|
||||||
assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes()
|
assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes()
|
||||||
@@ -2359,6 +2364,9 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
# seq_lens. We use this seq_len only when capturing graph, and still use max_query_len
|
# seq_lens. We use this seq_len only when capturing graph, and still use max_query_len
|
||||||
# in inference. This will be removed once npu_fused_infer_attention_score
|
# in inference. This will be removed once npu_fused_infer_attention_score
|
||||||
# outperforms _npu_paged_attention on all cases.
|
# outperforms _npu_paged_attention on all cases.
|
||||||
|
if profile_seq_lens is not None:
|
||||||
|
seq_lens = profile_seq_lens
|
||||||
|
else:
|
||||||
seq_lens = (
|
seq_lens = (
|
||||||
SEQ_LEN_WITH_MAX_PA_WORKSPACE
|
SEQ_LEN_WITH_MAX_PA_WORKSPACE
|
||||||
if is_graph_capturing and using_paged_attention(num_tokens, self.vllm_config)
|
if is_graph_capturing and using_paged_attention(num_tokens, self.vllm_config)
|
||||||
@@ -2579,7 +2587,6 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
|
|
||||||
self.may_reinitialize_input_batch(kv_cache_config)
|
self.may_reinitialize_input_batch(kv_cache_config)
|
||||||
kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
|
kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
|
||||||
if vllm_version_is("0.17.0"):
|
|
||||||
# TODO: refactor the logic of attention
|
# TODO: refactor the logic of attention
|
||||||
# Initialize drafter attention group initialization
|
# Initialize drafter attention group initialization
|
||||||
if self.speculative_config and (
|
if self.speculative_config and (
|
||||||
@@ -3031,11 +3038,18 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
max_num_blocks.append(max_num_blocks_per_req)
|
max_num_blocks.append(max_num_blocks_per_req)
|
||||||
|
|
||||||
if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
|
if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
|
||||||
|
if vllm_version_is("0.17.0"):
|
||||||
assert self.cache_config.cpu_offload_gb == 0, (
|
assert self.cache_config.cpu_offload_gb == 0, (
|
||||||
"Cannot re-initialize the input batch when CPU weight "
|
"Cannot re-initialize the input batch when CPU weight "
|
||||||
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
||||||
"for more details."
|
"for more details."
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
assert self.offload_config.uva.cpu_offload_gb == 0, (
|
||||||
|
"Cannot re-initialize the input batch when CPU weight "
|
||||||
|
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
||||||
|
"for more details."
|
||||||
|
)
|
||||||
self.input_batch = NPUInputBatch(
|
self.input_batch = NPUInputBatch(
|
||||||
max_num_reqs=self.max_num_reqs,
|
max_num_reqs=self.max_num_reqs,
|
||||||
max_model_len=max_model_len,
|
max_model_len=max_model_len,
|
||||||
|
|||||||
Reference in New Issue
Block a user