[bugfix] fix extra NPU context on device 0 (#8041)
### What this PR does / why we need it?

When we launch a PD-disaggregated (prefill/decode disaggregation) deployment and send requests, an extra process appears on NPU 0. This happens because a thread that holds a primary CUDA (here, NPU) context does not pass it on: a child thread it creates does not automatically inherit that context, so runtime calls made in the child end up creating a new context on device 0. See https://forums.developer.nvidia.com/t/when-a-thread-has-a-primary-cuda-context-does-the-child-thread-it-creates-automatically-inherit-the-cuda-context/362810.

vLLM fixed this issue in [pr-37449](https://github.com/vllm-project/vllm/pull/37449), but version 0.18.0 does not include the fix, so we patch it here.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

---------

Signed-off-by: zouyida <zouyida@huawei.com>

Co-authored-by: zouyida <zouyida@huawei.com>
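For context, here is a minimal standalone sketch of the underlying behaviour (illustrative only, not part of this PR; it assumes `torch_npu` is installed and the device index is arbitrary). A newly created thread does not inherit its parent's current device, so it starts on device 0, and its first runtime call would materialise a context on NPU 0 unless the thread binds itself to the worker's device first, which is exactly what the patched `async_output_busy_loop` in the diff below does through `current_platform.set_device`.

```python
# Standalone sketch, not part of this PR: why a helper thread must bind the
# worker's device before touching the runtime. Assumes torch_npu is available;
# the device index 3 is arbitrary.
from threading import Thread

import torch
import torch_npu  # noqa: F401  (registers the "npu" device with torch)


def output_copy_loop(device_index: int) -> None:
    # A fresh thread starts with device 0 as its current device, regardless of
    # what the parent thread had selected, so an implicit-device call here
    # would otherwise create an extra context on NPU 0.
    torch.npu.set_device(device_index)
    stream = torch.npu.current_stream()            # now belongs to the intended NPU
    buf = torch.empty(4, device=f"npu:{device_index}")
    print(stream, buf.device)


t = Thread(target=output_copy_loop, args=(3,), daemon=True, name="OutputCopy")
t.start()
t.join()
```

The diff touches two modules: the platform patch registration, where the multiproc-executor patch import becomes unconditional, and the multiproc-executor patch itself, which gains an `AscendWorkerProc` that performs this device binding in its async output copy thread.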
```diff
@@ -14,8 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import os
-
 import vllm_ascend.patch.platform.patch_distributed  # noqa
 import vllm_ascend.patch.platform.patch_fusion_matcher_compat_ops  # noqa
 import vllm_ascend.patch.platform.patch_kv_cache_interface  # noqa
@@ -27,13 +25,11 @@ if not is_310p():
 else:
     import vllm_ascend.patch.platform.patch_mamba_config_310  # noqa
 import vllm_ascend.patch.platform.patch_minimax_m2_config  # noqa
+import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
 import vllm_ascend.patch.platform.patch_torch_accelerator  # noqa
 import vllm_ascend.patch.platform.patch_minimax_usage_accounting  # noqa
 import vllm_ascend.patch.platform.patch_glm_tool_call_parser  # noqa

-if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
-    import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
-
 if envs.VLLM_ASCEND_BALANCE_SCHEDULING:
     import vllm_ascend.patch.platform.patch_balance_schedule  # noqa
```
```diff
@@ -1,14 +1,20 @@
 from __future__ import annotations

+import os
+import queue
 import weakref
 from collections import deque
 from collections.abc import Callable
 from multiprocessing.synchronize import Lock as LockType
+from threading import Thread

 import vllm.v1.executor.multiproc_executor
 from vllm import envs
 from vllm.config import VllmConfig
 from vllm.distributed.device_communicators.shm_broadcast import Handle, MessageQueue
+from vllm.envs import enable_envs_cache
+from vllm.platforms import current_platform
+from vllm.tracing import instrument
 from vllm.utils.network_utils import get_distributed_init_method, get_loopback_ip, get_open_port
 from vllm.utils.system_utils import get_mp_context
 from vllm.v1.executor.abstract import FailureCallback
@@ -19,6 +25,7 @@ from vllm.v1.executor.multiproc_executor import (
     WorkerProc,
     set_multiprocessing_worker_envs,
 )
+from vllm.v1.worker.worker_base import WorkerWrapperBase


 class AscendMultiprocExecutor(MultiprocExecutor):
@@ -159,6 +166,79 @@ class AscendMultiprocExecutor(MultiprocExecutor):


 class AscendWorkerProc(WorkerProc):
+    @instrument(span_name="Worker init")
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        local_rank: int,
+        rank: int,
+        distributed_init_method: str,
+        input_shm_handle: Handle,
+        shared_worker_lock: LockType,
+        is_driver_worker: bool,
+    ):
+        self.rank = rank
+        wrapper = WorkerWrapperBase(rpc_rank=local_rank, global_rank=rank)
+        # TODO: move `init_worker` to executor level as a collective rpc call
+        all_kwargs: list[dict] = [{} for _ in range(vllm_config.parallel_config.world_size)]
+        all_kwargs[local_rank] = {
+            "vllm_config": vllm_config,
+            "local_rank": local_rank,
+            "rank": rank,
+            "distributed_init_method": distributed_init_method,
+            "is_driver_worker": is_driver_worker,
+            "shared_worker_lock": shared_worker_lock,
+        }
+        wrapper.init_worker(all_kwargs)
+        self.worker = wrapper
+
+        self.setup_proc_title_and_log_prefix(enable_ep=vllm_config.parallel_config.enable_expert_parallel)
+
+        # Load model
+        is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
+        if not is_eep_new_worker:
+            self.worker.init_device()
+            # Update process title now that parallel groups are initialized
+            self.setup_proc_title_and_log_prefix(enable_ep=vllm_config.parallel_config.enable_expert_parallel)
+        self.worker.load_model()
+
+        scheduler_config = vllm_config.scheduler_config
+        self.use_async_scheduling = scheduler_config.async_scheduling
+        if self.use_async_scheduling:
+            self.async_output_queue: queue.Queue = queue.Queue()
+            self.async_output_copy_thread = Thread(
+                target=self.async_output_busy_loop,
+                daemon=True,
+                name="WorkerAsyncOutputCopy",
+            )
+            self.async_output_copy_thread.start()
+
+        # Set block size based on the attention backends
+        current_platform.update_block_size_for_backend(vllm_config)
+
+        # Initialize message queues after init_device() since multi-node setups
+        # (nnodes_within_dp > 1) require distributed groups to be initialized
+        self._init_message_queues(input_shm_handle, vllm_config)
+
+        # Enable environment variable cache (e.g. assume no more
+        # environment variable overrides after this point)
+        enable_envs_cache()
+
+    @staticmethod
+    def worker_main(*args, **kwargs):
+        from vllm_ascend.utils import adapt_patch
+
+        adapt_patch(is_global_patch=True)
+        WorkerProc.worker_main(*args, **kwargs)
+
+    def async_output_busy_loop(self):
+        """Entrypoint for the thread which handles outputs asynchronously."""
+        if hasattr(self.worker, "device"):
+            current_platform.set_device(self.worker.device)
+        while True:
+            output = self.async_output_queue.get()
+            self.enqueue_output(output)
+
     @staticmethod
     def make_worker_process(
         vllm_config: VllmConfig,
@@ -192,11 +272,15 @@ class AscendWorkerProc(WorkerProc):
             "inherited_fds": inherited_fds if inherited_fds is not None else [],
         }
         # Run EngineCore busy loop in background process.
+        daemon_mode = not (
+            os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1")
+            or os.getenv("EXPERT_MAP_RECORD", "false") == "true"
+        )
         proc = context.Process(
-            target=WorkerProc.worker_main,
+            target=AscendWorkerProc.worker_main,
             kwargs=process_kwargs,
             name=f"VllmWorker-{rank}",
-            daemon=False,
+            daemon=daemon_mode,
         )

         proc.start()
```
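A note on the `daemon_mode` change above: Python forbids daemonic processes from spawning children, and the dynamic-EPLB / expert-map-recording paths presumably need the worker process to do exactly that, so the worker is made non-daemonic in those modes. A standalone sketch of the constraint (none of this is vLLM code):

```python
# Standalone sketch: a daemonic process cannot start child processes.
import multiprocessing as mp


def spawn_child() -> None:
    # Inside a daemonic process this raises:
    # AssertionError: daemonic processes are not allowed to have children
    mp.Process(target=print, args=("child",)).start()


if __name__ == "__main__":
    p = mp.Process(target=spawn_child, daemon=True)
    p.start()
    p.join()
```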
```diff
@@ -209,3 +293,4 @@ class AscendWorkerProc(WorkerProc):


 vllm.v1.executor.multiproc_executor.MultiprocExecutor = AscendMultiprocExecutor
+vllm.v1.executor.multiproc_executor.WorkerProc = AscendWorkerProc
```
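Finally, the patch works by replacing classes on the vLLM module object. A small standalone illustration of the pattern (module and class names are made up): lookups that go through the module attribute pick up the subclass, while references captured directly beforehand do not, which is consistent with the explicit `target=AscendWorkerProc.worker_main` change above.

```python
# Standalone illustration of module-attribute monkey-patching; names are made up.
import types

mod = types.ModuleType("engine")


class Worker:
    @staticmethod
    def main() -> str:
        return "base"


mod.Worker = Worker
direct_ref = mod.Worker          # reference captured before the patch


class AscendWorker(Worker):
    @staticmethod
    def main() -> str:
        return "ascend"


mod.Worker = AscendWorker        # the monkey-patch

print(mod.Worker.main())         # "ascend": lookups through the module see the patch
print(direct_ref.main())         # "base":   earlier direct references do not
```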