[bugfix]fix extra npu context in device 0 (#8041)
### What this PR does / why we need it?

When we launch a PD-disaggregated deployment and send requests, an additional process appears on NPU 0. This happens because when a thread holds a primary CUDA context, a child thread it creates does not automatically inherit that context; see https://forums.developer.nvidia.com/t/when-a-thread-has-a-primary-cuda-context-does-the-child-thread-it-creates-automatically-inherit-the-cuda-context/362810. vLLM fixed this issue in [pr-37449](https://github.com/vllm-project/vllm/pull/37449), but version 0.18.0 does not include the fix, so we need to patch it here.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

---------

Signed-off-by: zouyida <zouyida@huawei.com>
Co-authored-by: zouyida <zouyida@huawei.com>
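For background (not part of this PR's diff), here is a minimal Python sketch of the failure mode, assuming a machine with `torch_npu` installed and at least two NPUs: any NPU call made in a child thread before `torch.npu.set_device()` lazily initializes a context on device 0, which is what shows up as the extra process.

```python
import threading

import torch
import torch_npu  # noqa: F401  -- assumption: torch_npu is installed and registers the "npu" backend

def worker(rank: int) -> None:
    # Without this explicit set_device, the first NPU call in this thread
    # would lazily create a context on device 0, appearing there as an
    # extra process even though this worker targets another device.
    torch.npu.set_device(rank)
    x = torch.ones(4, device=f"npu:{rank}")
    print(rank, x.sum().item())

threads = [threading.Thread(target=worker, args=(r,)) for r in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```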
```diff
@@ -14,8 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import os

 import vllm_ascend.patch.platform.patch_distributed  # noqa
 import vllm_ascend.patch.platform.patch_fusion_matcher_compat_ops  # noqa
 import vllm_ascend.patch.platform.patch_kv_cache_interface  # noqa
@@ -27,13 +25,11 @@ if not is_310p():
 else:
     import vllm_ascend.patch.platform.patch_mamba_config_310  # noqa
 import vllm_ascend.patch.platform.patch_minimax_m2_config  # noqa
 import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
 import vllm_ascend.patch.platform.patch_torch_accelerator  # noqa
 import vllm_ascend.patch.platform.patch_minimax_usage_accounting  # noqa
 import vllm_ascend.patch.platform.patch_glm_tool_call_parser  # noqa

 if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
     import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa

 if envs.VLLM_ASCEND_BALANCE_SCHEDULING:
     import vllm_ascend.patch.platform.patch_balance_schedule  # noqa
```
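The patch modules above are imported purely for their side effects, hence the bare `import ... # noqa` lines. As a hedged sketch of that pattern (the target attribute below is an assumption, not the actual vllm_ascend patch body), a module such as `patch_multiproc_executor` would typically rebind a vLLM symbol at import time:

```python
# Hypothetical sketch of an import-time monkey patch; the real
# patch_multiproc_executor may target different symbols.
from vllm.v1.executor import multiproc_executor  # assumption: vLLM v1 module path

_orig_init = multiproc_executor.MultiprocExecutor._init_executor  # assumed patch target

def _patched_init_executor(self, *args, **kwargs):
    # A fix in the spirit of this PR would bind the worker's own NPU here,
    # before any child thread can lazily create a context on device 0.
    return _orig_init(self, *args, **kwargs)

multiproc_executor.MultiprocExecutor._init_executor = _patched_init_executor
```

Importing such a module once at startup is enough; the rebinding persists for the lifetime of the process, which is why the `__init__.py` only needs the one gated import.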