[Bugfix] fix dp parallel + tp > 1 offline inference port conflict (#4539)

### What this PR does / why we need it? Fix the port conflict that occurs in offline inference when data parallelism (DP) is combined with TP > 1. PR that introduced the issue: https://github.com/vllm-project/vllm-ascend/pull/429 - vLLM version: v0.11.2 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2 --------- Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-11-29 18:37:11 +08:00
parent 1874265074
commit a3041cd78c
3 changed files with 53 additions and 22 deletions
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -269,6 +269,7 @@ jobs:
            tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC 
            # tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP \
            # tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_W8A8_WITH_EP
          pytest -sv tests/e2e/multicard/test_data_parallel_tp2.py
      - name: Install Ascend toolkit & triton_ascend (for Qwen3-Next-80B-A3B-Instruct)
        shell: bash -l {0}
--- a/tests/e2e/multicard/test_data_parallel_tp2.py
+++ b/tests/e2e/multicard/test_data_parallel_tp2.py
@@ -0,0 +1,52 @@
 """
 Run `pytest tests/e2e/multicard/test_data_parallel_tp2.py`.
 """
 import os
 import subprocess
 import sys
 from unittest.mock import patch
 import pytest
 # Model identifiers the data-parallel test is parametrized over (one test case per entry).
 MODELS = ["Qwen/Qwen3-0.6B"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3"})
def test_data_parallel_inference(model, max_tokens):
    """Launch the offline data-parallel example with DP=2 / TP=2 and check its log.

    The example script is started as a child process using the current
    interpreter. While the test runs, ``ASCEND_RT_VISIBLE_DEVICES`` is patched
    to ``"0,1,2,3"`` and the child inherits a copy of that environment.

    Args:
        model: Model identifier forwarded to the script via ``--model``.
        max_tokens: Supplied by the parametrization; not used by the test body.
    """
    example_script = "examples/offline_data_parallel.py"

    # Flag/value pairs in the order they must appear on the command line.
    valued_flags = (
        ("--model", model),
        ("--dp-size", "2"),
        ("--tp-size", "2"),
        ("--node-size", "1"),
        ("--node-rank", "0"),
    )
    argv = [sys.executable, example_script]
    for flag, value in valued_flags:
        argv += [flag, value]
    argv.append("--trust-remote-code")

    # Snapshot taken inside the patch.dict scope, so it carries the device list.
    child_env = dict(os.environ)

    print(f"Running subprocess: {' '.join(argv)}")
    # stderr is merged into stdout so one stream holds the whole log; the
    # call raises subprocess.TimeoutExpired after 600 seconds.
    completed = subprocess.run(
        argv,
        env=child_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        timeout=600,
    )

    combined_output = completed.stdout.decode()
    print(combined_output)

    # Both DP ranks must report their workload and some text must be generated.
    expected_markers = (
        "DP rank 0 needs to process",
        "DP rank 1 needs to process",
        "Generated text:",
    )
    for marker in expected_markers:
        assert marker in combined_output

    assert completed.returncode == 0
--- a/vllm_ascend/patch/platform/patch_distributed.py
+++ b/vllm_ascend/patch/platform/patch_distributed.py
@@ -18,32 +18,10 @@
 # This file is a part of the vllm-ascend project.
 import torch
 import vllm.envs as envs_vllm
 from vllm.config import ParallelConfig
 from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
 def parallel_config_get_dp_port(self) -> int:
     """Return the port to use for the next data-parallel process group.

     Process groups related to data parallelism may be initialized in
     several processes (e.g. both the worker and the engine), so the
     stored ``data_parallel_master_port`` is advanced by one on every call
     to avoid handing the same port out twice.

     If ``envs_vllm.VLLM_DP_MASTER_PORT`` is truthy it takes precedence and
     is returned as-is; the stored counter is still advanced in that case,
     so every call returns the same environment-provided value.

     Returns:
         The environment-provided port when set, otherwise the port that
         was stored on ``self`` before this call incremented it.
     """
     # Remember the current port, then bump the counter for the next caller.
     reserved_port = self.data_parallel_master_port
     self.data_parallel_master_port = reserved_port + 1

     # NOTE: Get port from envs directly when using torchrun
     env_port = envs_vllm.VLLM_DP_MASTER_PORT
     if env_port:
         return env_port
     return reserved_port
 # Monkey-patch: bind parallel_config_get_dp_port as ParallelConfig.get_next_dp_init_port,
 # overriding any attribute of that name already defined on the class.
 ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port
 class NullHandle:
    def __init__(self):