diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml index f9ecbadd..db1a6414 100644 --- a/.github/workflows/_e2e_nightly_multi_node.yaml +++ b/.github/workflows/_e2e_nightly_multi_node.yaml @@ -60,13 +60,13 @@ defaults: # only cancel in-progress runs of the same workflow # and ignore the lint / 8 cards test type concurrency: - group: ascend-nightly-${{ github.ref }}-${{ inputs.config_file_path }} + group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.config_file_path }} cancel-in-progress: true jobs: e2e: name: ${{ inputs.config_file_path }} - # This is a runner with no NPU for k8s controller + # This is the runner with no NPU for k8s controller runs-on: ${{ inputs.runner }} container: image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 @@ -75,7 +75,7 @@ jobs: KUBECTL: /root/.cache/.kube/kubectl NAMESPACE: vllm-project LEADER_POD: vllm-0 - RESULT_FILE: /root/.cache/tests/ret/test_result.txt + RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }} steps: - name: Install system denpendencies run: | @@ -84,7 +84,7 @@ jobs: pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple pip install jinja2-cli - apt-get update -y && apt-get install -y git curl + #apt-get update -y && apt-get install -y git curl - name: Install kubectl run: | @@ -117,8 +117,8 @@ jobs: run: | # pre clear the crd resources created by lws kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found - - name: Launch cluster + id: launcher run: | set -e @@ -130,6 +130,8 @@ jobs: vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}" vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}" result_file_path="$RESULT_FILE" + fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}" + echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV required_params=("size" "replicas" "image" "config_file_path") for param in "${required_params[@]}"; do @@ -155,8 +157,7 @@ jobs: -D 
vllm_ascend_ref="$vllm_ascend_ref" \ -D result_file_path="$result_file_path" \ -D npu_per_node="$npu_per_node" \ - -D controller_name="$HOSTNAME" \ - -D kb_secret=${{ secrets.KUBECONFIG_B64 }} \ + -D fail_tag="$fail_tag" \ --outfile lws.yaml kubectl apply -f ./lws.yaml @@ -180,7 +181,14 @@ jobs: - name: Stream logs run: | - kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" + set -euo pipefail + echo "Looking for logs containing: $FAIL_TAG" + kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while read -r line; do + echo "$line" + if echo "$line" | grep -q "$FAIL_TAG"; then + exit 1 # workflow step failed + fi + done - name: Post process if: always() diff --git a/.github/workflows/_e2e_nightly_single_node.yaml b/.github/workflows/_e2e_nightly_single_node.yaml index 8e3224e2..e77f5623 100644 --- a/.github/workflows/_e2e_nightly_single_node.yaml +++ b/.github/workflows/_e2e_nightly_single_node.yaml @@ -44,7 +44,7 @@ defaults: # only cancel in-progress runs of the same workflow # and ignore the lint / 1 card / 4 cards test type concurrency: - group: ascend-nightly-${{ github.ref }}-${{ inputs.tests }} + group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.tests }} cancel-in-progress: true jobs: diff --git a/.github/workflows/vllm_ascend_test_nightly_a2.yaml b/.github/workflows/vllm_ascend_test_nightly_a2.yaml index 83fbb4d6..0842e7e1 100644 --- a/.github/workflows/vllm_ascend_test_nightly_a2.yaml +++ b/.github/workflows/vllm_ascend_test_nightly_a2.yaml @@ -42,6 +42,7 @@ concurrency: jobs: single-node-tests: + name: single-node if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' strategy: fail-fast: false @@ -63,6 +64,7 @@ jobs: tests: ${{ matrix.test_config.tests }} multi-node-tests: + name: multi-node needs: single-node-tests if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') strategy: @@ -71,10 +73,10 @@ jobs: matrix: test_config: - name: multi-node-deepseek-dp - 
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml + config_file_path: DeepSeek-R1-W8A8-A2.yaml size: 2 - name: multi-node-deepseek-dp-torchair - config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml + config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml size: 2 uses: ./.github/workflows/_e2e_nightly_multi_node.yaml with: diff --git a/.github/workflows/vllm_ascend_test_nightly_a3.yaml b/.github/workflows/vllm_ascend_test_nightly_a3.yaml index 7254f9cf..2cd6d817 100644 --- a/.github/workflows/vllm_ascend_test_nightly_a3.yaml +++ b/.github/workflows/vllm_ascend_test_nightly_a3.yaml @@ -42,6 +42,7 @@ concurrency: jobs: single-node-tests: + name: single-node if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' strategy: fail-fast: false @@ -85,6 +86,7 @@ jobs: tests: ${{ matrix.test_config.tests }} multi-node-tests: + name: multi-node needs: single-node-tests if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') strategy: @@ -93,19 +95,19 @@ jobs: matrix: test_config: - name: multi-node-deepseek-pd - config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml + config_file_path: DeepSeek-V3.yaml size: 2 - name: multi-node-qwen3-dp - config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml + config_file_path: Qwen3-235B-A3B.yaml size: 2 - name: multi-node-dpsk-4node-pd - config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml + config_file_path: DeepSeek-R1-W8A8.yaml size: 4 - name: multi-node-qwenw8a8-2node - config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml + config_file_path: Qwen3-235B-W8A8.yaml size: 2 - name: multi-node-glm-2node - config_file_path: tests/e2e/nightly/multi_node/config/models/GLM-4_5.yaml + config_file_path: GLM-4_5.yaml size: 2 uses: ./.github/workflows/_e2e_nightly_multi_node.yaml with: @@ -117,12 +119,3 @@ 
jobs: config_file_path: ${{ matrix.test_config.config_file_path }} secrets: KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }} - - clear_resources: - needs: multi-node-tests - if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') - uses: ./.github/workflows/_kill_lws_resources.yaml - with: - runner: linux-aarch64-a3-0 - secrets: - KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }} diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 64ac88cc..dc68bd12 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -163,10 +163,11 @@ class RemoteOpenAIServer: self.proxy_port = proxy_port self._start_server(model, vllm_serve_args, env_dict) - max_wait_seconds = max_wait_seconds or 7200 + max_wait_seconds = max_wait_seconds or 1800 if self.disaggregated_prefill: assert proxy_port is not None, "for disaggregated_prefill, proxy port must be provided" - self._wait_for_server_pd(proxy_port=proxy_port) + self._wait_for_server_pd(proxy_port=proxy_port, + timeout=max_wait_seconds) else: self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds) @@ -186,7 +187,7 @@ class RemoteOpenAIServer: """Subclasses override this method to customize process polling""" return self.proc.poll() - def hang_until_terminated(self) -> None: + def hang_until_terminated(self, url) -> None: """ Wait until the server process terminates. 
This is for headless mode, where the api server @@ -196,7 +197,7 @@ class RemoteOpenAIServer: try: while True: try: - resp = client.get(self.url_for("health"), timeout=5) + resp = client.get(url, timeout=5) if resp.status_code != 200: break time.sleep(5) @@ -206,7 +207,7 @@ class RemoteOpenAIServer: if isinstance(client, httpx.Client): client.close() - def _wait_for_server_pd(self, proxy_port: int): + def _wait_for_server_pd(self, proxy_port: int, timeout: float): # Wait for all api_server nodes ready assert self.nodes_info is not None, "cluster info must be provided" for node_info in self.nodes_info: @@ -214,12 +215,12 @@ class RemoteOpenAIServer: continue url_health = f"http://{node_info.ip}:{node_info.server_port}/health" - self._wait_for_server(url=url_health, timeout=7200) + self._wait_for_server(url=url_health, timeout=timeout) # Wait for proxy ready master_node = self.nodes_info[0] url_proxy = f"http://{master_node.ip}:{proxy_port}/healthcheck" - self._wait_for_server(url=url_proxy, timeout=7200) + self._wait_for_server(url=url_proxy, timeout=timeout) def _wait_for_server(self, *, url: str, timeout: float): # run health check diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeep-R1-W8A8-A2-torchair.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml similarity index 100% rename from tests/e2e/nightly/multi_node/config/models/DeepSeep-R1-W8A8-A2-torchair.yaml rename to tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2-torchair.yaml diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeep-R1-W8A8-A2.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml similarity index 100% rename from tests/e2e/nightly/multi_node/config/models/DeepSeep-R1-W8A8-A2.yaml rename to tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml index 
c5b34c9d..8c00803c 100644 --- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml +++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml @@ -97,3 +97,12 @@ deployment: } }' benchmarks: + acc: + case_type: accuracy + dataset_path: vllm-ascend/gsm8k-lite + request_conf: vllm_api_general_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt + max_out_len: 4096 + batch_size: 512 + baseline: 95 + threshold: 5 diff --git a/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml b/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml index b72bb542..7fde3392 100644 --- a/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml +++ b/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml @@ -47,3 +47,4 @@ deployment: --no-enable-prefix-caching --gpu-memory-utilization 0.9 benchmarks: + diff --git a/tests/e2e/nightly/multi_node/config/multi_node_config.py b/tests/e2e/nightly/multi_node/config/multi_node_config.py index 3d540d84..9bde4581 100644 --- a/tests/e2e/nightly/multi_node/config/multi_node_config.py +++ b/tests/e2e/nightly/multi_node/config/multi_node_config.py @@ -17,6 +17,7 @@ setup_logger() logger = logging.getLogger(__name__) DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py" DISAGGEGATED_PREFILL_PORT = 5333 +CONFIG_BASE_PATH = "tests/e2e/nightly/multi_node/config/models/" @dataclass @@ -187,9 +188,8 @@ class MultiNodeConfig: @classmethod def from_yaml(cls, yaml_path: Optional[str] = None): if not yaml_path: - yaml_path = os.getenv( - "CONFIG_YAML_PATH", - "tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml") + yaml_path = os.getenv("CONFIG_YAML_PATH", "DeepSeek-V3.yaml") + yaml_path = os.path.join(CONFIG_BASE_PATH, yaml_path) with open(yaml_path, 'r') as file: config_data = yaml.safe_load(file) test_name = config_data.get("test_name", "default_test") @@ -255,6 +255,7 @@ class MultiNodeConfig: ranktable_path = 
self.disaggregated_prefill.get("ranktable_path") assert ranktable_gen_path is not None and ranktable_path is not None if os.path.exists(str(ranktable_path)): + logger.info("ranktable has already generated") return local_host = self.cur_ip @@ -286,6 +287,8 @@ class MultiNodeConfig: assert self.nic_name is not None env["GLOO_SOCKET_IFNAME"] = self.nic_name + logger.info( + f"Generating ranktable from command: {' '.join(map(str, cmd))}") subprocess.run(cmd, env=env, check=True) assert os.path.exists( str(ranktable_path)), "failed generate ranktable.json" diff --git a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 index ba12baea..f619b597 100644 --- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 +++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 @@ -18,7 +18,7 @@ spec: image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }} env: - name: CONFIG_YAML_PATH - value: {{ config_file_path | default("tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml") }} + value: {{ config_file_path | default("DeepSeek-V3.yaml") }} - name: WORKSPACE value: "/root/workspace" # Set vLLM version and vLLM-Ascend version here, once there is a new release, update here. 
@@ -29,11 +29,9 @@ spec: - name: VLLM_ASCEND_REMOTE_URL value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }} - name: RESULT_FILE_PATH - value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }} - - name: CONTROLLER_NAME - value: {{ controller_name | default("placeholder") }} - - name: SECRET - value: {{ kb_secret | default("placeholder") }} + value: {{ result_file_path | default("/root/.cache/tests/ret") }} + - name: FAIL_TAG + value: {{ fail_tag | default("FAIL_TAG") }} command: - sh - -c @@ -80,7 +78,7 @@ spec: image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }} env: - name: CONFIG_YAML_PATH - value: {{ config_file_path | default("tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml") }} + value: {{ config_file_path | default("DeepSeek-V3.yaml") }} - name: WORKSPACE value: "/root/workspace" # Set vLLM version and vLLM-Ascend version here, once there is a new release, update here. @@ -92,6 +90,8 @@ spec: value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }} - name: RESULT_FILE_PATH value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }} + - name: FAIL_TAG + value: {{ fail_tag | default("FAIL_TAG") }} command: - sh - -c diff --git a/tests/e2e/nightly/multi_node/scripts/run.sh b/tests/e2e/nightly/multi_node/scripts/run.sh index c76bb20a..78d829cd 100644 --- a/tests/e2e/nightly/multi_node/scripts/run.sh +++ b/tests/e2e/nightly/multi_node/scripts/run.sh @@ -20,6 +20,11 @@ print_section() { echo -e "\n${BLUE}=== $1 ===${NC}" } +print_failure() { + echo -e "${RED}${FAIL_TAG} ✗ ERROR: $1${NC}" + exit 1 +} + # Function to print success messages print_success() { echo -e "${GREEN}✓ $1${NC}" @@ -161,32 +166,24 @@ kill_npu_processes() { sleep 4 } -run_tests() { +run_tests_with_log() { set +e kill_npu_processes - pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py - ret=$? 
+  BASENAME=$(basename "$CONFIG_YAML_PATH" .yaml)
+  # each worker should have log file
+  LOG_FILE="${RESULT_FILE_PATH}/${BASENAME}_worker_${LWS_WORKER_INDEX}.log"
+  mkdir -p ${RESULT_FILE_PATH}
+  pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py 2>&1 | tee $LOG_FILE
+  ret=${PIPESTATUS[0]}
+  set -e
   if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
     if [ $ret -eq 0 ]; then
       print_success "All tests passed!"
     else
-      print_error "Some tests failed!"
-      kubectl delete pod $CONTROLLER_NAME -n vllm-project
+      mv "$LOG_FILE" "${RESULT_FILE_PATH}/error_${BASENAME}_worker_${LWS_WORKER_INDEX}.log"
+      print_failure "Some tests failed!"
     fi
   fi
-  set -e
-}
-
-install_kubectl() {
-  arch=$(uname -m)
-  KUBECTL=/root/.cache/.kube/kubectl
-  if echo "$arch" | grep -qiE "arm|aarch64"; then
-    echo "Detected ARM architecture: $arch"
-    KUBECTL="$KUBECTL"_arm
-  fi
-  install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
-  echo "$SECRET" | base64 -d > /tmp/kubeconfig
-  export KUBECONFIG=/tmp/kubeconfig
 }
 
 main() {
@@ -194,7 +191,6 @@ main() {
   check_and_config
   checkout_src
   install_sys_dependencies
-  install_kubectl
   install_vllm
   install_ais_bench
   # to speed up mooncake build process, install Go here
@@ -203,7 +199,7 @@ .
$SRC_DIR/vllm-ascend/tests/e2e/nightly/multi_node/scripts/build_mooncake.sh \ pooling_async_memecpy_v1 9d96b2e1dd76cc601d76b1b4c5f6e04605cd81d3 cd "$WORKSPACE/source_code/vllm-ascend" - run_tests + run_tests_with_log } main "$@" diff --git a/tests/e2e/nightly/multi_node/test_multi_node.py b/tests/e2e/nightly/multi_node/test_multi_node.py index 19bdf64a..2b23e755 100644 --- a/tests/e2e/nightly/multi_node/test_multi_node.py +++ b/tests/e2e/nightly/multi_node/test_multi_node.py @@ -118,6 +118,11 @@ async def test_multi_node() -> None: port = proxy_port if disaggregated_prefill else server_port # aisbench test aisbench_cases = [acc_cmd, perf_cmd] - run_aisbench_cases(local_model_path, port, aisbench_cases) + run_aisbench_cases(local_model_path, + port, + aisbench_cases, + host_ip=config.cluster_ips[0]) else: - remote_server.hang_until_terminated() + # for the nodes except master, should hang until the task complete + master_url = f"http://{config.cluster_ips[0]}:{server_port}/health" + remote_server.hang_until_terminated(master_url) diff --git a/tools/aisbench.py b/tools/aisbench.py index 5fabc465..14f1468e 100644 --- a/tools/aisbench.py +++ b/tools/aisbench.py @@ -68,6 +68,7 @@ class AisbenchRunner: model: str, port: int, aisbench_config: dict, + host_ip: str = "localhost", verify=True): self.model = model self.dataset_path = maybe_download_from_modelscope( @@ -76,6 +77,7 @@ class AisbenchRunner: assert self.dataset_path is not None and self.model_path is not None, \ f"Failed to download dataset or model: dataset={self.dataset_path}, model={self.model_path}" self.port = port + self.host_ip = host_ip self.task_type = aisbench_config["case_type"] self.request_conf = aisbench_config["request_conf"] self.dataset_conf = aisbench_config.get("dataset_conf") @@ -131,6 +133,7 @@ class AisbenchRunner: content = f.read() content = re.sub(r'model=.*', f'model="{self.model}",', content) content = re.sub(r'host_port.*', f'host_port = {self.port},', content) + content = 
re.sub(r'host_ip.*', f'host_ip = "{self.host_ip}",', content) content = re.sub(r'max_out_len.*', f'max_out_len = {self.max_out_len},', content) content = re.sub(r'batch_size.*', f'batch_size = {self.batch_size},', @@ -238,14 +241,21 @@ class AisbenchRunner: assert self.baseline - self.threshold <= acc_value <= self.baseline + self.threshold, f"Accuracy verification failed. The accuracy of {self.dataset_path} is {acc_value}, which is not within {self.threshold} relative to baseline {self.baseline}." -def run_aisbench_cases(model, port, aisbench_cases, server_args=""): +def run_aisbench_cases(model, + port, + aisbench_cases, + server_args="", + host_ip="localhost"): aisbench_results = [] aisbench_errors = [] for aisbench_case in aisbench_cases: if not aisbench_case: continue try: - with AisbenchRunner(model, port, aisbench_case) as aisbench: + with AisbenchRunner(model=model, + port=port, + host_ip=host_ip, + aisbench_config=aisbench_case) as aisbench: aisbench_results.append(aisbench.result) except Exception as e: aisbench_results.append("")