diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml index d9bbcf63..f9ecbadd 100644 --- a/.github/workflows/_e2e_nightly_multi_node.yaml +++ b/.github/workflows/_e2e_nightly_multi_node.yaml @@ -65,6 +65,7 @@ concurrency: jobs: e2e: + name: ${{ inputs.config_file_path }} # This is a runner with no NPU for k8s controller runs-on: ${{ inputs.runner }} container: @@ -112,9 +113,10 @@ jobs: # prepare for lws entrypoint scripts install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh - - name: Clear result ret + - name: Clear resources run: | - rm -f $RESULT_FILE + # Pre-clear the LeaderWorkerSet (LWS) custom resource before launching the cluster + kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found - name: Launch cluster run: | @@ -153,6 +155,8 @@ jobs: -D vllm_ascend_ref="$vllm_ascend_ref" \ -D result_file_path="$result_file_path" \ -D npu_per_node="$npu_per_node" \ + -D controller_name="$HOSTNAME" \ + -D kb_secret="${{ secrets.KUBECONFIG_B64 }}" \ --outfile lws.yaml kubectl apply -f ./lws.yaml @@ -178,29 +182,6 @@ jobs: run: | kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" - - name: Determine is success - run: | - TIMEOUT=300 - ELAPSED=0 - while [ !
-f "$RESULT_FILE" ]; do - sleep 5 - ELAPSED=$((ELAPSED + 5)) - if [ $ELAPSED -ge $TIMEOUT ]; then - echo "Timeout waiting for test result file" - exit 1 - fi - done - - RET=$(cat "$RESULT_FILE") - echo "Test result: $RET" - - if [ "$RET" -ne 0 ]; then - echo "Test failed" - exit 1 - else - echo "Test succeeded" - fi - - name: Post process if: always() run: | diff --git a/.github/workflows/_e2e_nightly_single_node.yaml b/.github/workflows/_e2e_nightly_single_node.yaml index 9aab8286..8e3224e2 100644 --- a/.github/workflows/_e2e_nightly_single_node.yaml +++ b/.github/workflows/_e2e_nightly_single_node.yaml @@ -49,12 +49,10 @@ concurrency: jobs: e2e-nightly: - name: e2e-nightly + name: ${{ inputs.tests }} runs-on: ${{ inputs.runner }} container: image: ${{ inputs.image }} - env: - VLLM_USE_MODELSCOPE: True steps: - name: Check npu and CANN info run: | @@ -111,5 +109,4 @@ jobs: VLLM_USE_MODELSCOPE: True VLLM_CI_RUNNER: ${{ inputs.runner }} run: | - # TODO: enable more tests pytest -sv ${{ inputs.tests }} diff --git a/.github/workflows/_kill_lws_resources.yaml b/.github/workflows/_kill_lws_resources.yaml new file mode 100644 index 00000000..dd6549bd --- /dev/null +++ b/.github/workflows/_kill_lws_resources.yaml @@ -0,0 +1,57 @@ +name: 'resource clear' + +on: + workflow_call: + inputs: + runner: + required: false + type: string + default: linux-aarch64-a3-0 + secrets: + KUBECONFIG_B64: + required: true + + +# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly +# declared as "shell: bash -el {0}" on steps that need to be properly activated. +# It's used to activate ascend-toolkit environment variables. 
+defaults: + run: + shell: bash -el {0} + +jobs: + resource_clear: + # This is a runner with no NPU for k8s controller + runs-on: ${{ inputs.runner }} + container: + image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + env: + KUBECONFIG: /tmp/kubeconfig + KUBECTL: /root/.cache/.kube/kubectl + NAMESPACE: vllm-project + LEADER_POD: vllm-0 + RESULT_FILE: /root/.cache/tests/ret/test_result.txt + steps: + - name: Install kubectl + run: | + # Install kubectl + arch=$(uname -m) + + if echo "$arch" | grep -qiE "arm|aarch64"; then + echo "Detected ARM architecture: $arch" + KUBECTL="$KUBECTL"_arm + fi + install -o root -g root -m 0755 "$KUBECTL" /usr/local/bin/kubectl + + # Verify kubectl installation + kubectl version --client=true + + - name: Decode kubeconfig from secrets + run: | + # Decode and save kubeconfig + echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > "$KUBECONFIG" + + - name: Clear LWS resources + if: always() + run: | + kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found diff --git a/.github/workflows/vllm_ascend_test_nightly_a3.yaml b/.github/workflows/vllm_ascend_test_nightly_a3.yaml index 7a34b234..7254f9cf 100644 --- a/.github/workflows/vllm_ascend_test_nightly_a3.yaml +++ b/.github/workflows/vllm_ascend_test_nightly_a3.yaml @@ -101,6 +101,12 @@ jobs: - name: multi-node-dpsk-4node-pd config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml size: 4 + - name: multi-node-qwenw8a8-2node + config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml + size: 2 + - name: multi-node-glm-2node + config_file_path: tests/e2e/nightly/multi_node/config/models/GLM-4_5.yaml + size: 2 uses: ./.github/workflows/_e2e_nightly_multi_node.yaml with: soc_version: a3 @@ -111,3 +117,12 @@ jobs: config_file_path: ${{ matrix.test_config.config_file_path }} secrets: KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }} + + clear_resources: + needs: multi-node-tests + if: always() && (github.event_name ==
'schedule' || github.event_name == 'workflow_dispatch') + uses: ./.github/workflows/_kill_lws_resources.yaml + with: + runner: linux-aarch64-a3-0 + secrets: + KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }} diff --git a/tests/e2e/nightly/multi_node/config/multi_node_config.py b/tests/e2e/nightly/multi_node/config/multi_node_config.py index 620ba39c..3d540d84 100644 --- a/tests/e2e/nightly/multi_node/config/multi_node_config.py +++ b/tests/e2e/nightly/multi_node/config/multi_node_config.py @@ -15,7 +15,7 @@ from tests.e2e.nightly.multi_node.config.utils import (get_avaliable_port, setup_logger() logger = logging.getLogger(__name__) -DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py" +DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py" DISAGGEGATED_PREFILL_PORT = 5333 diff --git a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 index e6b5e98d..ba12baea 100644 --- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 +++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 @@ -7,7 +7,7 @@ spec: replicas: {{ replicas | default(1) }} leaderWorkerTemplate: size: {{ size | default(2) }} - restartPolicy: RecreateGroupOnPodRestart + restartPolicy: None leaderTemplate: metadata: labels: @@ -30,6 +30,10 @@ spec: value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }} - name: RESULT_FILE_PATH value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }} + - name: CONTROLLER_NAME + value: {{ controller_name | default("placeholder") }} + - name: SECRET + value: {{ kb_secret | default("placeholder") }} command: - sh - -c diff --git a/tests/e2e/nightly/multi_node/scripts/run.sh b/tests/e2e/nightly/multi_node/scripts/run.sh index 38cbe4ef..c76bb20a 100644 --- a/tests/e2e/nightly/multi_node/scripts/run.sh +++ 
b/tests/e2e/nightly/multi_node/scripts/run.sh @@ -162,14 +162,31 @@ kill_npu_processes() { } run_tests() { - pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py + set +e kill_npu_processes + pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py ret=$? if [ "$LWS_WORKER_INDEX" -eq 0 ]; then - mkdir -p "$(dirname "$RESULT_FILE_PATH")" - echo $ret > "$RESULT_FILE_PATH" + if [ "$ret" -eq 0 ]; then + print_success "All tests passed!" + else + print_error "Some tests failed!" + kubectl delete pod "$CONTROLLER_NAME" -n vllm-project + fi fi - return $ret + set -e +} + +install_kubectl() { + arch=$(uname -m) + KUBECTL=/root/.cache/.kube/kubectl + if echo "$arch" | grep -qiE "arm|aarch64"; then + echo "Detected ARM architecture: $arch" + KUBECTL="$KUBECTL"_arm + fi + install -o root -g root -m 0755 "$KUBECTL" /usr/local/bin/kubectl + echo "$SECRET" | base64 -d > /tmp/kubeconfig + export KUBECONFIG=/tmp/kubeconfig } main() { @@ -177,6 +194,7 @@ main() { check_and_config checkout_src install_sys_dependencies + install_kubectl install_vllm install_ais_bench # to speed up mooncake build process, install Go here diff --git a/tools/aisbench.py b/tools/aisbench.py index e8c8159c..5fabc465 100644 --- a/tools/aisbench.py +++ b/tools/aisbench.py @@ -284,12 +284,12 @@ def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None): def maybe_download_from_modelscope( model: str, - repo_type: str | None = None, + repo_type: str = "model", revision: str | None = None, download_dir: str | None = None, ignore_patterns: str | list[str] | None = None, allow_patterns: list[str] | str | None = None, -) -> str | None: +) -> str: """ Download model/dataset from ModelScope hub. Returns the path to the downloaded model, or None if the model is not @@ -310,5 +310,4 @@ def maybe_download_from_modelscope( ) else: model_path = model - return model_path - return None + return model_path