[CI] Optimize nightly CI (#3858)
### What this PR does / why we need it?
This patch optimizes the nightly CI:
1. Fix the ais_bench error caused by getting a `None` repo_type
2. Fix the kubectl installation error on A2 with the ARM architecture
3. Fix the multi_node CI being unable to determine whether the job
succeeded
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0rc3
- vLLM main:
83f478bb19
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
31
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
31
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
@@ -65,6 +65,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
e2e:
|
||||
name: ${{ inputs.config_file_path }}
|
||||
# This is a runner with no NPU for k8s controller
|
||||
runs-on: ${{ inputs.runner }}
|
||||
container:
|
||||
@@ -112,9 +113,10 @@ jobs:
|
||||
# prepare for lws entrypoint scripts
|
||||
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
|
||||
|
||||
- name: Clear result ret
|
||||
- name: Clear resources
|
||||
run: |
|
||||
rm -f $RESULT_FILE
|
||||
# pre clear the crd resources created by lws
|
||||
kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
|
||||
|
||||
- name: Launch cluster
|
||||
run: |
|
||||
@@ -153,6 +155,8 @@ jobs:
|
||||
-D vllm_ascend_ref="$vllm_ascend_ref" \
|
||||
-D result_file_path="$result_file_path" \
|
||||
-D npu_per_node="$npu_per_node" \
|
||||
-D controller_name="$HOSTNAME" \
|
||||
-D kb_secret=${{ secrets.KUBECONFIG_B64 }} \
|
||||
--outfile lws.yaml
|
||||
|
||||
kubectl apply -f ./lws.yaml
|
||||
@@ -178,29 +182,6 @@ jobs:
|
||||
run: |
|
||||
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE"
|
||||
|
||||
- name: Determine is success
|
||||
run: |
|
||||
TIMEOUT=300
|
||||
ELAPSED=0
|
||||
while [ ! -f "$RESULT_FILE" ]; do
|
||||
sleep 5
|
||||
ELAPSED=$((ELAPSED + 5))
|
||||
if [ $ELAPSED -ge $TIMEOUT ]; then
|
||||
echo "Timeout waiting for test result file"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
RET=$(cat "$RESULT_FILE")
|
||||
echo "Test result: $RET"
|
||||
|
||||
if [ "$RET" -ne 0 ]; then
|
||||
echo "Test failed"
|
||||
exit 1
|
||||
else
|
||||
echo "Test succeeded"
|
||||
fi
|
||||
|
||||
- name: Post process
|
||||
if: always()
|
||||
run: |
|
||||
|
||||
Reference in New Issue
Block a user