Files
xc-llm-ascend/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2
zhangxinyuehfad 1e4017e3fa [CI] support nightly ci for per pr by labels (#6483)
### What this PR does / why we need it?

This PR refactors the nightly CI workflows (A2 and A3) to support
running tests against a specific PR's code, in addition to the existing
scheduled/dispatch runs using pre-built images.

#### Motivation:
Previously, nightly tests could only be triggered by schedule or
workflow_dispatch, always using the pre-built nightly image. This change
allows developers to trigger nightly tests against their own PR's source
code, enabling early validation without waiting for a nightly build.

#### Changes
Trigger logic (parse-trigger job)

A new parse-trigger job is introduced in both
schedule_nightly_test_a2.yaml and schedule_nightly_test_a3.yaml to
centralize trigger evaluation:

`schedule / workflow_dispatch`: runs all tests with the pre-built image
(existing behavior preserved)
`pull_request (labeled + synchronize)`: runs only when:The PR has the
nightly-test label, and /nightly [test-names] comment exists (latest one
wins)

1. /nightly or /nightly all — runs all tests
2. /nightly test1 test2 — runs only named tests (comma-wrapped for exact
matching)

#### How to trigger
1. Add the nightly-test label to your PR
2. Comment /nightly (all tests) or /nightly test1 test2 (specific tests)
4. Re-triggering: add another /nightly comment and push a new commit
(synchronize event)

### Does this PR introduce _any_ user-facing change?
None

### How was this patch tested?

- vLLM version: v0.14.1
- vLLM main:
dc917cceb8

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2026-03-05 16:46:37 +08:00

143 lines
4.8 KiB
Django/Jinja

apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: vllm
namespace: vllm-project
spec:
replicas: {{ replicas | default(1) }}
leaderWorkerTemplate:
size: {{ size | default(2) }}
restartPolicy: None
leaderTemplate:
metadata:
labels:
role: leader
spec:
containers:
- name: vllm-leader
imagePullPolicy: Always
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }}
env:
- name: CONFIG_YAML_PATH
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
- name: WORKSPACE
value: "/vllm-workspace"
- name: FAIL_TAG
value: {{ fail_tag | default("FAIL_TAG") }}
- name: IS_PR_TEST
value: "{{ is_pr_test | default("false") }}"
- name: VLLM_VERSION
value: {{ vllm_version | default("latest") }}
- name: VLLM_ASCEND_REF
value: {{ vllm_ascend_ref | default("main") }}
- name: VLLM_ASCEND_REMOTE_URL
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
command:
- sh
- -c
- |
bash /root/.cache/tests/run.sh
resources:
limits:
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
memory: 512Gi
ephemeral-storage: 100Gi
requests:
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
ephemeral-storage: 100Gi
cpu: 125
ports:
- containerPort: 8080
# readinessProbe:
# tcpSocket:
# port: 8080
# initialDelaySeconds: 15
# periodSeconds: 10
volumeMounts:
- mountPath: /root/.cache
name: shared-volume
- mountPath: /usr/local/Ascend/driver/tools
name: driver-tools
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
- name: shared-volume
persistentVolumeClaim:
claimName: vllm-project-hk001
- name: driver-tools
hostPath:
path: /usr/local/Ascend/driver/tools
workerTemplate:
spec:
containers:
- name: vllm-worker
imagePullPolicy: Always
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }}
env:
- name: CONFIG_YAML_PATH
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
- name: WORKSPACE
value: "/vllm-workspace"
- name: FAIL_TAG
value: {{ fail_tag | default("FAIL_TAG") }}
- name: IS_PR_TEST
value: "{{ is_pr_test | default("false") }}"
- name: VLLM_VERSION
value: {{ vllm_version | default("latest") }}
- name: VLLM_ASCEND_REF
value: {{ vllm_ascend_ref | default("main") }}
- name: VLLM_ASCEND_REMOTE_URL
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
command:
- sh
- -c
- |
bash /root/.cache/tests/run.sh
resources:
limits:
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
memory: 512Gi
ephemeral-storage: 100Gi
requests:
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
ephemeral-storage: 100Gi
cpu: 125
volumeMounts:
- mountPath: /root/.cache
name: shared-volume
- mountPath: /usr/local/Ascend/driver/tools
name: driver-tools
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
- name: shared-volume
persistentVolumeClaim:
claimName: vllm-project-hk001
- name: driver-tools
hostPath:
path: /usr/local/Ascend/driver/tools
---
apiVersion: v1
kind: Service
metadata:
name: vllm-leader
namespace: vllm-project
spec:
ports:
- name: http
port: 8080
protocol: TCP
targetPort: 8080
selector:
leaderworkerset.sigs.k8s.io/name: vllm
role: leader
type: ClusterIP