### What this PR does / why we need it?
This PR refactors the nightly CI workflows (A2 and A3) to support
running tests against a specific PR's code, in addition to the existing
scheduled/dispatch runs using pre-built images.
#### Motivation:
Previously, nightly tests could only be triggered by schedule or
workflow_dispatch, always using the pre-built nightly image. This change
allows developers to trigger nightly tests against their own PR's source
code, enabling early validation without waiting for a nightly build.
#### Changes
Trigger logic (parse-trigger job)
A new parse-trigger job is introduced in both
schedule_nightly_test_a2.yaml and schedule_nightly_test_a3.yaml to
centralize trigger evaluation:
`schedule / workflow_dispatch`: runs all tests with the pre-built image
(existing behavior preserved)
`pull_request (labeled + synchronize)`: runs only when the PR has the
nightly-test label and a /nightly [test-names] comment exists (the latest
one wins):
1. /nightly or /nightly all — runs all tests
2. /nightly test1 test2 — runs only named tests (comma-wrapped for exact
matching)
#### How to trigger
1. Add the nightly-test label to your PR
2. Comment /nightly (all tests) or /nightly test1 test2 (specific tests)
3. Re-triggering: add another /nightly comment and push a new commit
(synchronize event)
### Does this PR introduce _any_ user-facing change?
None
### How was this patch tested?
- vLLM version: v0.14.1
- vLLM main:
dc917cceb8
---------
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
143 lines
4.8 KiB
Django/Jinja
143 lines
4.8 KiB
Django/Jinja
# LeaderWorkerSet manifest template (Jinja2) for the multi-node vLLM test run.
#
# Template variables and their defaults when undefined:
#   replicas               - number of leader/worker groups (1)
#   size                   - pods per group (2)
#   image                  - container image for leader and workers
#                            (the nightly-a2 image below)
#   config_file_path       - value of CONFIG_YAML_PATH ("DeepSeek-V3.yaml")
#   fail_tag               - value of FAIL_TAG ("FAIL_TAG")
#   is_pr_test             - value of IS_PR_TEST ("false")
#   vllm_version           - value of VLLM_VERSION ("latest")
#   vllm_ascend_ref        - value of VLLM_ASCEND_REF ("main")
#   vllm_ascend_remote_url - value of VLLM_ASCEND_REMOTE_URL
#   npu_per_node           - huawei.com/ascend-1980 count per pod ("16")
#
# Every templated env value is wrapped in double quotes: Kubernetes requires
# env values to be strings, and an unquoted rendering such as an all-digit
# commit SHA, a numeric-looking version, or an empty string would otherwise be
# typed by the YAML parser as an int, float, or null.
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: vllm
  namespace: vllm-project
spec:
  # Rendered unquoted on purpose: these fields must be integers.
  replicas: {{ replicas | default(1) }}
  leaderWorkerTemplate:
    size: {{ size | default(2) }}
    # This is the plain string "None", not a YAML null.
    # NOTE(review): presumably disables group recreation on pod restart;
    # confirm against the LeaderWorkerSet API reference.
    restartPolicy: None
    leaderTemplate:
      metadata:
        labels:
          role: leader
      spec:
        containers:
          - name: vllm-leader
            imagePullPolicy: Always
            image: "{{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }}"
            env:
              - name: CONFIG_YAML_PATH
                value: "{{ config_file_path | default("DeepSeek-V3.yaml") }}"
              - name: WORKSPACE
                value: "/vllm-workspace"
              - name: FAIL_TAG
                value: "{{ fail_tag | default("FAIL_TAG") }}"
              - name: IS_PR_TEST
                value: "{{ is_pr_test | default("false") }}"
              - name: VLLM_VERSION
                value: "{{ vllm_version | default("latest") }}"
              - name: VLLM_ASCEND_REF
                value: "{{ vllm_ascend_ref | default("main") }}"
              - name: VLLM_ASCEND_REMOTE_URL
                value: "{{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}"
            # The entry script is read from the shared volume mounted at
            # /root/.cache (see volumeMounts below).
            command:
              - sh
              - -c
              - |
                bash /root/.cache/tests/run.sh
            resources:
              limits:
                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                memory: 512Gi
                ephemeral-storage: 100Gi
              requests:
                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                ephemeral-storage: 100Gi
                cpu: 125
            ports:
              - containerPort: 8080
            # readinessProbe:
            #   tcpSocket:
            #     port: 8080
            #   initialDelaySeconds: 15
            #   periodSeconds: 10
            volumeMounts:
              - mountPath: /root/.cache
                name: shared-volume
              - mountPath: /usr/local/Ascend/driver/tools
                name: driver-tools
              - mountPath: /dev/shm
                name: dshm
        volumes:
          # Memory-backed emptyDir mounted at /dev/shm.
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
          - name: shared-volume
            persistentVolumeClaim:
              claimName: vllm-project-hk001
          - name: driver-tools
            hostPath:
              path: /usr/local/Ascend/driver/tools
    workerTemplate:
      spec:
        containers:
          - name: vllm-worker
            imagePullPolicy: Always
            image: "{{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }}"
            env:
              - name: CONFIG_YAML_PATH
                value: "{{ config_file_path | default("DeepSeek-V3.yaml") }}"
              - name: WORKSPACE
                value: "/vllm-workspace"
              - name: FAIL_TAG
                value: "{{ fail_tag | default("FAIL_TAG") }}"
              - name: IS_PR_TEST
                value: "{{ is_pr_test | default("false") }}"
              - name: VLLM_VERSION
                value: "{{ vllm_version | default("latest") }}"
              - name: VLLM_ASCEND_REF
                value: "{{ vllm_ascend_ref | default("main") }}"
              - name: VLLM_ASCEND_REMOTE_URL
                value: "{{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}"
            # Same entry script as the leader container.
            command:
              - sh
              - -c
              - |
                bash /root/.cache/tests/run.sh
            resources:
              limits:
                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                memory: 512Gi
                ephemeral-storage: 100Gi
              requests:
                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                ephemeral-storage: 100Gi
                cpu: 125
            volumeMounts:
              - mountPath: /root/.cache
                name: shared-volume
              - mountPath: /usr/local/Ascend/driver/tools
                name: driver-tools
              - mountPath: /dev/shm
                name: dshm
        volumes:
          # Memory-backed emptyDir mounted at /dev/shm.
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
          - name: shared-volume
            persistentVolumeClaim:
              claimName: vllm-project-hk001
          - name: driver-tools
            hostPath:
              path: /usr/local/Ascend/driver/tools
---
# ClusterIP Service exposing TCP port 8080 of the pods that carry both
# selector labels below.
apiVersion: v1
kind: Service
metadata:
  namespace: vllm-project
  name: vllm-leader
spec:
  type: ClusterIP
  # A pod must match every label listed here to be selected.
  selector:
    role: leader
    leaderworkerset.sigs.k8s.io/name: vllm
  ports:
    - name: http
      protocol: TCP
      port: 8080
      targetPort: 8080