add qwen3

2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions
--- a/vllm-v0.6.2/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/vllm-v0.6.2/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -0,0 +1,196 @@
+common_pod_spec: &common_pod_spec  # pod-level settings merged into every podSpec below via `<<`
+  priorityClassName: perf-benchmark
+  nodeSelector:
+    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB  # only nodes carrying this GPU product label
+  volumes:
+    - name: devshm
+      emptyDir:
+        medium: Memory  # memory-backed volume; containers below mount it at /dev/shm
+    - name: hf-cache
+      hostPath:
+        path: /root/.cache/huggingface  # node-local path; common_container_settings mounts it at the same path
+        type: Directory  # the path must already exist on the node as a directory
+
+common_container_settings: &common_container_settings  # container defaults merged into the benchmark steps
+  command:
+    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+  resources:
+    limits:
+      nvidia.com/gpu: 8  # GPU limit applied to each container that merges these settings
+  volumeMounts:
+    - name: devshm  # volume declared in common_pod_spec (memory-backed emptyDir)
+      mountPath: /dev/shm
+    - name: hf-cache  # volume declared in common_pod_spec (hostPath)
+      mountPath: /root/.cache/huggingface
+  env:  # NOTE: a step that declares its own `env` replaces this whole list (merge keys are shallow)
+    - name: VLLM_USAGE_SOURCE
+      value: ci-test
+    - name: HF_HOME  # same path as the hf-cache mount above
+      value: /root/.cache/huggingface
+    - name: VLLM_SOURCE_CODE_LOC
+      value: /workspace/build/buildkite/vllm/performance-benchmark
+    - name: HF_TOKEN  # taken from a Kubernetes secret instead of being stored in this file
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: token
+
+steps:  # pipeline: manual block step, benchmark jobs, wait, result collection, final block step
+  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
+
+
+
+  - label: "A100 vllm step 10"  # NOTE(review): siblings are named "<engine> benchmark" — confirm this label is intended
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec  # scheduling constraints and volumes
+            containers:
+              - image: vllm/vllm-openai:v0.6.2  # engine under test: vLLM
+                <<: *common_container_settings  # default command, GPU limit, mounts and env
+
+
+
+  - label: "A100 sglang benchmark"  # same job shape as the vLLM step; only the image differs
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec  # scheduling constraints and volumes
+            containers:
+              - image: lmsysorg/sglang:v0.3.2-cu121  # engine under test: SGLang
+                <<: *common_container_settings  # default command, GPU limit, mounts and env
+
+  - label: "A100 lmdeploy benchmark"  # same job shape as the vLLM step; only the image differs
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec  # scheduling constraints and volumes
+            containers:
+              - image: openmmlab/lmdeploy:v0.6.1-cu12  # engine under test: LMDeploy
+                <<: *common_container_settings  # default command, GPU limit, mounts and env
+
+
+
+
+  - label: "A100 trt llama-8B"  # TensorRT-LLM image, restricted through TEST_SELECTOR below
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec  # scheduling constraints and volumes
+            containers:
+              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+                <<: *common_container_settings  # command, GPU limit and mounts come from here
+                env:  # replaces the merged env (shallow merge): shared entries are repeated, TEST_SELECTOR is added
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                  - name: TEST_SELECTOR  # presumably read by the benchmark script to pick test cases — confirm
+                    value: "llama8B"
+
+
+  - label: "A100 trt llama-70B"  # same as the llama-8B step except for the TEST_SELECTOR value
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec  # scheduling constraints and volumes
+            containers:
+              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+                <<: *common_container_settings  # command, GPU limit and mounts come from here
+                env:  # replaces the merged env (shallow merge): shared entries are repeated, TEST_SELECTOR is added
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                  - name: TEST_SELECTOR  # presumably read by the benchmark script to pick test cases — confirm
+                    value: "llama70B"
+
+
+  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
+  # - label: "A100 trt benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           <<: *common_pod_spec
+  #           containers:
+  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
+  #               <<: *common_container_settings
+
+
+  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
+  # - label: "A100 tgi benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #     - kubernetes:
+  #         podSpec:
+  #           <<: *common_pod_spec
+  #           containers:
+  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
+  #               <<: *common_container_settings
+
+  - wait  # barrier: the steps above must complete before the results are collected
+
+  - label: "Collect the results"  # post-benchmark step: runs nightly-annotate.sh after the wait barrier
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec  # scheduling constraints and volumes
+            containers:
+              - image: vllm/vllm-openai:v0.5.0.post1  # NOTE(review): older tag than the v0.6.2 benchmark image — confirm
+                command:
+                  - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+                resources:
+                  limits:
+                    nvidia.com/gpu: 8  # NOTE(review): confirm result collection really needs 8 GPUs
+                volumeMounts:
+                  - name: devshm  # only /dev/shm is mounted here; hf-cache is not
+                    mountPath: /dev/shm
+                env:  # standalone list: this container does not merge common_container_settings
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: VLLM_SOURCE_CODE_LOC
+                    value: /workspace/build/buildkite/vllm/performance-benchmark
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+
+  - block: ":rocket: check the results!"  # final manual checkpoint after result collection