Fix OOM issues with FP8 for Llama (#1454)

Lianmin Zheng
2024-09-18 03:45:19 -07:00
committed by GitHub
parent aa2750beb3
commit 1acccb364a
8 changed files with 33 additions and 21 deletions

@@ -144,18 +144,18 @@ jobs:
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
      - name: Benchmark Offline Throughput (w/o ChunkedPrefill)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_chunked_prefill
      - name: Benchmark Offline Throughput (w/ Triton)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
      - name: Benchmark Offline Throughput (w/ FP8)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
  performance-test-2-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 2-gpu-runner
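
The FP8 step above runs the offline throughput benchmark against an FP8-quantized Llama model, which is the configuration this commit's OOM fix targets. A minimal local sketch of that scenario is shown below; the model path, memory fraction, and prompt count are illustrative assumptions, not values taken from this commit:

    # Launch the server with FP8 quantization.
    # --mem-fraction-static bounds the weight + KV-cache memory pool and is the
    # usual knob to lower when an FP8 run hits out-of-memory errors.
    # (model path and 0.7 are placeholder values for illustration)
    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --quantization fp8 --mem-fraction-static 0.7

    # In a second shell, drive the throughput benchmark against the running server.
    python3 -m sglang.bench_serving --backend sglang --num-prompts 200

The CI step itself wraps the same kind of run in a unittest case (test_offline_throughput_default_fp8), so either path exercises the FP8 memory behavior this commit addresses.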