From 1b5d56f7f885cdc4284579ee863f9944f4c12bce Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 1 Sep 2024 00:27:25 -0700 Subject: [PATCH] [CI] Add more multi-gpu tests (#1280) --- .github/workflows/accuracy-test.yml | 33 ++++++++++- .github/workflows/cache-purge.yml | 27 --------- .github/workflows/e2e-test.yml | 44 +++++++++++++- .github/workflows/moe-test.yml | 45 --------------- .github/workflows/unit-test.yml | 10 ++-- python/sglang/bench_latency.py | 41 +++++++------ test/srt/test_moe_eval_accuracy_large.py | 73 ++++++++++++++++++++++++ test/srt/test_moe_serving_latency.py | 45 +++++++++++++++ test/srt/test_moe_serving_throughput.py | 19 +----- test/srt/test_serving_latency.py | 43 ++++++++++++++ test/srt/test_serving_throughput.py | 19 +----- 11 files changed, 271 insertions(+), 128 deletions(-) delete mode 100644 .github/workflows/cache-purge.yml delete mode 100644 .github/workflows/moe-test.yml create mode 100644 test/srt/test_moe_eval_accuracy_large.py create mode 100644 test/srt/test_moe_serving_latency.py create mode 100644 test/srt/test_serving_latency.py diff --git a/.github/workflows/accuracy-test.yml b/.github/workflows/accuracy-test.yml index 6fb102a4c..b7118e217 100644 --- a/.github/workflows/accuracy-test.yml +++ b/.github/workflows/accuracy-test.yml @@ -18,7 +18,7 @@ concurrency: cancel-in-progress: true jobs: - accuracy-test: + one-gpu: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' runs-on: 1-gpu-runner @@ -41,3 +41,34 @@ jobs: run: | cd test/srt python3 test_eval_accuracy_large.py + + two-gpu: + if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' + runs-on: 2-gpu-runner + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -e "python[all]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + + git clone https://github.com/merrymercy/human-eval.git + cd human-eval + pip install -e . + + - name: Evaluate Accuracy + timeout-minutes: 20 + run: | + cd test/srt + python3 test_moe_eval_accuracy_large.py + + finish: + needs: [one-gpu, two-gpu] + runs-on: ubuntu-latest + steps: + - name: Finish + run: echo "This is an empty step to ensure that all jobs are completed." diff --git a/.github/workflows/cache-purge.yml b/.github/workflows/cache-purge.yml deleted file mode 100644 index c699f4988..000000000 --- a/.github/workflows/cache-purge.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: Weekly Cache Purge - -on: - schedule: - - cron: '0 0 * * 0' # Every Sunday at 00:00 - workflow_dispatch: - -jobs: - purge-cache: - if: github.repository == 'sgl-project/sglang' - runs-on: self-hosted - - steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Purge pip cache - run: | - source $HOME/venv/bin/activate - echo "$HOME/venv/bin" >> $GITHUB_PATH - pip cache purge - - - name: Update dependencies - run: | - pip install --upgrade pip - pip install -e "python[all]" - pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 11c94775c..c5594ac4a 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -18,7 +18,7 @@ concurrency: cancel-in-progress: true jobs: - e2e-test: + one-gpu: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' runs-on: 1-gpu-runner @@ -41,7 +41,8 @@ jobs: - name: Benchmark Serving Latency timeout-minutes: 10 run: | - python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8 + cd test/srt + python3 -m unittest test_serving_latency.TestServingLatency.test_default - name: Benchmark Serving Throughput (w/o RadixAttention) timeout-minutes: 10 @@ -54,3 +55,42 @@ jobs: run: | cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill + + two-gpu: + if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' + runs-on: 2-gpu-runner + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -e "python[all]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + + - name: Benchmark Serving Throughput (TP=2) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default + + - name: Benchmark Serving Latency (TP=2) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default + + - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache + + finish: + needs: [one-gpu, two-gpu] + runs-on: ubuntu-latest + steps: + - name: Finish + run: echo "This is an empty step to ensure that all jobs are completed." diff --git a/.github/workflows/moe-test.yml b/.github/workflows/moe-test.yml deleted file mode 100644 index 4440aa215..000000000 --- a/.github/workflows/moe-test.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: MoE Test - -on: - push: - branches: [ main ] - paths: - - "python/sglang/**" - - "test/**" - pull_request: - branches: [ main ] - paths: - - "python/sglang/**" - - "test/**" - workflow_dispatch: - -concurrency: - group: moe-test-${{ github.ref }} - cancel-in-progress: true - -jobs: - moe-test: - if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: 2-gpu-runner - - steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Install dependencies - run: | - pip install --upgrade pip - pip install -e "python[all]" - pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall - - - name: Benchmark MoE Serving Throughput - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default - - - name: Benchmark MoE Serving Throughput (w/o RadixAttention) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 41a565a63..5d774b67e 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -18,7 +18,7 @@ concurrency: cancel-in-progress: true jobs: - unit-test-jobs: + run-test: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' runs-on: 1-gpu-runner strategy: @@ -48,9 +48,9 @@ jobs: python3 run_suite.py --suite minimal --range-begin 8 fi - unit-test: - needs: unit-test-jobs + finish: + needs: [run-test] runs-on: ubuntu-latest steps: - - name: Merge step - run: echo "This is an empty merge step" \ No newline at end of file + - name: Finish + run: echo "This is an empty step to ensure that all jobs are completed." diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py index 966a97d20..9006b7150 100644 --- a/python/sglang/bench_latency.py +++ b/python/sglang/bench_latency.py @@ -11,26 +11,34 @@ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct ## plot the results in series of lines: python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results" - # Usage (correctness test): python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct ## Reference output (of the correctness test above, can be gpu dependent): -prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633], - [-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633], - [ -9.1875, -10.2500, 2.7109, ..., -4.3359, -4.0664, -4.1328]], - device='cuda:0', dtype=torch.float16) -prefill logits (final) tensor([[-8.3203, -7.1211, 3.3379, ..., -4.9570, -4.1328, -3.4141], - [-8.9062, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0742], - [-9.6328, -9.0547, 4.0117, ..., -5.3047, -4.7148, -4.4609]], - device='cuda:0', dtype=torch.float16) - The capital of France is. +input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]] + +prefill logits (first half): tensor([[-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633], + [-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633], + [ -9.1875, -10.2500, 2.7129, ..., -4.3359, -4.0664, -4.1328]], + device='cuda:0') + +prefill logits (final): tensor([[-8.3125, -7.1172, 3.3457, ..., -4.9570, -4.1328, -3.4141], + [-8.9141, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0781], + [-9.6328, -9.0547, 4.0195, ..., -5.3047, -4.7148, -4.4570]], + device='cuda:0') + +========== Prompt 0 ========== + The capital of France is Paris. The capital of the United States is Washington, D.C. - The capital of the United Kindom is. + +========== Prompt 1 ========== + The capital of the United Kindom is London. The capital of the United Kingdom is London. The capital of the - Today is a sunny day and I like go for a walk in the park. + +========== Prompt 2 ========== + Today is a sunny day and I like to go for a walk in the park. I'm going to the park """ @@ -225,12 +233,12 @@ def correctness_test( # Prepare inputs input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer) - rank_print(f"{input_ids=}") + rank_print(f"\n{input_ids=}\n") if bench_args.cut_len > 0: # Prefill next_token_ids, next_token_logits, batch = extend(reqs, model_runner) - rank_print("prefill logits (first half)", next_token_logits) + rank_print(f"prefill logits (first half): {next_token_logits} \n") # Prepare extend inputs reqs = prepare_extend_inputs_for_correctness_test( @@ -239,7 +247,7 @@ def correctness_test( # Extend next_token_ids, next_token_logits, batch = extend(reqs, model_runner) - rank_print("prefill logits (final)", next_token_logits) + rank_print(f"prefill logits (final): {next_token_logits} \n") # Decode output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))] @@ -250,7 +258,8 @@ def correctness_test( # Print for i in range(len(reqs)): - rank_print(tokenizer.decode(output_ids[i])) + rank_print(f"========== Prompt {i} ==========") + rank_print(tokenizer.decode(output_ids[i]), "\n") @torch.inference_mode() diff --git a/test/srt/test_moe_eval_accuracy_large.py b/test/srt/test_moe_eval_accuracy_large.py new file mode 100644 index 000000000..d13f427d8 --- /dev/null +++ b/test/srt/test_moe_eval_accuracy_large.py @@ -0,0 +1,73 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_child_process +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MOE_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestEvalAccuracyLarge(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MOE_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--log-level-http", + "warning", + "--tp", + "2", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_child_process(cls.process.pid) + + def test_mmlu(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mmlu", + num_examples=3000, + num_threads=1024, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.63, f"{metrics}" + + def test_human_eval(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="humaneval", + num_examples=None, + num_threads=1024, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.43, f"{metrics}" + + def test_mgsm_en(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=None, + num_threads=1024, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.64, f"{metrics}" + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_moe_serving_latency.py b/test/srt/test_moe_serving_latency.py new file mode 100644 index 000000000..9d5215323 --- /dev/null +++ b/test/srt/test_moe_serving_latency.py @@ -0,0 +1,45 @@ +import os +import subprocess +import unittest + +from sglang.srt.utils import kill_child_process +from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST + + +class TestServingLatency(unittest.TestCase): + def test_default(self): + command = [ + "python3", + "-m", + "sglang.bench_latency", + "--model", + DEFAULT_MOE_MODEL_NAME_FOR_TEST, + "--batch-size", + "1", + "--input", + "128", + "--output", + "8", + "--tp", + "2", + ] + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout, stderr = process.communicate() + output = stdout.decode() + error = stderr.decode() + print(f"Output: {output}") + print(f"Error: {error}") + + lastline = output.split("\n")[-3] + value = float(lastline.split(" ")[-2]) + + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + assert value > 125 + + kill_child_process(process.pid) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py index 4f6e8db82..6f040da34 100644 --- a/test/srt/test_moe_serving_throughput.py +++ b/test/srt/test_moe_serving_throughput.py @@ -23,7 +23,6 @@ class TestServingThroughput(unittest.TestCase): other_args.append("--disable-flashinfer") other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)]) other_args.extend(["--tensor-parallel-size", "2"]) - other_args.append("--enable-p2p-check") model = DEFAULT_MOE_MODEL_NAME_FOR_TEST base_url = DEFAULT_URL_FOR_TEST @@ -35,7 +34,7 @@ class TestServingThroughput(unittest.TestCase): ) # Run benchmark - num_prompts = 200 + num_prompts = 300 args = SimpleNamespace( backend="sglang", base_url=base_url, @@ -76,8 +75,7 @@ class TestServingThroughput(unittest.TestCase): ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 (PCIE): 950, H100 (SMX): 1800 - assert res["output_throughput"] > 1750 + assert res["output_throughput"] > 1850 def test_default_without_radix_cache(self): res = self.run_test( @@ -87,18 +85,7 @@ class TestServingThroughput(unittest.TestCase): ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 (PCIE): 950, H100 (SMX): 1900 - assert res["output_throughput"] > 1850 - - def test_all_cases(self): - for disable_radix_cache in [False, True]: - for disable_flashinfer in [False, True]: - for chunked_prefill_size in [-1, 2048]: - self.run_test( - disable_radix_cache=False, - disable_flashinfer=False, - chunked_prefill_size=-1, - ) + assert res["output_throughput"] > 1950 if __name__ == "__main__": diff --git a/test/srt/test_serving_latency.py b/test/srt/test_serving_latency.py new file mode 100644 index 000000000..e762892c8 --- /dev/null +++ b/test/srt/test_serving_latency.py @@ -0,0 +1,43 @@ +import os +import subprocess +import unittest + +from sglang.srt.utils import kill_child_process +from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST + + +class TestServingLatency(unittest.TestCase): + def test_default(self): + command = [ + "python3", + "-m", + "sglang.bench_latency", + "--model", + DEFAULT_MODEL_NAME_FOR_TEST, + "--batch-size", + "1", + "--input", + "128", + "--output", + "8", + ] + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout, stderr = process.communicate() + output = stdout.decode() + error = stderr.decode() + print(f"Output: {output}") + print(f"Error: {error}") + + lastline = output.split("\n")[-3] + value = float(lastline.split(" ")[-2]) + + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + assert value > 130 + + kill_child_process(process.pid) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py index f1089a6a7..d4ed12612 100644 --- a/test/srt/test_serving_throughput.py +++ b/test/srt/test_serving_throughput.py @@ -33,7 +33,7 @@ class TestServingThroughput(unittest.TestCase): ) # Run benchmark - num_prompts = 400 + num_prompts = 500 args = SimpleNamespace( backend="sglang", base_url=base_url, @@ -74,8 +74,7 @@ class TestServingThroughput(unittest.TestCase): ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 (PCIE): 1450, H100 (SMX): 2550 - assert res["output_throughput"] > 2500 + assert res["output_throughput"] > 2400 def test_default_without_radix_cache(self): res = self.run_test( @@ -85,7 +84,6 @@ class TestServingThroughput(unittest.TestCase): ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 (PCIE): 1500, H100 (SMX): 2850 assert res["output_throughput"] > 2800 def test_default_without_chunked_prefill(self): @@ -96,18 +94,7 @@ class TestServingThroughput(unittest.TestCase): ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 (PCIE): 1450, H100 (SMX): 2550 - assert res["output_throughput"] > 2500 - - def test_all_cases(self): - for disable_radix_cache in [False, True]: - for disable_flashinfer in [False, True]: - for chunked_prefill_size in [-1, 2048]: - self.run_test( - disable_radix_cache=False, - disable_flashinfer=False, - chunked_prefill_size=-1, - ) + assert res["output_throughput"] > 2400 if __name__ == "__main__":