[CI] Add more multi-gpu tests (#1280)

This commit is contained in:
Lianmin Zheng
2024-09-01 00:27:25 -07:00
committed by GitHub
parent d134c139a1
commit 1b5d56f7f8
11 changed files with 271 additions and 128 deletions

View File

@@ -18,7 +18,7 @@ concurrency:
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
accuracy-test: one-gpu:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: 1-gpu-runner runs-on: 1-gpu-runner
@@ -41,3 +41,34 @@ jobs:
run: | run: |
cd test/srt cd test/srt
python3 test_eval_accuracy_large.py python3 test_eval_accuracy_large.py
# MoE accuracy evaluation on the two-GPU runner.
two-gpu:
  # Run in the upstream repository, or for any pull_request event.
  if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
  runs-on: 2-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v3

    # Installs the package in editable mode, then flashinfer from its own
    # wheel index, then the human-eval fork from a fresh clone.
    - name: Install dependencies
      run: |
        pip install --upgrade pip
        pip install -e "python[all]"
        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
        git clone https://github.com/merrymercy/human-eval.git
        cd human-eval
        pip install -e .

    # The step is aborted if it runs longer than 20 minutes.
    - name: Evaluate Accuracy
      timeout-minutes: 20
      run: |
        cd test/srt
        python3 test_moe_eval_accuracy_large.py
# Fan-in job: it is scheduled only after the jobs listed under `needs`.
finish:
  needs:
    - one-gpu
    - two-gpu
  runs-on: ubuntu-latest
  steps:
    - name: Finish
      run: echo "This is an empty step to ensure that all jobs are completed."

View File

@@ -1,27 +0,0 @@
name: Weekly Cache Purge

on:
  schedule:
    # Every Sunday at 00:00
    - cron: '0 0 * * 0'
  # Also allow the workflow to be started manually.
  workflow_dispatch:

jobs:
  purge-cache:
    # Only run in the upstream repository.
    if: github.repository == 'sgl-project/sglang'
    runs-on: self-hosted
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      # Activates the venv under $HOME, appends its bin directory to
      # GITHUB_PATH for the following steps, then purges the pip cache.
      - name: Purge pip cache
        run: |
          source $HOME/venv/bin/activate
          echo "$HOME/venv/bin" >> $GITHUB_PATH
          pip cache purge

      - name: Update dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

View File

@@ -18,7 +18,7 @@ concurrency:
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
e2e-test: one-gpu:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: 1-gpu-runner runs-on: 1-gpu-runner
@@ -41,7 +41,8 @@ jobs:
- name: Benchmark Serving Latency - name: Benchmark Serving Latency
timeout-minutes: 10 timeout-minutes: 10
run: | run: |
python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8 cd test/srt
python3 -m unittest test_serving_latency.TestServingLatency.test_default
- name: Benchmark Serving Throughput (w/o RadixAttention) - name: Benchmark Serving Throughput (w/o RadixAttention)
timeout-minutes: 10 timeout-minutes: 10
@@ -54,3 +55,42 @@ jobs:
run: | run: |
cd test/srt cd test/srt
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
# Serving benchmarks on the two-GPU runner (the steps are labelled TP=2).
two-gpu:
  # Run in the upstream repository, or for any pull_request event.
  if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
  runs-on: 2-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v3

    - name: Install dependencies
      run: |
        pip install --upgrade pip
        pip install -e "python[all]"
        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

    # Each benchmark below is a single unittest target run from test/srt and
    # is aborted if it exceeds 10 minutes.
    - name: Benchmark Serving Throughput (TP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default

    - name: Benchmark Serving Latency (TP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default

    - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
# Fan-in job: it is scheduled only after the jobs listed under `needs`.
finish:
  needs:
    - one-gpu
    - two-gpu
  runs-on: ubuntu-latest
  steps:
    - name: Finish
      run: echo "This is an empty step to ensure that all jobs are completed."

View File

@@ -1,45 +0,0 @@
name: MoE Test

# Triggered by pushes and pull requests targeting main that touch the listed
# paths, and by manual dispatch.
on:
  push:
    branches:
      - main
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches:
      - main
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:

# One concurrency group per ref; a newer run cancels the one in progress.
concurrency:
  group: moe-test-${{ github.ref }}
  cancel-in-progress: true

jobs:
  moe-test:
    # Run in the upstream repository, or for any pull_request event.
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 2-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

      - name: Benchmark MoE Serving Throughput
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default

      - name: Benchmark MoE Serving Throughput (w/o RadixAttention)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache

View File

@@ -18,7 +18,7 @@ concurrency:
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
unit-test-jobs: run-test:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: 1-gpu-runner runs-on: 1-gpu-runner
strategy: strategy:
@@ -48,9 +48,9 @@ jobs:
python3 run_suite.py --suite minimal --range-begin 8 python3 run_suite.py --suite minimal --range-begin 8
fi fi
unit-test: finish:
needs: unit-test-jobs needs: [run-test]
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Merge step - name: Finish
run: echo "This is an empty merge step" run: echo "This is an empty step to ensure that all jobs are completed."

View File

@@ -11,26 +11,34 @@ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct
## plot the results in series of lines: ## plot the results in series of lines:
python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results" python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
# Usage (correctness test): # Usage (correctness test):
python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
## Reference output (of the correctness test above, can be gpu dependent): ## Reference output (of the correctness test above, can be gpu dependent):
prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633], input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
[ -9.1875, -10.2500, 2.7109, ..., -4.3359, -4.0664, -4.1328]], prefill logits (first half): tensor([[-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
device='cuda:0', dtype=torch.float16) [-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
prefill logits (final) tensor([[-8.3203, -7.1211, 3.3379, ..., -4.9570, -4.1328, -3.4141], [ -9.1875, -10.2500, 2.7129, ..., -4.3359, -4.0664, -4.1328]],
[-8.9062, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0742], device='cuda:0')
[-9.6328, -9.0547, 4.0117, ..., -5.3047, -4.7148, -4.4609]],
device='cuda:0', dtype=torch.float16) prefill logits (final): tensor([[-8.3125, -7.1172, 3.3457, ..., -4.9570, -4.1328, -3.4141],
<s> The capital of France is. [-8.9141, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0781],
[-9.6328, -9.0547, 4.0195, ..., -5.3047, -4.7148, -4.4570]],
device='cuda:0')
========== Prompt 0 ==========
<s> The capital of France is Paris.
The capital of the United States is Washington, D.C. The capital of the United States is Washington, D.C.
<s> The capital of the United Kindom is.
========== Prompt 1 ==========
<s> The capital of the United Kindom is London.
The capital of the United Kingdom is London. The capital of the United Kingdom is London.
The capital of the The capital of the
<s> Today is a sunny day and I like go for a walk in the park.
========== Prompt 2 ==========
<s> Today is a sunny day and I like to go for a walk in the park.
I'm going to the park I'm going to the park
""" """
@@ -225,12 +233,12 @@ def correctness_test(
# Prepare inputs # Prepare inputs
input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer) input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
rank_print(f"{input_ids=}") rank_print(f"\n{input_ids=}\n")
if bench_args.cut_len > 0: if bench_args.cut_len > 0:
# Prefill # Prefill
next_token_ids, next_token_logits, batch = extend(reqs, model_runner) next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
rank_print("prefill logits (first half)", next_token_logits) rank_print(f"prefill logits (first half): {next_token_logits} \n")
# Prepare extend inputs # Prepare extend inputs
reqs = prepare_extend_inputs_for_correctness_test( reqs = prepare_extend_inputs_for_correctness_test(
@@ -239,7 +247,7 @@ def correctness_test(
# Extend # Extend
next_token_ids, next_token_logits, batch = extend(reqs, model_runner) next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
rank_print("prefill logits (final)", next_token_logits) rank_print(f"prefill logits (final): {next_token_logits} \n")
# Decode # Decode
output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))] output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
@@ -250,7 +258,8 @@ def correctness_test(
# Print # Print
for i in range(len(reqs)): for i in range(len(reqs)):
rank_print(tokenizer.decode(output_ids[i])) rank_print(f"========== Prompt {i} ==========")
rank_print(tokenizer.decode(output_ids[i]), "\n")
@torch.inference_mode() @torch.inference_mode()

View File

@@ -0,0 +1,73 @@
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_child_process
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
popen_launch_server,
)
class TestEvalAccuracyLarge(unittest.TestCase):
    """Accuracy checks against a server for the default MoE test model.

    A single server is launched with ``--tp 2`` for the whole class and shared
    by every test. Each test runs one benchmark through ``run_eval`` and
    requires the reported ``score`` to reach a minimum threshold.
    """

    @classmethod
    def setUpClass(cls):
        """Launch the shared server once before any test in the class runs."""
        cls.model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--log-level-http",
                "warning",
                "--tp",
                "2",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        """Terminate the server process started in ``setUpClass``."""
        kill_child_process(cls.process.pid)

    def _assert_min_score(self, eval_name, num_examples, min_score):
        """Run one benchmark and require ``metrics["score"] >= min_score``.

        Args:
            eval_name: Benchmark name forwarded to ``run_eval``.
            num_examples: Example count forwarded to ``run_eval`` (may be None).
            min_score: Lowest acceptable value of ``metrics["score"]``.
        """
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name=eval_name,
            num_examples=num_examples,
            num_threads=1024,
        )
        metrics = run_eval(args)
        # A unittest assertion (unlike a bare ``assert``) is not removed when
        # Python runs with -O, so the threshold is always enforced.
        self.assertGreaterEqual(metrics["score"], min_score, f"{metrics}")

    def test_mmlu(self):
        self._assert_min_score("mmlu", num_examples=3000, min_score=0.63)

    def test_human_eval(self):
        self._assert_min_score("humaneval", num_examples=None, min_score=0.43)

    def test_mgsm_en(self):
        self._assert_min_score("mgsm_en", num_examples=None, min_score=0.64)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,45 @@
import os
import subprocess
import unittest
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST
class TestServingLatency(unittest.TestCase):
    """Runs ``sglang.bench_latency`` on the default MoE test model with ``--tp 2``."""

    def test_default(self):
        """Run the benchmark once and, in CI, check the parsed metric.

        The metric is the second-to-last space-separated token on the third
        line from the end of stdout. It is presumably a throughput figure;
        confirm against the output format of ``sglang.bench_latency``.
        """
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
            "--tp",
            "2",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        try:
            stdout, stderr = process.communicate()
            output = stdout.decode()
            error = stderr.decode()
            print(f"Output: {output}")
            print(f"Error: {error}")

            try:
                lastline = output.split("\n")[-3]
                value = float(lastline.split(" ")[-2])
            except (IndexError, ValueError):
                # Report a readable failure instead of an opaque parsing
                # traceback; stdout and stderr were already printed above.
                self.fail(
                    "Could not parse the benchmark result from stdout "
                    f"(exit code {process.returncode})"
                )

            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
                self.assertGreater(value, 125)
        finally:
            # Clean up even when parsing or the threshold check fails.
            kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()

View File

@@ -23,7 +23,6 @@ class TestServingThroughput(unittest.TestCase):
other_args.append("--disable-flashinfer") other_args.append("--disable-flashinfer")
other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)]) other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
other_args.extend(["--tensor-parallel-size", "2"]) other_args.extend(["--tensor-parallel-size", "2"])
other_args.append("--enable-p2p-check")
model = DEFAULT_MOE_MODEL_NAME_FOR_TEST model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
base_url = DEFAULT_URL_FOR_TEST base_url = DEFAULT_URL_FOR_TEST
@@ -35,7 +34,7 @@ class TestServingThroughput(unittest.TestCase):
) )
# Run benchmark # Run benchmark
num_prompts = 200 num_prompts = 300
args = SimpleNamespace( args = SimpleNamespace(
backend="sglang", backend="sglang",
base_url=base_url, base_url=base_url,
@@ -76,8 +75,7 @@ class TestServingThroughput(unittest.TestCase):
) )
if os.getenv("SGLANG_IS_IN_CI", "false") == "true": if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE): 950, H100 (SMX): 1800 assert res["output_throughput"] > 1850
assert res["output_throughput"] > 1750
def test_default_without_radix_cache(self): def test_default_without_radix_cache(self):
res = self.run_test( res = self.run_test(
@@ -87,18 +85,7 @@ class TestServingThroughput(unittest.TestCase):
) )
if os.getenv("SGLANG_IS_IN_CI", "false") == "true": if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE): 950, H100 (SMX): 1900 assert res["output_throughput"] > 1950
assert res["output_throughput"] > 1850
def test_all_cases(self):
for disable_radix_cache in [False, True]:
for disable_flashinfer in [False, True]:
for chunked_prefill_size in [-1, 2048]:
self.run_test(
disable_radix_cache=False,
disable_flashinfer=False,
chunked_prefill_size=-1,
)
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -0,0 +1,43 @@
import os
import subprocess
import unittest
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
class TestServingLatency(unittest.TestCase):
    """Runs ``sglang.bench_latency`` on the default test model."""

    def test_default(self):
        """Run the benchmark once and, in CI, check the parsed metric.

        The metric is the second-to-last space-separated token on the third
        line from the end of stdout. It is presumably a throughput figure;
        confirm against the output format of ``sglang.bench_latency``.
        """
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        try:
            stdout, stderr = process.communicate()
            output = stdout.decode()
            error = stderr.decode()
            print(f"Output: {output}")
            print(f"Error: {error}")

            try:
                lastline = output.split("\n")[-3]
                value = float(lastline.split(" ")[-2])
            except (IndexError, ValueError):
                # Report a readable failure instead of an opaque parsing
                # traceback; stdout and stderr were already printed above.
                self.fail(
                    "Could not parse the benchmark result from stdout "
                    f"(exit code {process.returncode})"
                )

            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
                self.assertGreater(value, 130)
        finally:
            # Clean up even when parsing or the threshold check fails.
            kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()

View File

@@ -33,7 +33,7 @@ class TestServingThroughput(unittest.TestCase):
) )
# Run benchmark # Run benchmark
num_prompts = 400 num_prompts = 500
args = SimpleNamespace( args = SimpleNamespace(
backend="sglang", backend="sglang",
base_url=base_url, base_url=base_url,
@@ -74,8 +74,7 @@ class TestServingThroughput(unittest.TestCase):
) )
if os.getenv("SGLANG_IS_IN_CI", "false") == "true": if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE): 1450, H100 (SMX): 2550 assert res["output_throughput"] > 2400
assert res["output_throughput"] > 2500
def test_default_without_radix_cache(self): def test_default_without_radix_cache(self):
res = self.run_test( res = self.run_test(
@@ -85,7 +84,6 @@ class TestServingThroughput(unittest.TestCase):
) )
if os.getenv("SGLANG_IS_IN_CI", "false") == "true": if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE): 1500, H100 (SMX): 2850
assert res["output_throughput"] > 2800 assert res["output_throughput"] > 2800
def test_default_without_chunked_prefill(self): def test_default_without_chunked_prefill(self):
@@ -96,18 +94,7 @@ class TestServingThroughput(unittest.TestCase):
) )
if os.getenv("SGLANG_IS_IN_CI", "false") == "true": if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE): 1450, H100 (SMX): 2550 assert res["output_throughput"] > 2400
assert res["output_throughput"] > 2500
def test_all_cases(self):
for disable_radix_cache in [False, True]:
for disable_flashinfer in [False, True]:
for chunked_prefill_size in [-1, 2048]:
self.run_test(
disable_radix_cache=False,
disable_flashinfer=False,
chunked_prefill_size=-1,
)
if __name__ == "__main__": if __name__ == "__main__":