Enable overlap by default (#2067)

This commit is contained in:
Lianmin Zheng
2024-11-19 22:07:58 -08:00
committed by GitHub
parent 699384cb01
commit 7d671e4ad2
17 changed files with 92 additions and 75 deletions

View File

@@ -17,8 +17,8 @@ suites = {
"test_json_constrained.py",
"test_large_max_new_tokens.py",
"test_metrics.py",
+"test_non_overlap_scheduler.py",
"test_openai_server.py",
-"test_overlap_schedule.py",
"test_pytorch_sampling_backend.py",
"test_radix_attention.py",
"test_retract_decode.py",

View File

@@ -97,8 +97,8 @@ class TestBenchServing(unittest.TestCase):
if is_in_ci():
self.assertLess(res["median_e2e_latency_ms"], 12000)
-self.assertLess(res["median_ttft_ms"], 80)
-self.assertLess(res["median_itl_ms"], 11)
+self.assertLess(res["median_ttft_ms"], 86)
+self.assertLess(res["median_itl_ms"], 10)
def test_moe_offline_throughput_default(self):
res = run_bench_serving(

View File

@@ -78,10 +78,11 @@ class TestJSONConstrained(unittest.TestCase):
self.assertIsInstance(js_obj["population"], int)
# Make sure jump forward is triggered
-self.assertGreater(
-ret["meta_info"]["completion_tokens"],
-ret["meta_info"]["completion_tokens_wo_jump_forward"],
-)
+# NOTE: This is skipped because overlap scheduler does not support jump forward
+# self.assertGreater(
+# ret["meta_info"]["completion_tokens"],
+# ret["meta_info"]["completion_tokens_wo_jump_forward"],
+# )
def test_json_generate(self):
self.run_decode(json_schema=self.json_schema)

View File

@@ -59,7 +59,7 @@ class TestMoEEvalAccuracyLarge(unittest.TestCase):
)
metrics = run_eval(args)
-self.assertGreater(metrics["score"], 0.41)
+self.assertGreater(metrics["score"], 0.40)
def test_mgsm_en(self):
args = SimpleNamespace(

View File

@@ -12,22 +12,22 @@ from sglang.test.test_utils import run_mmlu_test
class TestOverlapSchedule(unittest.TestCase):
def test_no_radix_attention_chunked_prefill(self):
run_mmlu_test(
-disable_radix_cache=True, chunked_prefill_size=32, enable_overlap=True
+disable_radix_cache=True, chunked_prefill_size=32, disable_overlap=True
)
def test_no_radix_attention_no_chunked_prefill(self):
run_mmlu_test(
-disable_radix_cache=True, chunked_prefill_size=-1, enable_overlap=True
+disable_radix_cache=True, chunked_prefill_size=-1, disable_overlap=True
)
def test_radix_attention_chunked_prefill(self):
run_mmlu_test(
-disable_radix_cache=False, chunked_prefill_size=32, enable_overlap=True
+disable_radix_cache=False, chunked_prefill_size=32, disable_overlap=True
)
def test_radix_attention_no_chunked_prefill(self):
run_mmlu_test(
-disable_radix_cache=False, chunked_prefill_size=-1, enable_overlap=True
+disable_radix_cache=False, chunked_prefill_size=-1, disable_overlap=True
)

View File

@@ -107,7 +107,7 @@ class TestRadixCacheLPM(TestRadixCacheFCFS):
)
-class TestRadixCacheOverlapLPM(TestRadixCacheFCFS):
+class TestRadixCacheNonOverlapLPM(TestRadixCacheFCFS):
@classmethod
def setUpClass(cls):
cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
@@ -117,7 +117,7 @@ class TestRadixCacheOverlapLPM(TestRadixCacheFCFS):
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=[
-"--enable-overlap-schedule",
+"--disable-overlap-schedule",
"--chunked-prefill-size",
"128",
"--max-total-tokens",

View File

@@ -1,3 +1,4 @@
+import time
import unittest
from types import SimpleNamespace
@@ -56,14 +57,14 @@ class TestTorchCompile(unittest.TestCase):
return response.json()
def test_throughput(self):
-import time
# Warmup
res = self.run_decode(16)
max_tokens = 256
tic = time.time()
res = self.run_decode(max_tokens)
tok = time.time()
-print(res["text"])
+print(f"{res=}")
throughput = max_tokens / (tok - tic)
print(f"Throughput: {throughput} tokens/s")
self.assertGreaterEqual(throughput, 152)

View File

@@ -1,3 +1,4 @@
+import time
import unittest
from types import SimpleNamespace
@@ -56,10 +57,10 @@ class TestTorchCompile(unittest.TestCase):
return response.json()
def test_throughput(self):
-import time
# Warmup
res = self.run_decode(16)
max_tokens = 256
tic = time.time()
res = self.run_decode(max_tokens)
tok = time.time()