Deprecate --disable-flashinfer and introduce --attention-backend (#1380)

This commit is contained in:
Lianmin Zheng
2024-09-10 17:11:16 -07:00
committed by GitHub
parent 3a6e8b6d78
commit 46094e0c1b
13 changed files with 99 additions and 61 deletions

View File

@@ -14,13 +14,12 @@ from sglang.test.test_utils import (
class TestServingThroughput(unittest.TestCase):
def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size):
def run_test(self, disable_radix_cache, attention_backend, chunked_prefill_size):
# Launch the server
other_args = []
if disable_radix_cache:
other_args.append("--disable-radix-cache")
if disable_flashinfer:
other_args.append("--disable-flashinfer")
other_args.extend(["--attention-backend", attention_backend])
other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
other_args.extend(["--tensor-parallel-size", "2"])
@@ -70,7 +69,7 @@ class TestServingThroughput(unittest.TestCase):
def test_default(self):
res = self.run_test(
disable_radix_cache=ServerArgs.disable_radix_cache,
disable_flashinfer=ServerArgs.disable_flashinfer,
attention_backend=ServerArgs.attention_backend,
chunked_prefill_size=ServerArgs.chunked_prefill_size,
)
@@ -80,7 +79,7 @@ class TestServingThroughput(unittest.TestCase):
def test_default_without_radix_cache(self):
res = self.run_test(
disable_radix_cache=True,
disable_flashinfer=ServerArgs.disable_flashinfer,
attention_backend=ServerArgs.attention_backend,
chunked_prefill_size=ServerArgs.chunked_prefill_size,
)