Support cuda graph in the triton attention backend (#1401)

commit 3efa798116 (parent 2a71be5e25)
Author: Lianmin Zheng
Date:   2024-09-12 00:36:55 -07:00
Committed by: GitHub

6 changed files with 147 additions and 60 deletions

@@ -96,6 +96,16 @@ class TestServingThroughput(unittest.TestCase):
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
             assert res["output_throughput"] > 2400
 
+    def test_default_with_triton_attention_backend(self):
+        res = self.run_test(
+            disable_radix_cache=ServerArgs.disable_radix_cache,
+            attention_backend="triton",
+            chunked_prefill_size=-1,
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            assert res["output_throughput"] > 2400
+
 
 if __name__ == "__main__":
     unittest.main()
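
For context, the new test drives the server with the Triton attention backend and chunked prefill disabled, which is the configuration this commit runs under CUDA graphs. Below is a minimal standalone sketch (not part of this commit) of launching a server with the same settings; the flag names are assumed to mirror the ServerArgs fields referenced in the test, and the model path is a placeholder.

# Minimal sketch: start a server with the configuration the new test
# exercises. Flag names are assumed to mirror the ServerArgs fields
# used in the diff above; the model path is a placeholder.
import subprocess

subprocess.run(
    [
        "python", "-m", "sglang.launch_server",
        "--model-path", "<your-model-path>",  # placeholder, not from this commit
        "--attention-backend", "triton",      # the backend gaining CUDA graph support
        "--chunked-prefill-size", "-1",       # disable chunked prefill, as in the test
    ],
    check=True,
)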