2x performance improvement for large prefill & Fix workspace conflicts (#579)

2024-07-03 16:14:57 -07:00
parent 96c503eb60
commit 2a754e57b0
6 changed files with 88 additions and 25 deletions
--- a/python/sglang/global_config.py
+++ b/python/sglang/global_config.py
@@ -35,5 +35,8 @@ class GlobalConfig:
        self.new_token_ratio_decay = 0.0001
        self.new_token_ratio_recovery = 0.05

+        # The threshold (number of tokens) to trigger layer-wise CUDA synchronization.
+        # This can improve the speed for large batch sizes during prefill.
+        self.layer_sync_threshold = 8192

 global_config = GlobalConfig()