2x performance improvement for large prefill & Fix workspace conflicts (#579)

This commit is contained in:
Ying Sheng
2024-07-03 16:14:57 -07:00
committed by GitHub
parent 96c503eb60
commit 2a754e57b0
6 changed files with 88 additions and 25 deletions

View File

@@ -35,5 +35,8 @@ class GlobalConfig:
self.new_token_ratio_decay = 0.0001
self.new_token_ratio_recovery = 0.05
# The threshold (number of tokens) to trigger layer-wise CUDA sync.
# This can improve the speed for large batch sizes during prefill.
self.layer_sync_threshold = 8192
global_config = GlobalConfig()