Enable cuda graph by default (#612)

Authored by Lianmin Zheng on 2024-07-13 05:29:46 -07:00; committed by GitHub
parent 396a69240f
commit 665815969a
10 changed files with 331 additions and 84 deletions

@@ -8,36 +8,40 @@ class GlobalConfig:
         # 2: output final text after every run
         self.verbosity = 0

         # Default backend of the language
         self.default_backend = None

-        # Output configs
-        self.skip_special_tokens_in_output = True
-        self.spaces_between_special_tokens_in_out = True
-
-        # Optimization configs
-        self.eager_fill_image = False
-        self.enable_precache_with_tracing = True
-        self.enable_parallel_encoding = True
-        self.enable_parallel_decoding = True
-
-        # Choices: ["no_adjust", "adjust_cache"]
-        # no_adjust: Do not adjust the position embedding of KV cache.
-        # adjust_cache: Adjust the position embedding of KV cache.
-        self.concate_and_append_mode = "no_adjust"
-
-        # Request dependency time due to network delay
+        # Runtime constants: Request dependency time due to network delay
         self.request_dependency_delay = 0.02
         self.wait_for_new_request_delay = 0.0006

-        # New generation token ratio estimation
+        # Runtime constants: New generation token ratio estimation
         self.base_new_token_ratio = 0.4
         self.base_min_new_token_ratio = 0.2
         self.new_token_ratio_decay = 0.0001
         self.new_token_ratio_recovery = 0.05

-        # The threshold (number of tokens) to trigger layer-wise cuda sync.
+        # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
         # This can improve the speed for large batch sizes during prefill.
         self.layer_sync_threshold = 8192
+
+        # Runtime constants: Flashinfer
+        self.flashinfer_workspace_size = 192 * 1024 * 1024
+
+        # Output tokenization configs
+        self.skip_special_tokens_in_output = True
+        self.spaces_between_special_tokens_in_out = True
+
+        # Interpreter optimization configs
+        self.eager_fill_image = False
+        self.enable_precache_with_tracing = True
+        self.enable_parallel_encoding = True
+        self.enable_parallel_decoding = True
+
+        # Deprecated
+        # Choices: ["no_adjust", "adjust_cache"]
+        # no_adjust: Do not adjust the position embedding of KV cache.
+        # adjust_cache: Adjust the position embedding of KV cache.
+        self.concate_and_append_mode = "no_adjust"

 global_config = GlobalConfig()
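
The file ends by exporting a module-level singleton, so callers tune these knobs by mutating the shared instance before the runtime starts. A minimal usage sketch follows; the sglang.global_config import path and the override values are illustrative assumptions, not part of this commit:

# Minimal sketch: override runtime constants before starting the engine.
# Import path and specific values are assumptions for illustration.
from sglang.global_config import global_config

# Give flashinfer a larger workspace (hypothetical value).
global_config.flashinfer_workspace_size = 256 * 1024 * 1024
# Trigger layer-wise cuda sync for smaller prefill batches (hypothetical value).
global_config.layer_sync_threshold = 4096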
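For context on the commit title: a CUDA graph records a fixed sequence of GPU kernel launches once and replays it later with a single launch, cutting the per-token CPU launch overhead that dominates small-batch decoding. The snippet below is a generic capture/replay sketch using PyTorch's public CUDA graph API; it is not the code added by this commit, and the model, shapes, and buffer names are assumptions.

import torch

# Stand-in model; CUDA graphs need static shapes and fixed tensor addresses.
model = torch.nn.Linear(1024, 1024).cuda().eval()
static_input = torch.zeros(8, 1024, device="cuda")

# Warm up on a side stream so lazy initialization happens before capture.
side = torch.cuda.Stream()
side.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(side), torch.no_grad():
    for _ in range(3):
        model(static_input)
torch.cuda.current_stream().wait_stream(side)

# Capture one forward pass; the kernel sequence is recorded into the graph.
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph), torch.no_grad():
    static_output = model(static_input)

# Replay: refill the captured input buffer in place, then relaunch the
# entire recorded kernel sequence with a single call.
static_input.copy_(torch.randn(8, 1024, device="cuda"))
graph.replay()
torch.cuda.synchronize()
print(static_output.sum().item())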