Co-authored-by: hnyls2002 <hnyls2002@gmail.com> Co-authored-by: Ying Sheng <sqy1415@gmail.com>
51 lines
1.7 KiB
Python
51 lines
1.7 KiB
Python
"""Global configurations"""
|
|
|
|
|
|
class GlobalConfig:
    """Hold the global configuration values as plain instance attributes.

    All settings are assigned their default values in ``__init__``; no
    arguments are accepted and no validation is performed.
    """

    def __init__(self):
        """Populate every configuration attribute with its default value."""
        # Verbosity level
        # 0: do not output anything
        # 2: output final text after every run
        # NOTE(review): level 1 is not described here -- confirm its meaning.
        self.verbosity = 0

        # Default backend of the language
        self.default_backend = None

        # Runtime constants: Request dependency time due to network delay
        # NOTE(review): units are not stated; presumably seconds -- confirm.
        self.request_dependency_delay = 0.02
        self.wait_for_new_request_delay = 0.0006

        # Runtime constants: New generation token ratio estimation
        self.init_new_token_ratio = 0.7
        self.base_min_new_token_ratio = 0.1
        self.new_token_ratio_decay = 0.001
        self.new_token_ratio_recovery = 0.05

        # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
        # This can improve the speed for large batch sizes during prefill.
        self.layer_sync_threshold = 8192

        # Runtime constants: others
        self.num_continue_decode_steps = 10
        self.retract_decode_steps = 20
        # 192 * 1024 * 1024 == 201326592 (presumably bytes, i.e. 192 MiB -- confirm).
        self.flashinfer_workspace_size = 192 * 1024 * 1024

        # Output tokenization configs
        self.skip_special_tokens_in_output = True
        self.spaces_between_special_tokens_in_out = True

        # Interpreter optimization configs
        self.eager_fill_image = False
        self.enable_precache_with_tracing = True
        self.enable_parallel_encoding = True
        self.enable_parallel_decoding = True

        # Deprecated
        # Choices: ["no_adjust", "adjust_cache"]
        # no_adjust: Do not adjust the position embedding of KV cache.
        # adjust_cache: Adjust the position embedding of KV cache.
        self.concate_and_append_mode = "no_adjust"
|
|
|
|
|
|
# Module-level GlobalConfig instance, built with the default values when this module is imported.
global_config = GlobalConfig()
|