Enable cuda graph by default (#612)

Authored by Lianmin Zheng on 2024-07-13 05:29:46 -07:00; committed by GitHub
parent 396a69240f
commit 665815969a
10 changed files with 331 additions and 84 deletions

@@ -8,36 +8,40 @@ class GlobalConfig:
         # 2: output final text after every run
         self.verbosity = 0

         # Default backend of the language
         self.default_backend = None

-        # Output configs
-        self.skip_special_tokens_in_output = True
-        self.spaces_between_special_tokens_in_out = True
-
-        # Optimization configs
-        self.eager_fill_image = False
-        self.enable_precache_with_tracing = True
-        self.enable_parallel_encoding = True
-        self.enable_parallel_decoding = True
-
-        # Choices: ["no_adjust", "adjust_cache"]
-        # no_adjust: Do not adjust the position embedding of KV cache.
-        # adjust_cache: Adjust the position embedding of KV cache.
-        self.concate_and_append_mode = "no_adjust"
-
-        # Request dependency time due to network delay
+        # Runtime constants: Request dependency time due to network delay
         self.request_dependency_delay = 0.02
         self.wait_for_new_request_delay = 0.0006

-        # New generation token ratio estimation
+        # Runtime constants: New generation token ratio estimation
         self.base_new_token_ratio = 0.4
         self.base_min_new_token_ratio = 0.2
         self.new_token_ratio_decay = 0.0001
         self.new_token_ratio_recovery = 0.05

-        # The threshold (number of tokens) to trigger layer-wise cuda sync.
+        # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
         # This can improve the speed for large batch sizes during prefill.
         self.layer_sync_threshold = 8192
+
+        # Runtime constants: Flashinfer
+        self.flashinfer_workspace_size = 192 * 1024 * 1024
+
+        # Output tokenization configs
+        self.skip_special_tokens_in_output = True
+        self.spaces_between_special_tokens_in_out = True
+
+        # Interpreter optimization configs
+        self.eager_fill_image = False
+        self.enable_precache_with_tracing = True
+        self.enable_parallel_encoding = True
+        self.enable_parallel_decoding = True
+
+        # Deprecated
+        # Choices: ["no_adjust", "adjust_cache"]
+        # no_adjust: Do not adjust the position embedding of KV cache.
+        # adjust_cache: Adjust the position embedding of KV cache.
+        self.concate_and_append_mode = "no_adjust"

 global_config = GlobalConfig()
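
The file ends by exporting a module-level singleton, so callers tune these knobs by mutating the shared instance before the runtime starts. A minimal usage sketch follows; the sglang.global_config import path and the override values are illustrative assumptions, not part of this commit:

# Minimal sketch: override runtime constants before starting the engine.
# Import path and specific values are assumptions for illustration.
from sglang.global_config import global_config

# Give flashinfer a larger workspace (hypothetical value).
global_config.flashinfer_workspace_size = 256 * 1024 * 1024
# Trigger layer-wise cuda sync for smaller prefill batches (hypothetical value).
global_config.layer_sync_threshold = 4096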
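For context on the commit title: a CUDA graph records a fixed sequence of GPU kernel launches once and replays it later with a single launch, cutting the per-token CPU launch overhead that dominates small-batch decoding. The snippet below is a generic capture/replay sketch using PyTorch's public CUDA graph API; it is not the code added by this commit, and the model, shapes, and buffer names are assumptions.

import torch

# Stand-in model; CUDA graphs need static shapes and fixed tensor addresses.
model = torch.nn.Linear(1024, 1024).cuda().eval()
static_input = torch.zeros(8, 1024, device="cuda")

# Warm up on a side stream so lazy initialization happens before capture.
side = torch.cuda.Stream()
side.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(side), torch.no_grad():
    for _ in range(3):
        model(static_input)
torch.cuda.current_stream().wait_stream(side)

# Capture one forward pass; the kernel sequence is recorded into the graph.
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph), torch.no_grad():
    static_output = model(static_input)

# Replay: refill the captured input buffer in place, then relaunch the
# entire recorded kernel sequence with a single call.
static_input.copy_(torch.randn(8, 1024, device="cuda"))
graph.replay()
torch.cuda.synchronize()
print(static_output.sum().item())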