diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..393c999d2
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,9 @@
+repos:
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.13.2
+    hooks:
+      - id: isort
+  - repo: https://github.com/psf/black
+    rev: 24.4.2
+    hooks:
+      - id: black
diff --git a/benchmark/latency_throughput/bench_serving.py b/benchmark/latency_throughput/bench_serving.py
index 24816d4bd..8566420ed 100644
--- a/benchmark/latency_throughput/bench_serving.py
+++ b/benchmark/latency_throughput/bench_serving.py
@@ -312,8 +312,8 @@ def main(args: argparse.Namespace):
         np.sum([output_len for _, output_len, _ in REQUEST_LATENCY]) / benchmark_time
     )
 
-    #latencies = [round(latency, 2) for _, _, latency in REQUEST_LATENCY]
-    #print(latencies)
+    # latencies = [round(latency, 2) for _, _, latency in REQUEST_LATENCY]
+    # print(latencies)
 
     print(f"Total time: {benchmark_time:.2f} s")
     print(f"Request throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
diff --git a/benchmark/line_retrieval/gen_data.py b/benchmark/line_retrieval/gen_data.py
index 5763e6615..c88ecba49 100644
--- a/benchmark/line_retrieval/gen_data.py
+++ b/benchmark/line_retrieval/gen_data.py
@@ -48,9 +48,9 @@ def generate_lines(random_words, num_lines, redirect_ratio):
         )
         for i in redirect_indices:
             target_idx = np.random.choice(min(i * 2 + 100, num_lines))
-            lines[
-                i
-            ] = f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
+            lines[i] = (
+                f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
+            )
             redirects[i] = target_idx
 
     # Build links and find sources
diff --git a/examples/quick_start/anthropic_example_chat.py b/examples/quick_start/anthropic_example_chat.py
index 03dbb0a45..03d699be7 100644
--- a/examples/quick_start/anthropic_example_chat.py
+++ b/examples/quick_start/anthropic_example_chat.py
@@ -3,6 +3,7 @@ Usage:
 export ANTHROPIC_API_KEY=sk-******
 python3 anthropic_example_chat.py
 """
+
 import sglang as sgl
 
 
@@ -30,7 +31,7 @@ def stream():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
 
     for out in state.text_iter():
@@ -39,13 +40,18 @@ def stream():
 
 
 def batch():
-    states = multi_turn_question.run_batch([
-        {"question_1": "What is the capital of the United States?",
-         "question_2": "List two local attractions."},
-
-        {"question_1": "What is the capital of France?",
-         "question_2": "What is the population of this city?"},
-    ])
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
 
     for s in states:
         print(s.messages())
diff --git a/examples/quick_start/anthropic_example_complete.py b/examples/quick_start/anthropic_example_complete.py
index 35d0e8f62..bce2a61ea 100644
--- a/examples/quick_start/anthropic_example_complete.py
+++ b/examples/quick_start/anthropic_example_complete.py
@@ -9,15 +9,14 @@ import sglang as sgl
 
 @sgl.function
 def few_shot_qa(s, question):
-    s += (
-"""
+    s += """
 \n\nHuman: What is the capital of France?
 \n\nAssistant: Paris
 \n\nHuman: What is the capital of Germany?
 \n\nAssistant: Berlin
 \n\nHuman: What is the capital of Italy?
 \n\nAssistant: Rome
-""")
+"""
     s += "\n\nHuman: " + question + "\n"
     s += "\n\nAssistant:" + sgl.gen("answer", temperature=0)
 
@@ -33,8 +32,8 @@ def single():
 
 def stream():
     state = few_shot_qa.run(
-        question="What is the capital of the United States?",
-        stream=True)
+        question="What is the capital of the United States?", stream=True
+    )
 
     for out in state.text_iter("answer"):
         print(out, end="", flush=True)
@@ -42,10 +41,12 @@ def stream():
 
 
 def batch():
-    states = few_shot_qa.run_batch([
-        {"question": "What is the capital of the United States?"},
-        {"question": "What is the capital of China?"},
-    ])
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
 
     for s in states:
         print(s["answer"])
diff --git a/examples/quick_start/azure_openai_example_chat.py b/examples/quick_start/azure_openai_example_chat.py
index 3c40af8d2..d53f935f4 100644
--- a/examples/quick_start/azure_openai_example_chat.py
+++ b/examples/quick_start/azure_openai_example_chat.py
@@ -3,9 +3,11 @@ Usage:
 export AZURE_OPENAI_API_KEY=sk-******
-python3 openai_example_chat.py
+python3 azure_openai_example_chat.py
 """
-import sglang as sgl
+
 import os
 
+import sglang as sgl
+
 
 @sgl.function
 def multi_turn_question(s, question_1, question_2):
@@ -32,7 +34,7 @@ def stream():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
 
     for out in state.text_iter():
@@ -41,13 +43,18 @@ def stream():
 
 
 def batch():
-    states = multi_turn_question.run_batch([
-        {"question_1": "What is the capital of the United States?",
-         "question_2": "List two local attractions."},
-
-        {"question_1": "What is the capital of France?",
-         "question_2": "What is the population of this city?"},
-    ])
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
 
     for s in states:
         print(s.messages())
diff --git a/examples/quick_start/gemini_example_chat.py b/examples/quick_start/gemini_example_chat.py
index aafa1665c..0ae623109 100644
--- a/examples/quick_start/gemini_example_chat.py
+++ b/examples/quick_start/gemini_example_chat.py
@@ -3,6 +3,7 @@ Usage:
 export GCP_PROJECT_ID=******
 python3 gemini_example_chat.py
 """
+
 import sglang as sgl
 
 
@@ -30,7 +31,7 @@ def stream():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
 
     for out in state.text_iter():
@@ -39,13 +40,18 @@ def stream():
 
 
 def batch():
-    states = multi_turn_question.run_batch([
-        {"question_1": "What is the capital of the United States?",
-         "question_2": "List two local attractions."},
-
-        {"question_1": "What is the capital of France?",
-         "question_2": "What is the population of this city?"},
-    ])
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
 
     for s in states:
         print(s.messages())
diff --git a/examples/quick_start/gemini_example_complete.py b/examples/quick_start/gemini_example_complete.py
index 255a3ad4c..5188bf418 100644
--- a/examples/quick_start/gemini_example_complete.py
+++ b/examples/quick_start/gemini_example_complete.py
@@ -9,15 +9,14 @@ import sglang as sgl
 
 @sgl.function
 def few_shot_qa(s, question):
-    s += (
-"""The following are questions with answers.
+    s += """The following are questions with answers.
 Q: What is the capital of France?
 A: Paris
 Q: What is the capital of Germany?
 A: Berlin
 Q: What is the capital of Italy?
 A: Rome
-""")
+"""
     s += "Q: " + question + "\n"
     s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
 
@@ -33,8 +32,8 @@ def single():
 
 def stream():
     state = few_shot_qa.run(
-        question="What is the capital of the United States?",
-        stream=True)
+        question="What is the capital of the United States?", stream=True
+    )
 
     for out in state.text_iter("answer"):
         print(out, end="", flush=True)
@@ -42,10 +41,12 @@ def stream():
 
 
 def batch():
-    states = few_shot_qa.run_batch([
-        {"question": "What is the capital of the United States?"},
-        {"question": "What is the capital of China?"},
-    ])
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
 
     for s in states:
         print(s["answer"])
diff --git a/examples/quick_start/gemini_example_multimodal_chat.py b/examples/quick_start/gemini_example_multimodal_chat.py
index fa5e6e8b7..afe0c723f 100644
--- a/examples/quick_start/gemini_example_multimodal_chat.py
+++ b/examples/quick_start/gemini_example_multimodal_chat.py
@@ -3,6 +3,7 @@ Usage:
 export GCP_PROJECT_ID=******
 python3 gemini_example_multimodal_chat.py
 """
+
 import sglang as sgl
 
 
@@ -19,7 +20,7 @@ if __name__ == "__main__":
         image_file1="./images/cat.jpeg",
         image_file2="./images/dog.jpeg",
         question="Describe difference of the two images in one sentence.",
-        stream=True
+        stream=True,
     )
 
     for out in state.text_iter("answer"):
diff --git a/examples/quick_start/openai_example_chat.py b/examples/quick_start/openai_example_chat.py
index 66b8536c0..9511e21cf 100644
--- a/examples/quick_start/openai_example_chat.py
+++ b/examples/quick_start/openai_example_chat.py
@@ -3,6 +3,7 @@ Usage:
 export OPENAI_API_KEY=sk-******
 python3 openai_example_chat.py
 """
+
 import sglang as sgl
 
 
@@ -31,7 +32,7 @@ def stream():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
 
     for out in state.text_iter():
@@ -40,13 +41,18 @@ def stream():
 
 
 def batch():
-    states = multi_turn_question.run_batch([
-        {"question_1": "What is the capital of the United States?",
-         "question_2": "List two local attractions."},
-
-        {"question_1": "What is the capital of France?",
-         "question_2": "What is the population of this city?"},
-    ])
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
 
     for s in states:
         print(s.messages())
diff --git a/examples/quick_start/openai_example_complete.py b/examples/quick_start/openai_example_complete.py
index 41b3c9904..d64bcaf1c 100644
--- a/examples/quick_start/openai_example_complete.py
+++ b/examples/quick_start/openai_example_complete.py
@@ -9,15 +9,14 @@ import sglang as sgl
 
 @sgl.function
 def few_shot_qa(s, question):
-    s += (
-"""The following are questions with answers.
+    s += """The following are questions with answers.
 Q: What is the capital of France?
 A: Paris
 Q: What is the capital of Germany?
 A: Berlin
 Q: What is the capital of Italy?
 A: Rome
-""")
+"""
     s += "Q: " + question + "\n"
     s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
 
@@ -33,8 +32,8 @@ def single():
 
 def stream():
     state = few_shot_qa.run(
-        question="What is the capital of the United States?",
-        stream=True)
+        question="What is the capital of the United States?", stream=True
+    )
 
     for out in state.text_iter("answer"):
         print(out, end="", flush=True)
@@ -42,10 +41,12 @@ def stream():
 
 
 def batch():
-    states = few_shot_qa.run_batch([
-        {"question": "What is the capital of the United States?"},
-        {"question": "What is the capital of China?"},
-    ])
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
 
     for s in states:
         print(s["answer"])
diff --git a/examples/quick_start/openrouter_example_chat.py b/examples/quick_start/openrouter_example_chat.py
index 43ac3d4e2..a0b6f15bc 100644
--- a/examples/quick_start/openrouter_example_chat.py
+++ b/examples/quick_start/openrouter_example_chat.py
@@ -3,9 +3,11 @@ Usage:
 export OPENROUTER_API_KEY=sk-******
-python3 together_example_chat.py
+python3 openrouter_example_chat.py
 """
-import sglang as sgl
+
 import os
 
+import sglang as sgl
+
 
 @sgl.function
 def multi_turn_question(s, question_1, question_2):
diff --git a/examples/quick_start/srt_example_chat.py b/examples/quick_start/srt_example_chat.py
index 2f261b095..b1e1658a2 100644
--- a/examples/quick_start/srt_example_chat.py
+++ b/examples/quick_start/srt_example_chat.py
@@ -2,6 +2,7 @@
 Usage:
 python3 srt_example_chat.py
 """
+
 import sglang as sgl
 
 
@@ -29,7 +30,7 @@ def stream():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
 
     for out in state.text_iter():
@@ -38,13 +39,18 @@ def stream():
 
 
 def batch():
-    states = multi_turn_question.run_batch([
-        {"question_1": "What is the capital of the United States?",
-         "question_2": "List two local attractions."},
-
-        {"question_1": "What is the capital of France?",
-         "question_2": "What is the population of this city?"},
-    ])
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
 
     for s in states:
         print(s.messages())
diff --git a/examples/quick_start/srt_example_complete.py b/examples/quick_start/srt_example_complete.py
index 200891670..056245979 100644
--- a/examples/quick_start/srt_example_complete.py
+++ b/examples/quick_start/srt_example_complete.py
@@ -2,20 +2,20 @@
 Usage:
 python3 srt_example_complete.py
 """
+
 import sglang as sgl
 
 
 @sgl.function
 def few_shot_qa(s, question):
-    s += (
-"""The following are questions with answers.
+    s += """The following are questions with answers.
 Q: What is the capital of France?
 A: Paris
 Q: What is the capital of Germany?
 A: Berlin
 Q: What is the capital of Italy?
 A: Rome
-""")
+"""
     s += "Q: " + question + "\n"
     s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
 
@@ -31,8 +31,8 @@ def single():
 
 def stream():
     state = few_shot_qa.run(
-        question="What is the capital of the United States?",
-        stream=True)
+        question="What is the capital of the United States?", stream=True
+    )
 
     for out in state.text_iter("answer"):
         print(out, end="", flush=True)
@@ -40,10 +40,12 @@ def stream():
 
 
 def batch():
-    states = few_shot_qa.run_batch([
-        {"question": "What is the capital of the United States?"},
-        {"question": "What is the capital of China?"},
-    ])
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
 
     for s in states:
         print(s["answer"])
diff --git a/examples/quick_start/srt_example_llava.py b/examples/quick_start/srt_example_llava.py
index 27685b1d2..5d8f75239 100644
--- a/examples/quick_start/srt_example_llava.py
+++ b/examples/quick_start/srt_example_llava.py
@@ -1,6 +1,7 @@
 """
 Usage: python3 srt_example_llava.py
 """
+
 import sglang as sgl
 
 
@@ -12,9 +13,8 @@ def image_qa(s, image_path, question):
 
 def single():
     state = image_qa.run(
-        image_path="images/cat.jpeg",
-        question="What is this?",
-        max_new_tokens=128)
+        image_path="images/cat.jpeg", question="What is this?", max_new_tokens=128
+    )
     print(state["answer"], "\n")
 
 
@@ -23,7 +23,8 @@ def stream():
         image_path="images/cat.jpeg",
         question="What is this?",
         max_new_tokens=64,
-        stream=True)
+        stream=True,
+    )
 
     for out in state.text_iter("answer"):
         print(out, end="", flush=True)
@@ -33,8 +34,8 @@ def stream():
 def batch():
     states = image_qa.run_batch(
         [
-            {"image_path": "images/cat.jpeg", "question":"What is this?"},
-            {"image_path": "images/dog.jpeg", "question":"What is this?"},
+            {"image_path": "images/cat.jpeg", "question": "What is this?"},
+            {"image_path": "images/dog.jpeg", "question": "What is this?"},
         ],
         max_new_tokens=128,
     )
@@ -43,8 +44,10 @@ def batch():
 
 
 if __name__ == "__main__":
-    runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.6-vicuna-7b",
-                          tokenizer_path="llava-hf/llava-1.5-7b-hf")
+    runtime = sgl.Runtime(
+        model_path="liuhaotian/llava-v1.6-vicuna-7b",
+        tokenizer_path="llava-hf/llava-1.5-7b-hf",
+    )
     sgl.set_default_backend(runtime)
     print(f"chat template: {runtime.endpoint.chat_template.name}")
 
diff --git a/examples/quick_start/srt_example_yi_vl.py b/examples/quick_start/srt_example_yi_vl.py
index 359aacac3..66c7d5712 100644
--- a/examples/quick_start/srt_example_yi_vl.py
+++ b/examples/quick_start/srt_example_yi_vl.py
@@ -3,6 +3,7 @@ Usage: python3 srt_example_yi_vl.py
 
 Requirements: transformers==4.38
 """
+
 import sglang as sgl
 
 
@@ -17,7 +18,8 @@ def single():
         image_path="images/cat.jpeg",
         question="What is this?",
         max_new_tokens=64,
-        stop="###")
+        stop="###",
+    )
     print(state["answer"], "\n")
 
 
@@ -27,7 +29,8 @@ def stream():
         question="What is this?",
         max_new_tokens=64,
         stream=True,
-        stop="###")
+        stop="###",
+    )
 
     for out in state.text_iter("answer"):
         print(out, end="", flush=True)
@@ -37,11 +40,11 @@ def stream():
 def batch():
     states = image_qa.run_batch(
         [
-            {"image_path": "images/cat.jpeg", "question":"What is this?"},
-            {"image_path": "images/dog.jpeg", "question":"What is this?"},
+            {"image_path": "images/cat.jpeg", "question": "What is this?"},
+            {"image_path": "images/dog.jpeg", "question": "What is this?"},
         ],
         max_new_tokens=64,
-        stop="###"
+        stop="###",
     )
     for s in states:
         print(s["answer"], "\n")
diff --git a/examples/quick_start/together_example_chat.py b/examples/quick_start/together_example_chat.py
index d2834f44e..2d2059062 100644
--- a/examples/quick_start/together_example_chat.py
+++ b/examples/quick_start/together_example_chat.py
@@ -3,9 +3,11 @@ Usage:
 export TOGETHER_API_KEY=sk-******
 python3 together_example_chat.py
 """
-import sglang as sgl
+
 import os
 
+import sglang as sgl
+
 
 @sgl.function
 def multi_turn_question(s, question_1, question_2):
@@ -32,7 +34,7 @@ def stream():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
 
     for out in state.text_iter():
@@ -41,13 +43,18 @@ def stream():
 
 
 def batch():
-    states = multi_turn_question.run_batch([
-        {"question_1": "What is the capital of the United States?",
-         "question_2": "List two local attractions."},
-
-        {"question_1": "What is the capital of France?",
-         "question_2": "What is the population of this city?"},
-    ])
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
 
     for s in states:
         print(s.messages())
diff --git a/examples/quick_start/together_example_complete.py b/examples/quick_start/together_example_complete.py
index 011c652fd..d9119ed6c 100644
--- a/examples/quick_start/together_example_complete.py
+++ b/examples/quick_start/together_example_complete.py
@@ -4,21 +4,21 @@ export TOGETHER_API_KEY=sk-******
 python3 together_example_complete.py
 """
 
-import sglang as sgl
 import os
 
+import sglang as sgl
+
 
 @sgl.function
 def few_shot_qa(s, question):
-    s += (
-"""The following are questions with answers.
+    s += """The following are questions with answers.
 Q: What is the capital of France?
 A: Paris
 Q: What is the capital of Germany?
 A: Berlin
 Q: What is the capital of Italy?
 A: Rome
-""")
+"""
     s += "Q: " + question + "\n"
     s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
 
@@ -34,8 +34,8 @@ def single():
 
 def stream():
     state = few_shot_qa.run(
-        question="What is the capital of the United States?",
-        stream=True)
+        question="What is the capital of the United States?", stream=True
+    )
 
     for out in state.text_iter("answer"):
         print(out, end="", flush=True)
@@ -43,10 +43,12 @@ def stream():
 
 
 def batch():
-    states = few_shot_qa.run_batch([
-        {"question": "What is the capital of the United States?"},
-        {"question": "What is the capital of China?"},
-    ])
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
 
     for s in states:
         print(s["answer"])
diff --git a/examples/usage/async_io.py b/examples/usage/async_io.py
index 68714812f..d12a3a4d9 100644
--- a/examples/usage/async_io.py
+++ b/examples/usage/async_io.py
@@ -2,7 +2,9 @@
 Usage:
 python3 async_io.py
 """
+
 import asyncio
+
 from sglang import Runtime
 
 
@@ -14,7 +16,10 @@ async def generate(
     tokenizer = engine.get_tokenizer()
 
     messages = [
-        {"role": "system", "content": "You will be given question answer tasks.",},
+        {
+            "role": "system",
+            "content": "You will be given question answer tasks.",
+        },
         {"role": "user", "content": prompt},
     ]
 
@@ -36,5 +41,5 @@ if __name__ == "__main__":
     prompt = "Who is Alan Turing?"
     sampling_params = {"max_new_tokens": 128}
     asyncio.run(generate(runtime, prompt, sampling_params))
-    
+
     runtime.shutdown()
diff --git a/examples/usage/cot_decoding.py b/examples/usage/cot_decoding.py
index d81a813c8..5f9cd68d4 100644
--- a/examples/usage/cot_decoding.py
+++ b/examples/usage/cot_decoding.py
@@ -33,8 +33,7 @@ def cot_decoding(s, question, get_top_k, is_chat_model, verbose):
     )
     logprobs = step_0.get_meta_info("get_top_k")["decode_top_logprobs"][0]
 
-    print("Decoding step 0:",
-          ", ".join(pformat(token[2]) for token in logprobs))
+    print("Decoding step 0:", ", ".join(pformat(token[2]) for token in logprobs))
     for idx, (f, token) in enumerate(zip(forks, logprobs)):
         logprob, token_id, text = token
         f += text
@@ -56,17 +55,9 @@ def cot_decoding(s, question, get_top_k, is_chat_model, verbose):
         )
 
         # calculate probability disparity between the top and secondary tokens
-        x1s = [
-            exp(xt[0][0])
-            for xt in f.get_meta_info("answer")["decode_top_logprobs"]
-        ]
-        x2s = [
-            exp(xt[1][0])
-            for xt in f.get_meta_info("answer")["decode_top_logprobs"]
-        ]
-        tokens = [
-            xt[0][2] for xt in f.get_meta_info("answer")["decode_top_logprobs"]
-        ]
+        x1s = [exp(xt[0][0]) for xt in f.get_meta_info("answer")["decode_top_logprobs"]]
+        x2s = [exp(xt[1][0]) for xt in f.get_meta_info("answer")["decode_top_logprobs"]]
+        tokens = [xt[0][2] for xt in f.get_meta_info("answer")["decode_top_logprobs"]]
         delta = (sum(x1s) - sum(x2s)) / len(x1s)
 
         # extract the answer span (without the '<|end_of_text|>' token)
@@ -79,42 +70,45 @@ def cot_decoding(s, question, get_top_k, is_chat_model, verbose):
             top_logprobs_num=2,
             return_text_in_logprobs=True,
         )
-        answer = answer_forks[idx]['answer_span'].replace('\n', ' ').strip(':')
+        answer = answer_forks[idx]["answer_span"].replace("\n", " ").strip(":")
         print(
             f"{YELLOW}Path #{idx} {pformat(text)}[{exp(logprob):.3f}] (score={delta}, answer={answer}){CLEAR}"
         )
-        generated_text = str(answer_forks[idx])[len("ProgramState("):-1]
+        generated_text = str(answer_forks[idx])[len("ProgramState(") : -1]
         print(f"{BLUE}{pformat(generated_text)}{CLEAR}")
 
         if verbose:
             answer_tokens = [
-                xt[0][2] for xt in answer_forks[idx].get_meta_info(
-                    "answer_span")["decode_top_logprobs"]
+                xt[0][2]
+                for xt in answer_forks[idx].get_meta_info("answer_span")[
+                    "decode_top_logprobs"
+                ]
             ]
             answer_x1s = [
-                exp(xt[0][0]) for xt in answer_forks[idx].get_meta_info(
-                    "answer_span")["decode_top_logprobs"]
+                exp(xt[0][0])
+                for xt in answer_forks[idx].get_meta_info("answer_span")[
+                    "decode_top_logprobs"
+                ]
             ]
             answer_x2s = [
-                exp(xt[1][0]) for xt in answer_forks[idx].get_meta_info(
-                    "answer_span")["decode_top_logprobs"]
+                exp(xt[1][0])
+                for xt in answer_forks[idx].get_meta_info("answer_span")[
+                    "decode_top_logprobs"
+                ]
             ]
 
             for token, x1, x2 in zip(tokens, x1s, x2s):
-                print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})",
-                      end="")
+                print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})", end="")
             print("\n===========")
             for token, x1, x2 in zip(answer_tokens, answer_x1s, answer_x2s):
-                print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})",
-                      end="")
+                print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})", end="")
             print()
 
 
 sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
 
 state = cot_decoding.run(
-    question=
-    r"Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4  weeks?",
+    question=r"Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4  weeks?",
     get_top_k=10,
     is_chat_model=True,
     verbose=False,
diff --git a/examples/usage/json_decode.py b/examples/usage/json_decode.py
index ec2323e68..dc34d3527 100644
--- a/examples/usage/json_decode.py
+++ b/examples/usage/json_decode.py
@@ -3,10 +3,12 @@ Usage:
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
 python json_decode.py
 """
+
 from enum import Enum
 
-import sglang as sgl
 from pydantic import BaseModel
+
+import sglang as sgl
 from sglang.srt.constrained import build_regex_from_object
 
 character_regex = (
diff --git a/examples/usage/llava/http_llama3_llava_test.py b/examples/usage/llava/http_llama3_llava_test.py
index 113adbc8d..813a26af5 100644
--- a/examples/usage/llava/http_llama3_llava_test.py
+++ b/examples/usage/llava/http_llama3_llava_test.py
@@ -14,16 +14,13 @@ Output:
 
 import argparse
 import asyncio
+import copy
 import json
 import time
-import copy
 
 import aiohttp
 import requests
-
-from llava.conversation import (
-    conv_llava_llama_3,
-)
+from llava.conversation import conv_llava_llama_3
 
 
 async def send_request(url, data, delay=0):
diff --git a/examples/usage/llava/http_qwen_llava_test.py b/examples/usage/llava/http_qwen_llava_test.py
index 9ba206415..1c29658c6 100644
--- a/examples/usage/llava/http_qwen_llava_test.py
+++ b/examples/usage/llava/http_qwen_llava_test.py
@@ -14,16 +14,13 @@ Output:
 
 import argparse
 import asyncio
+import copy
 import json
 import time
-import copy
 
 import aiohttp
 import requests
-
-from llava.conversation import (
-    conv_qwen
-)
+from llava.conversation import conv_qwen
 
 
 async def send_request(url, data, delay=0):
diff --git a/examples/usage/llava/srt_llava_next_test.py b/examples/usage/llava/srt_llava_next_test.py
index d077fb2a6..0f9621648 100644
--- a/examples/usage/llava/srt_llava_next_test.py
+++ b/examples/usage/llava/srt_llava_next_test.py
@@ -2,13 +2,15 @@
-Usage: python3 srt_example_llava.py
+Usage: python3 srt_llava_next_test.py
 """
 
-import sglang as sgl
-from sglang.srt.utils import load_image
-from sglang.lang.chat_template import get_chat_template
-
 from PIL import ImageFile
+
+import sglang as sgl
+from sglang.lang.chat_template import get_chat_template
+from sglang.srt.utils import load_image
+
 ImageFile.LOAD_TRUNCATED_IMAGES = True  # Allow loading of truncated images
 
+
 @sgl.function
 def image_qa(s, image, question):
     s += sgl.user(sgl.image(image) + question)
diff --git a/examples/usage/llava_video/srt_example_llava_v.py b/examples/usage/llava_video/srt_example_llava_v.py
index e18a81ebb..df771f41b 100644
--- a/examples/usage/llava_video/srt_example_llava_v.py
+++ b/examples/usage/llava_video/srt_example_llava_v.py
@@ -2,15 +2,17 @@
-Usage: python3 srt_example_llava.py
+Usage: python3 srt_example_llava_v.py
 """
 
-import sglang as sgl
-import os
-import csv
-import time
 import argparse
+import csv
+import os
+import time
+
+import sglang as sgl
+
 
 @sgl.function
 def video_qa(s, num_frames, video_path, question):
-    s += sgl.user(sgl.video(video_path,num_frames) + question)
+    s += sgl.user(sgl.video(video_path, num_frames) + question)
     s += sgl.assistant(sgl.gen("answer"))
 
 
@@ -25,7 +27,6 @@ def single(path, num_frames=16):
     print(state["answer"], "\n")
 
 
-
 def split_into_chunks(lst, num_chunks):
     """Split a list into a specified number of chunks."""
     # Calculate the chunk size using integer division. Note that this may drop some items if not evenly divisible.
@@ -34,7 +35,7 @@ def split_into_chunks(lst, num_chunks):
     if chunk_size == 0:
         chunk_size = len(lst)
     # Use list comprehension to generate chunks. The last chunk will take any remainder if the list size isn't evenly divisible.
-    chunks = [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
+    chunks = [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)]
     # Ensure we have exactly num_chunks chunks, even if some are empty
     chunks.extend([[] for _ in range(num_chunks - len(chunks))])
     return chunks
@@ -42,67 +43,73 @@ def split_into_chunks(lst, num_chunks):
 
 def save_batch_results(batch_video_files, states, cur_chunk, batch_idx, save_dir):
     csv_filename = f"{save_dir}/chunk_{cur_chunk}_batch_{batch_idx}.csv"
-    with open(csv_filename, 'w', newline='') as csvfile:
+    with open(csv_filename, "w", newline="") as csvfile:
         writer = csv.writer(csvfile)
-        writer.writerow(['video_name', 'answer'])
+        writer.writerow(["video_name", "answer"])
         for video_path, state in zip(batch_video_files, states):
             video_name = os.path.basename(video_path)
             writer.writerow([video_name, state["answer"]])
 
+
 def compile_and_cleanup_final_results(cur_chunk, num_batches, save_dir):
     final_csv_filename = f"{save_dir}/final_results_chunk_{cur_chunk}.csv"
-    with open(final_csv_filename, 'w', newline='') as final_csvfile:
+    with open(final_csv_filename, "w", newline="") as final_csvfile:
         writer = csv.writer(final_csvfile)
-        writer.writerow(['video_name', 'answer'])
+        writer.writerow(["video_name", "answer"])
         for batch_idx in range(num_batches):
             batch_csv_filename = f"{save_dir}/chunk_{cur_chunk}_batch_{batch_idx}.csv"
-            with open(batch_csv_filename, 'r') as batch_csvfile:
+            with open(batch_csv_filename, "r") as batch_csvfile:
                 reader = csv.reader(batch_csvfile)
                 next(reader)  # Skip header row
                 for row in reader:
                     writer.writerow(row)
             os.remove(batch_csv_filename)
 
+
 def find_video_files(video_dir):
     # Check if the video_dir is actually a file
     if os.path.isfile(video_dir):
         # If it's a file, return it as a single-element list
         return [video_dir]
-    
+
     # Original logic to find video files in a directory
     video_files = []
     for root, dirs, files in os.walk(video_dir):
         for file in files:
-            if file.endswith(('.mp4', '.avi', '.mov')):
+            if file.endswith((".mp4", ".avi", ".mov")):
                 video_files.append(os.path.join(root, file))
     return video_files
 
+
 def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size=64):
     video_files = find_video_files(video_dir)
     chunked_video_files = split_into_chunks(video_files, num_chunks)[cur_chunk]
     num_batches = 0
 
     for i in range(0, len(chunked_video_files), batch_size):
-        batch_video_files = chunked_video_files[i:i + batch_size]
+        batch_video_files = chunked_video_files[i : i + batch_size]
         print(f"Processing batch of {len(batch_video_files)} video(s)...")
 
         if not batch_video_files:
             print("No video files found in the specified directory.")
             return
-        
+
         batch_input = [
-            {   
+            {
                 "num_frames": num_frames,
                 "video_path": video_path,
                 "question": "Please provide a detailed description of the video, focusing on the main subjects, their actions, the background scenes.",
-            } for video_path in batch_video_files
+            }
+            for video_path in batch_video_files
         ]
 
         start_time = time.time()
         states = video_qa.run_batch(batch_input, max_new_tokens=512, temperature=0.2)
         total_time = time.time() - start_time
         average_time = total_time / len(batch_video_files)
-        print(f"Number of videos in batch: {len(batch_video_files)}. Average processing time per video: {average_time:.2f} seconds. Total time for this batch: {total_time:.2f} seconds")
+        print(
+            f"Number of videos in batch: {len(batch_video_files)}. Average processing time per video: {average_time:.2f} seconds. Total time for this batch: {total_time:.2f} seconds"
+        )
 
         save_batch_results(batch_video_files, states, cur_chunk, num_batches, save_dir)
         num_batches += 1
@@ -113,16 +120,47 @@ def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size=
 if __name__ == "__main__":
 
     # Create the parser
-    parser = argparse.ArgumentParser(description='Run video processing with specified port.')
+    parser = argparse.ArgumentParser(
+        description="Run video processing with specified port."
+    )
 
     # Add an argument for the port
-    parser.add_argument('--port', type=int, default=30000, help='The master port for distributed serving.')
-    parser.add_argument('--chunk-idx', type=int, default=0, help='The index of the chunk to process.')
-    parser.add_argument('--num-chunks', type=int, default=8, help='The number of chunks to process.')
-    parser.add_argument('--save-dir', type=str, default="./work_dirs/llava_video", help='The directory to save the processed video files.')
-    parser.add_argument('--video-dir', type=str, default="./videos/Q98Z4OTh8RwmDonc.mp4", help='The directory or path for the processed video files.')
-    parser.add_argument('--model-path', type=str, default="lmms-lab/LLaVA-NeXT-Video-7B", help='The model path for the video processing.')
-    parser.add_argument('--num-frames', type=int, default=16, help='The number of frames to process in each video.' )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=30000,
+        help="The master port for distributed serving.",
+    )
+    parser.add_argument(
+        "--chunk-idx", type=int, default=0, help="The index of the chunk to process."
+    )
+    parser.add_argument(
+        "--num-chunks", type=int, default=8, help="The number of chunks to process."
+    )
+    parser.add_argument(
+        "--save-dir",
+        type=str,
+        default="./work_dirs/llava_video",
+        help="The directory to save the processed video files.",
+    )
+    parser.add_argument(
+        "--video-dir",
+        type=str,
+        default="./videos/Q98Z4OTh8RwmDonc.mp4",
+        help="The directory or path for the processed video files.",
+    )
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        default="lmms-lab/LLaVA-NeXT-Video-7B",
+        help="The model path for the video processing.",
+    )
+    parser.add_argument(
+        "--num-frames",
+        type=int,
+        default=16,
+        help="The number of frames to process in each video.",
+    )
     parser.add_argument("--mm_spatial_pool_stride", type=int, default=2)
 
     # Parse the arguments
@@ -154,7 +192,6 @@ if __name__ == "__main__":
     if "34b" in args.model_path.lower():
         model_overide_args["image_token_index"] = 64002
 
-
     if args.num_frames == 32:
         model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
         model_overide_args["max_sequence_length"] = 4096 * 2
@@ -162,22 +199,22 @@ if __name__ == "__main__":
     elif args.num_frames < 32:
         pass
     else:
-        print("The maximum number of frames to process is 32. Please specify a valid number of frames.")
+        print(
+            "The maximum number of frames to process is 32. Please specify a valid number of frames."
+        )
         exit()
 
-
     runtime = sgl.Runtime(
-        model_path=args.model_path, #"liuhaotian/llava-v1.6-vicuna-7b",
+        model_path=args.model_path,  # "liuhaotian/llava-v1.6-vicuna-7b",
         tokenizer_path=tokenizer_path,
         port=cur_port,
-        additional_ports=[cur_port+1,cur_port+2,cur_port+3,cur_port+4],
+        additional_ports=[cur_port + 1, cur_port + 2, cur_port + 3, cur_port + 4],
         model_overide_args=model_overide_args,
-        tp_size=1
+        tp_size=1,
     )
     sgl.set_default_backend(runtime)
     print(f"chat template: {runtime.endpoint.chat_template.name}")
 
-
     # Run a single request
     # try:
     print("\n========== single ==========\n")
@@ -185,24 +222,29 @@ if __name__ == "__main__":
     if os.path.isfile(root):
         video_files = [root]
     else:
-        video_files = [os.path.join(root, f) for f in os.listdir(root) if f.endswith(('.mp4', '.avi', '.mov'))]  # Add more extensions if needed
+        video_files = [
+            os.path.join(root, f)
+            for f in os.listdir(root)
+            if f.endswith((".mp4", ".avi", ".mov"))
+        ]  # Add more extensions if needed
     start_time = time.time()  # Start time for processing a single video
     for cur_video in video_files[:1]:
         print(cur_video)
         single(cur_video, num_frames)
     end_time = time.time()  # End time for processing a single video
     total_time = end_time - start_time
-    average_time = total_time / len(video_files)  # Calculate the average processing time
+    average_time = total_time / len(
+        video_files
+    )  # Calculate the average processing time
     print(f"Average processing time per video: {average_time:.2f} seconds")
     runtime.shutdown()
     # except Exception as e:
     #     print(e)
     runtime.shutdown()
 
-
     # # # Run a batch of requests
     # print("\n========== batch ==========\n")
     # if not os.path.exists(args.save_dir):
     #     os.makedirs(args.save_dir)
     # batch(args.video_dir,args.save_dir,cur_chunk, num_chunks, num_frames, num_chunks)
-    # runtime.shutdown()
\ No newline at end of file
+    # runtime.shutdown()
diff --git a/examples/usage/openai_chat_speculative.py b/examples/usage/openai_chat_speculative.py
index 94eb43276..a9c5f5afb 100644
--- a/examples/usage/openai_chat_speculative.py
+++ b/examples/usage/openai_chat_speculative.py
@@ -15,23 +15,40 @@ incorrect:
 export OPENAI_API_KEY=sk-******
 python3 openai_chat_speculative.py
 """
+
 import sglang as sgl
-from sglang import function, set_default_backend, OpenAI
+from sglang import OpenAI, function, set_default_backend
 
 
 @function(num_api_spec_tokens=256)
 def gen_character_spec(s):
     s += sgl.system("You are a helpful assistant.")
     s += sgl.user("Construct a character within the following format:")
-    s += sgl.assistant("Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n")
+    s += sgl.assistant(
+        "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
+    )
     s += sgl.user("Please generate new Name, Birthday and Job.\n")
-    s += sgl.assistant("Name:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n") + "\nJob:" + sgl.gen("job", stop="\n"))
+    s += sgl.assistant(
+        "Name:"
+        + sgl.gen("name", stop="\n")
+        + "\nBirthday:"
+        + sgl.gen("birthday", stop="\n")
+        + "\nJob:"
+        + sgl.gen("job", stop="\n")
+    )
 
 
 @function(num_api_spec_tokens=256)
 def gen_character_spec_no_few_shot(s):
     s += sgl.user("Construct a character. For each field stop with a newline\n")
-    s += sgl.assistant("Name:" + sgl.gen("name", stop="\n") + "\nAge:" + sgl.gen("age", stop="\n") + "\nJob:" + sgl.gen("job", stop="\n"))
+    s += sgl.assistant(
+        "Name:"
+        + sgl.gen("name", stop="\n")
+        + "\nAge:"
+        + sgl.gen("age", stop="\n")
+        + "\nJob:"
+        + sgl.gen("job", stop="\n")
+    )
 
 
 @function
@@ -45,10 +62,19 @@ def gen_character_normal(s):
 def multi_turn_question(s, question_1, question_2):
     s += sgl.system("You are a helpful assistant.")
     s += sgl.user("Answer questions in the following format:")
-    s += sgl.user("Question 1: What is the capital of France?\nQuestion 2: What is the population of this city?\n")
-    s += sgl.assistant("Answer 1: The capital of France is Paris.\nAnswer 2: The population of Paris in 2024 is estimated to be around 2.1 million for the city proper.\n")
-    s += sgl.user("Question 1: " + question_1+"\nQuestion 2: " + question_2)
-    s += sgl.assistant("Answer 1: " + sgl.gen("answer_1", stop="\n") + "\nAnswer 2: " + sgl.gen("answer_2", stop="\n"))
+    s += sgl.user(
+        "Question 1: What is the capital of France?\nQuestion 2: What is the population of this city?\n"
+    )
+    s += sgl.assistant(
+        "Answer 1: The capital of France is Paris.\nAnswer 2: The population of Paris in 2024 is estimated to be around 2.1 million for the city proper.\n"
+    )
+    s += sgl.user("Question 1: " + question_1 + "\nQuestion 2: " + question_2)
+    s += sgl.assistant(
+        "Answer 1: "
+        + sgl.gen("answer_1", stop="\n")
+        + "\nAnswer 2: "
+        + sgl.gen("answer_2", stop="\n")
+    )
 
 
 def test_spec_single_turn():
@@ -97,7 +123,7 @@ def test_spec_multi_turn_stream():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
 
     for out in state.text_iter():
@@ -126,4 +152,4 @@ if __name__ == "__main__":
 
     print("\n========== test spec multi turn stream ==========\n")
     # expect error in stream_executor: stream is not supported...
-    test_spec_multi_turn_stream()
\ No newline at end of file
+    test_spec_multi_turn_stream()
diff --git a/examples/usage/openai_speculative.py b/examples/usage/openai_speculative.py
index c64694da6..4389cb059 100644
--- a/examples/usage/openai_speculative.py
+++ b/examples/usage/openai_speculative.py
@@ -2,7 +2,8 @@
 Usage:
 python3 openai_speculative.py
 """
-from sglang import function, gen, set_default_backend, OpenAI
+
+from sglang import OpenAI, function, gen, set_default_backend
 
 
 @function(num_api_spec_tokens=64)
@@ -35,7 +36,11 @@ if __name__ == "__main__":
     backend = OpenAI("gpt-3.5-turbo-instruct")
     set_default_backend(backend)
 
-    for function in [gen_character_spec, gen_character_no_spec, gen_character_spec_no_few_shot]:
+    for function in [
+        gen_character_spec,
+        gen_character_no_spec,
+        gen_character_spec_no_few_shot,
+    ]:
         backend.token_usage.reset()
 
         print(f"function: {function.func.__name__}")
@@ -46,4 +51,4 @@ if __name__ == "__main__":
         print("...birthday:", state["birthday"])
         print("...job:", state["job"])
         print(backend.token_usage)
-        print()
\ No newline at end of file
+        print()
diff --git a/examples/usage/parallel_sample.py b/examples/usage/parallel_sample.py
index 288b48ac0..0f3cf1700 100644
--- a/examples/usage/parallel_sample.py
+++ b/examples/usage/parallel_sample.py
@@ -2,6 +2,7 @@
 Usage:
 python3 parallel_sample.py
 """
+
 import sglang as sgl
 
 
@@ -12,7 +13,6 @@ def parallel_sample(s, question, n):
         "Reasoning: I need to use a calculator.\n"
         "Tool: calculator\n"
         "Answer: 6\n"
-
         "Question: Compute 3 + 2 + 2\n"
         "Reasoning: I will try a calculator.\n"
         "Tool: calculator\n"
@@ -27,13 +27,9 @@ def parallel_sample(s, question, n):
 
 
 sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
-#sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
+# sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
 
-state = parallel_sample.run(
-    question="Compute 5 + 2 + 4.",
-    n=5,
-    temperature=1.0
-)
+state = parallel_sample.run(question="Compute 5 + 2 + 4.", n=5, temperature=1.0)
 
 for i in range(5):
     obj = {
diff --git a/examples/usage/readme_examples.py b/examples/usage/readme_examples.py
index 8789e1b13..7269ef148 100644
--- a/examples/usage/readme_examples.py
+++ b/examples/usage/readme_examples.py
@@ -3,13 +3,18 @@ Usage:
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
 python readme_examples.py
 """
+
 import sglang as sgl
 
 
 @sgl.function
 def tool_use(s, question):
     s += "To answer this question: " + question + ". "
-    s += "I need to use a " + sgl.gen("tool", choices=["calculator", "search engine"]) + ". "
+    s += (
+        "I need to use a "
+        + sgl.gen("tool", choices=["calculator", "search engine"])
+        + ". "
+    )
 
     if s["tool"] == "calculator":
         s += "The math expression is" + sgl.gen("expression")
@@ -75,7 +80,7 @@ def driver_batching():
             {"question": "What is the capital of France?"},
             {"question": "What is the capital of Japan?"},
         ],
-        progress_bar=True
+        progress_bar=True,
     )
 
     for s in states:
@@ -85,9 +90,7 @@ def driver_batching():
 
 def driver_stream():
     state = text_qa.run(
-        question="What is the capital of France?",
-        temperature=0.1,
-        stream=True
+        question="What is the capital of France?", temperature=0.1, stream=True
     )
 
     for out in state.text_iter():
@@ -96,7 +99,7 @@ def driver_stream():
 
 
 if __name__ == "__main__":
-    #sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
+    # sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
     sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
 
     driver_tool_use()
diff --git a/examples/usage/streaming.py b/examples/usage/streaming.py
index 20feaafbc..506ee35c6 100644
--- a/examples/usage/streaming.py
+++ b/examples/usage/streaming.py
@@ -2,7 +2,9 @@
 Usage:
 python3 streaming.py
 """
+
 import asyncio
+
 import sglang as sgl
 
 
@@ -22,7 +24,7 @@ def stream_a_variable():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
 
     for out in state.text_iter(var_name="answer_2"):
@@ -34,7 +36,7 @@ async def async_stream():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
 
     async for out in state.text_async_iter(var_name="answer_2"):
diff --git a/examples/usage/triton/models/character_generation/1/model.py b/examples/usage/triton/models/character_generation/1/model.py
index e76992f95..5550e9398 100644
--- a/examples/usage/triton/models/character_generation/1/model.py
+++ b/examples/usage/triton/models/character_generation/1/model.py
@@ -1,45 +1,55 @@
-import triton_python_backend_utils as pb_utils
 import numpy
+import triton_python_backend_utils as pb_utils
+from pydantic import BaseModel
+
 import sglang as sgl
 from sglang import function, set_default_backend
 from sglang.srt.constrained import build_regex_from_object
 
-from pydantic import BaseModel
-
 sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
 
+
 class Character(BaseModel):
     name: str
     eye_color: str
     house: str
 
+
 @function
 def character_gen(s, name):
     s += (
         name
         + " is a character in Harry Potter. Please fill in the following information about this character.\n"
     )
-    s += sgl.gen("json_output", max_tokens=256, regex=build_regex_from_object(Character))
+    s += sgl.gen(
+        "json_output", max_tokens=256, regex=build_regex_from_object(Character)
+    )
 
 
 class TritonPythonModel:
     def initialize(self, args):
         print("Initialized.")
+
     def execute(self, requests):
         responses = []
         for request in requests:
             tensor_in = pb_utils.get_input_tensor_by_name(request, "INPUT_TEXT")
             if tensor_in is None:
                 return pb_utils.InferenceResponse(output_tensors=[])
-            
-            input_list_names = [i.decode('utf-8') if isinstance(i, bytes) else i for i in tensor_in.as_numpy().tolist()]
 
-            input_list_dicts = [{"name":i} for i in input_list_names]
+            input_list_names = [
+                i.decode("utf-8") if isinstance(i, bytes) else i
+                for i in tensor_in.as_numpy().tolist()
+            ]
+
+            input_list_dicts = [{"name": i} for i in input_list_names]
 
             states = character_gen.run_batch(input_list_dicts)
             character_strs = [state.text() for state in states]
 
-            tensor_out = pb_utils.Tensor("OUTPUT_TEXT", numpy.array(character_strs, dtype=object))
+            tensor_out = pb_utils.Tensor(
+                "OUTPUT_TEXT", numpy.array(character_strs, dtype=object)
+            )
 
-            responses.append(pb_utils.InferenceResponse(output_tensors = [tensor_out]))
-        return responses
\ No newline at end of file
+            responses.append(pb_utils.InferenceResponse(output_tensors=[tensor_out]))
+        return responses
diff --git a/playground/load_tokenizer.py b/playground/load_tokenizer.py
index 39fa18424..94cf34bc7 100644
--- a/playground/load_tokenizer.py
+++ b/playground/load_tokenizer.py
@@ -3,11 +3,12 @@ import code
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
 
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--name", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct")
+    parser.add_argument(
+        "--name", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct"
+    )
     args = parser.parse_args()
 
     t = get_tokenizer(args.name)
-    code.interact(local=locals())
\ No newline at end of file
+    code.interact(local=locals())
diff --git a/python/sglang/srt/managers/controller/cuda_graph_runner.py b/python/sglang/srt/managers/controller/cuda_graph_runner.py
index 7218936be..1be3cfb77 100644
--- a/python/sglang/srt/managers/controller/cuda_graph_runner.py
+++ b/python/sglang/srt/managers/controller/cuda_graph_runner.py
@@ -183,14 +183,18 @@ class CudaGraphRunner:
         else:
             output = LogitProcessorOutput(
                 next_token_logits=output.next_token_logits[:raw_bs],
-                next_token_logprobs=output.next_token_logprobs[:raw_bs]
-                if output.next_token_logprobs is not None
-                else None,
+                next_token_logprobs=(
+                    output.next_token_logprobs[:raw_bs]
+                    if output.next_token_logprobs is not None
+                    else None
+                ),
                 normalized_prompt_logprobs=None,
                 prefill_token_logprobs=None,
                 prefill_top_logprobs=None,
-                decode_top_logprobs=output.decode_top_logprobs[:raw_bs]
-                if output.decode_top_logprobs is not None
-                else None,
+                decode_top_logprobs=(
+                    output.decode_top_logprobs[:raw_bs]
+                    if output.decode_top_logprobs is not None
+                    else None
+                ),
             )
         return output
diff --git a/python/sglang/srt/managers/controller/manager_single.py b/python/sglang/srt/managers/controller/manager_single.py
index d4186d484..37af98e9a 100644
--- a/python/sglang/srt/managers/controller/manager_single.py
+++ b/python/sglang/srt/managers/controller/manager_single.py
@@ -1,7 +1,7 @@
 """A controller that manages a group of tensor parallel workers."""
 
-import multiprocessing
 import logging
+import multiprocessing
 import os
 import pickle
 
@@ -11,11 +11,10 @@ import zmq
 import zmq.asyncio
 
 from sglang.srt.managers.controller.tp_worker import ModelTpServer
-from sglang.srt.server_args import PortArgs, ServerArgs, ModelPortArgs
+from sglang.srt.server_args import ModelPortArgs, PortArgs, ServerArgs
 from sglang.srt.utils import kill_parent_process
 from sglang.utils import get_exception_traceback
 
-
 logger = logging.getLogger("srt.controller")
 
 
@@ -45,14 +44,16 @@ def run_tp_server(
         raise
 
 
-def launch_tp_servers(gpu_ids, tp_rank_range, server_args,
-                      model_port_args, model_overide_args):
+def launch_tp_servers(
+    gpu_ids, tp_rank_range, server_args, model_port_args, model_overide_args
+):
     """Launch multiple tp servers."""
     procs = []
     for i in tp_rank_range:
-        proc = multiprocessing.Process(target=run_tp_server, args=(
-            gpu_ids[i], i, server_args, model_port_args, model_overide_args
-        ))
+        proc = multiprocessing.Process(
+            target=run_tp_server,
+            args=(gpu_ids[i], i, server_args, model_port_args, model_overide_args),
+        )
         proc.start()
         procs.append(proc)
 
@@ -93,7 +94,9 @@ def broadcast_recv_input(data, rank, dist_group):
 class ControllerSingle:
     """A controller that manages a group of tensor parallel workers."""
 
-    def __init__(self, server_args: ServerArgs, port_args: PortArgs, model_overide_args: dict):
+    def __init__(
+        self, server_args: ServerArgs, port_args: PortArgs, model_overide_args: dict
+    ):
         # Parse args
         self.server_args = server_args
         self.tp_procs = []
@@ -116,8 +119,12 @@ class ControllerSingle:
         if tp_size_local > 1:
             tp_rank_range = range(1, tp_size_local)
             self.tp_procs = launch_tp_servers(
-                gpu_ids, tp_rank_range, server_args,
-                port_args.model_port_args[0], model_overide_args)
+                gpu_ids,
+                tp_rank_range,
+                server_args,
+                port_args.model_port_args[0],
+                model_overide_args,
+            )
 
         # Launch tp rank 0
         self.tp_server = ModelTpServer(
diff --git a/python/sglang/srt/managers/controller/model_runner.py b/python/sglang/srt/managers/controller/model_runner.py
index 80c40e4f5..ae1f555a1 100644
--- a/python/sglang/srt/managers/controller/model_runner.py
+++ b/python/sglang/srt/managers/controller/model_runner.py
@@ -11,7 +11,11 @@ import torch
 import torch.nn as nn
 from vllm.config import DeviceConfig, LoadConfig
 from vllm.config import ModelConfig as VllmModelConfig
-from vllm.distributed import init_distributed_environment, initialize_model_parallel, get_tp_group
+from vllm.distributed import (
+    get_tp_group,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models import ModelRegistry
 
@@ -89,9 +93,9 @@ class ModelRunner:
 
         # Set some global args
         global_server_args_dict["disable_flashinfer"] = server_args.disable_flashinfer
-        global_server_args_dict[
-            "attention_reduce_in_fp32"
-        ] = server_args.attention_reduce_in_fp32
+        global_server_args_dict["attention_reduce_in_fp32"] = (
+            server_args.attention_reduce_in_fp32
+        )
 
         # Load the model and create memory pool
         self.load_model()
diff --git a/python/sglang/srt/managers/controller/tp_worker.py b/python/sglang/srt/managers/controller/tp_worker.py
index c9cd0f3f1..897cab140 100644
--- a/python/sglang/srt/managers/controller/tp_worker.py
+++ b/python/sglang/srt/managers/controller/tp_worker.py
@@ -241,12 +241,9 @@ class ModelTpServer:
 
     def print_stats(self):
         num_used = self.max_total_num_tokens - (
-            self.token_to_kv_pool.available_size()
-            + self.tree_cache.evictable_size()
-        )
-        throughput = self.num_generated_tokens / (
-            time.time() - self.last_stats_tic
+            self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
         )
+        throughput = self.num_generated_tokens / (time.time() - self.last_stats_tic)
         self.num_generated_tokens = 0
         self.last_stats_tic = time.time()
         logger.info(
@@ -260,8 +257,7 @@ class ModelTpServer:
 
     def check_memory(self):
         available_size = (
-            self.token_to_kv_pool.available_size()
-            + self.tree_cache.evictable_size()
+            self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
         )
         if available_size != self.max_total_num_tokens:
             warnings.warn(
@@ -348,7 +344,8 @@ class ModelTpServer:
         if self.running_batch:
             available_size -= sum(
                 [
-                    (r.sampling_params.max_new_tokens - len(r.output_ids)) * self.new_token_ratio
+                    (r.sampling_params.max_new_tokens - len(r.output_ids))
+                    * self.new_token_ratio
                     for r in self.running_batch.reqs
                 ]
             )
@@ -370,7 +367,9 @@ class ModelTpServer:
                     req.image_offset += 1
 
             if (
-                req.extend_input_len + req.sampling_params.max_new_tokens + new_batch_total_tokens
+                req.extend_input_len
+                + req.sampling_params.max_new_tokens
+                + new_batch_total_tokens
                 < available_size
                 and (
                     req.extend_input_len + new_batch_input_tokens
@@ -382,7 +381,9 @@ class ModelTpServer:
                 available_size += delta
 
                 if not (
-                    req.extend_input_len + req.sampling_params.max_new_tokens + new_batch_total_tokens
+                    req.extend_input_len
+                    + req.sampling_params.max_new_tokens
+                    + new_batch_total_tokens
                     < available_size
                 ):
                     # Undo locking
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index bd5012904..75af8e62c 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -335,15 +335,16 @@ class TokenizerManager:
             )
 
             if top_logprobs_num > 0:
-                ret["meta_info"][
-                    "prefill_top_logprobs"
-                ] = self.detokenize_top_logprobs_tokens(
-                    ret["meta_info"]["prefill_top_logprobs"], return_text_in_logprobs
+                ret["meta_info"]["prefill_top_logprobs"] = (
+                    self.detokenize_top_logprobs_tokens(
+                        ret["meta_info"]["prefill_top_logprobs"],
+                        return_text_in_logprobs,
+                    )
                 )
-                ret["meta_info"][
-                    "decode_top_logprobs"
-                ] = self.detokenize_top_logprobs_tokens(
-                    ret["meta_info"]["decode_top_logprobs"], return_text_in_logprobs
+                ret["meta_info"]["decode_top_logprobs"] = (
+                    self.detokenize_top_logprobs_tokens(
+                        ret["meta_info"]["decode_top_logprobs"], return_text_in_logprobs
+                    )
                 )
         return ret
 
diff --git a/python/sglang/srt/memory_pool.py b/python/sglang/srt/memory_pool.py
index c0a384ccc..28fc512f6 100644
--- a/python/sglang/srt/memory_pool.py
+++ b/python/sglang/srt/memory_pool.py
@@ -21,7 +21,9 @@ class ReqToTokenPool:
         if need_size > self.can_use_mem_size:
             return None
 
-        select_index = torch.nonzero(self.mem_state).squeeze(1)[:need_size].to(torch.int32)
+        select_index = (
+            torch.nonzero(self.mem_state).squeeze(1)[:need_size].to(torch.int32)
+        )
         self.mem_state[select_index] = False
         self.can_use_mem_size -= need_size
 
@@ -79,7 +81,9 @@ class TokenToKVPool:
 
         addition_size = need_size - buffer_len
         alloc_size = max(addition_size, self.prefetch_chunk_size)
-        select_index = torch.nonzero(self.mem_state).squeeze(1)[:alloc_size].to(torch.int32)
+        select_index = (
+            torch.nonzero(self.mem_state).squeeze(1)[:alloc_size].to(torch.int32)
+        )
 
         if select_index.shape[0] < addition_size:
             return None
diff --git a/python/sglang/srt/models/llama2.py b/python/sglang/srt/models/llama2.py
index eca15c7cb..e6b3c1d19 100644
--- a/python/sglang/srt/models/llama2.py
+++ b/python/sglang/srt/models/llama2.py
@@ -163,9 +163,9 @@ class LlamaDecoderLayer(nn.Module):
         if rope_scaling is not None and getattr(
             config, "original_max_position_embeddings", None
         ):
-            rope_scaling[
-                "original_max_position_embeddings"
-            ] = config.original_max_position_embeddings
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
         rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = LlamaAttention(
diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py
index 05152c271..8e713cff0 100644
--- a/python/sglang/srt/models/qwen2.py
+++ b/python/sglang/srt/models/qwen2.py
@@ -313,7 +313,10 @@ class Qwen2ForCausalLM(nn.Module):
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
-                if self.config.tie_word_embeddings and name=="model.embed_tokens.weight":
+                if (
+                    self.config.tie_word_embeddings
+                    and name == "model.embed_tokens.weight"
+                ):
                     weight_loader(params_dict["lm_head.weight"], loaded_weight)
 
 
diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py
index 6e90babcc..ca1f27a63 100644
--- a/python/sglang/srt/models/qwen2_moe.py
+++ b/python/sglang/srt/models/qwen2_moe.py
@@ -401,9 +401,11 @@ class Qwen2MoeForCausalLM(nn.Module):
             # These are the weights for the experts
             # (param_name, weight_name, expert_id, shard_id)
             (
-                "experts.w13_weight"
-                if weight_name in ["gate_proj", "up_proj"]
-                else "experts.w2_weight",
+                (
+                    "experts.w13_weight"
+                    if weight_name in ["gate_proj", "up_proj"]
+                    else "experts.w2_weight"
+                ),
                 f"experts.{expert_id}.{weight_name}.weight",
                 expert_id,
                 shard_id,
@@ -418,7 +420,7 @@ class Qwen2MoeForCausalLM(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
index 30b0e7eec..57862c42c 100644
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -32,8 +32,8 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.controller.manager_multi import (
     start_controller_process as start_controller_process_multi,
 )
+from sglang.srt.managers.controller.manager_single import launch_tp_servers
 from sglang.srt.managers.controller.manager_single import (
-    launch_tp_servers,
     start_controller_process as start_controller_process_single,
 )
 from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
@@ -198,11 +198,22 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg
 
         if server_args.node_rank != 0:
             tp_size_local = server_args.tp_size // server_args.nnodes
-            gpu_ids = [i for _ in range(server_args.nnodes) for i in range(tp_size_local)]
-            tp_rank_range = list(range(server_args.node_rank * tp_size_local,
-                                  (server_args.node_rank + 1) * tp_size_local))
-            procs = launch_tp_servers(gpu_ids, tp_rank_range, server_args,
-                                      port_args.model_port_args[0], model_overide_args)
+            gpu_ids = [
+                i for _ in range(server_args.nnodes) for i in range(tp_size_local)
+            ]
+            tp_rank_range = list(
+                range(
+                    server_args.node_rank * tp_size_local,
+                    (server_args.node_rank + 1) * tp_size_local,
+                )
+            )
+            procs = launch_tp_servers(
+                gpu_ids,
+                tp_rank_range,
+                server_args,
+                port_args.model_port_args[0],
+                model_overide_args,
+            )
             while True:
                 pass
 
diff --git a/scripts/convert_yi_vl.py b/scripts/convert_yi_vl.py
index a45f83a30..bdf37ff92 100644
--- a/scripts/convert_yi_vl.py
+++ b/scripts/convert_yi_vl.py
@@ -10,16 +10,15 @@ import os
 
 from transformers import AutoConfig, AutoTokenizer
 
+
 def add_image_token(model_path: str):
     tokenizer = AutoTokenizer.from_pretrained(model_path)
-    tokenizer.add_tokens(
-        ["<image_placeholder>"],
-        special_tokens=True
-    )
+    tokenizer.add_tokens(["<image_placeholder>"], special_tokens=True)
 
     print(tokenizer)
     tokenizer.save_pretrained(model_path)
 
+
 def edit_model_config(model_path):
     config = AutoConfig.from_pretrained(model_path)
 
@@ -29,10 +28,11 @@ def edit_model_config(model_path):
     print(config)
     config.save_pretrained(model_path)
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--model-path", type=str)
     args = parser.parse_args()
 
     add_image_token(args.model_path)
-    edit_model_config(args.model_path)
\ No newline at end of file
+    edit_model_config(args.model_path)