diff --git a/README.md b/README.md index c118d6a1a..04dd913ba 100644 --- a/README.md +++ b/README.md @@ -22,12 +22,13 @@ The core features include: ## News - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)). -- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)). +- [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)). - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
More +- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)). - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)). - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)). @@ -227,19 +228,14 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct - Gemma / Gemma 2 - Qwen / Qwen 2 / Qwen 2 MoE - DeepSeek / DeepSeek 2 -- LLaVA 1.5 / 1.6 - - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000` - - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000` - - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000` - - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --host=127.0.0.1 --tp-size=1 --chat-template=llava_llama_3` - - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --host="127.0.0.1" --tp-size=8 --chat-template=chatml-llava` -- LLaVA-NeXT-Video - - see [examples/usage/llava_video](examples/usage/llava_video) -- [LLaVA-OneVision](https://arxiv.org/abs/2408.03326) - - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --host=127.0.0.1 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384` - - see [test/srt/test_llava_onevision_openai_server.py](test/srt/test_llava_onevision_openai_server.py) +- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/) + - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov 
--port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384` + - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py) +- LLaVA 1.5 / 1.6 / NeXT + - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3` + - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava` + - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py) - Yi-VL - - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py). - StableLM - Command-R - DBRX @@ -250,6 +246,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md). #### Use Models From ModelScope +
+ To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE. ``` export SGLANG_USE_MODELSCOPE=true @@ -258,21 +256,20 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru ``` SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000 ``` +
#### Run Llama 3.1 405B ```bash -## Run 405B (fp8) on a single node +# Run 405B (fp8) on a single node python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8 -## Run 405B (fp16) on two nodes -# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily +# Run 405B (fp16) on two nodes +## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port +GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph -# on the first node -GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75 - -# on the second -GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75 +## on the second node, replace the `172.16.4.52:20000` with your own first node ip address and port +GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph ``` ### Benchmark Performance diff --git a/docs/en/sampling_params.md b/docs/en/sampling_params.md index 7d866e692..54b03bf32 100644 --- a/docs/en/sampling_params.md +++ b/docs/en/sampling_params.md @@ -1,5 +1,8 @@ # Sampling Parameters in SGLang Runtime This doc describes the sampling parameters of the SGLang Runtime. +It is the low-level endpoint of the runtime. 
+If you want a high-level endpoint that can automatically handle chat templates, consider using the [OpenAI Compatible API +](https://github.com/sgl-project/sglang?tab=readme-ov-file#openai-compatible-api). The `/generate` endpoint accepts the following arguments in the JSON format. @@ -140,7 +143,7 @@ print("") Launch a server ``` -python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000 +python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --chat-template chatml-llava ``` Download an image @@ -155,7 +158,9 @@ import requests response = requests.post( "http://localhost:30000/generate", json={ - "text": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nDescribe this picture ASSISTANT:", + "text": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n\nDescribe this image in a very short sentence.<|im_end|>\n" + "<|im_start|>assistant\n", "image_data": "example_image.png", "sampling_params": { "temperature": 0, diff --git a/examples/quick_start/anthropic_example_chat.py b/examples/frontend_language/quick_start/anthropic_example_chat.py similarity index 100% rename from examples/quick_start/anthropic_example_chat.py rename to examples/frontend_language/quick_start/anthropic_example_chat.py diff --git a/examples/quick_start/anthropic_example_complete.py b/examples/frontend_language/quick_start/anthropic_example_complete.py similarity index 100% rename from examples/quick_start/anthropic_example_complete.py rename to examples/frontend_language/quick_start/anthropic_example_complete.py diff --git a/examples/quick_start/azure_openai_example_chat.py b/examples/frontend_language/quick_start/azure_openai_example_chat.py similarity index 100% rename from 
examples/quick_start/azure_openai_example_chat.py rename to examples/frontend_language/quick_start/azure_openai_example_chat.py diff --git a/examples/quick_start/gemini_example_chat.py b/examples/frontend_language/quick_start/gemini_example_chat.py similarity index 100% rename from examples/quick_start/gemini_example_chat.py rename to examples/frontend_language/quick_start/gemini_example_chat.py diff --git a/examples/quick_start/gemini_example_complete.py b/examples/frontend_language/quick_start/gemini_example_complete.py similarity index 100% rename from examples/quick_start/gemini_example_complete.py rename to examples/frontend_language/quick_start/gemini_example_complete.py diff --git a/examples/quick_start/gemini_example_multimodal_chat.py b/examples/frontend_language/quick_start/gemini_example_multimodal_chat.py similarity index 100% rename from examples/quick_start/gemini_example_multimodal_chat.py rename to examples/frontend_language/quick_start/gemini_example_multimodal_chat.py diff --git a/examples/quick_start/images/cat.jpeg b/examples/frontend_language/quick_start/images/cat.jpeg similarity index 100% rename from examples/quick_start/images/cat.jpeg rename to examples/frontend_language/quick_start/images/cat.jpeg diff --git a/examples/quick_start/images/dog.jpeg b/examples/frontend_language/quick_start/images/dog.jpeg similarity index 100% rename from examples/quick_start/images/dog.jpeg rename to examples/frontend_language/quick_start/images/dog.jpeg diff --git a/examples/quick_start/srt_example_chat.py b/examples/frontend_language/quick_start/local_example_chat.py similarity index 98% rename from examples/quick_start/srt_example_chat.py rename to examples/frontend_language/quick_start/local_example_chat.py index b1e1658a2..e1e4b62cc 100644 --- a/examples/quick_start/srt_example_chat.py +++ b/examples/frontend_language/quick_start/local_example_chat.py @@ -1,6 +1,6 @@ """ Usage: -python3 srt_example_chat.py +python3 local_example_chat.py """ import 
sglang as sgl diff --git a/examples/quick_start/srt_example_complete.py b/examples/frontend_language/quick_start/local_example_complete.py similarity index 97% rename from examples/quick_start/srt_example_complete.py rename to examples/frontend_language/quick_start/local_example_complete.py index 056245979..00a451cf6 100644 --- a/examples/quick_start/srt_example_complete.py +++ b/examples/frontend_language/quick_start/local_example_complete.py @@ -1,6 +1,6 @@ """ Usage: -python3 srt_example_complete.py +python3 local_example_complete.py """ import sglang as sgl diff --git a/examples/quick_start/srt_example_llava.py b/examples/frontend_language/quick_start/local_example_llava_next.py similarity index 69% rename from examples/quick_start/srt_example_llava.py rename to examples/frontend_language/quick_start/local_example_llava_next.py index 5d8f75239..823dc7b0e 100644 --- a/examples/quick_start/srt_example_llava.py +++ b/examples/frontend_language/quick_start/local_example_llava_next.py @@ -1,8 +1,14 @@ """ -Usage: python3 srt_example_llava.py +Usage: python3 local_example_llava_next.py """ +from PIL import ImageFile + import sglang as sgl +from sglang.lang.chat_template import get_chat_template +from sglang.srt.utils import load_image + +ImageFile.LOAD_TRUNCATED_IMAGES = True # Allow loading of truncated images @sgl.function @@ -44,10 +50,17 @@ def batch(): if __name__ == "__main__": - runtime = sgl.Runtime( - model_path="liuhaotian/llava-v1.6-vicuna-7b", - tokenizer_path="llava-hf/llava-1.5-7b-hf", - ) + import multiprocessing as mp + + mp.set_start_method("spawn", force=True) + + runtime = sgl.Runtime(model_path="lmms-lab/llama3-llava-next-8b") + runtime.endpoint.chat_template = get_chat_template("llama-3-instruct") + + # Or you can use the 72B model + # runtime = sgl.Runtime(model_path="lmms-lab/llava-next-72b", tp_size=8) + # runtime.endpoint.chat_template = get_chat_template("chatml-llava") + sgl.set_default_backend(runtime) print(f"chat template: 
{runtime.endpoint.chat_template.name}") diff --git a/examples/quick_start/openai_example_chat.py b/examples/frontend_language/quick_start/openai_example_chat.py similarity index 100% rename from examples/quick_start/openai_example_chat.py rename to examples/frontend_language/quick_start/openai_example_chat.py diff --git a/examples/quick_start/openai_example_complete.py b/examples/frontend_language/quick_start/openai_example_complete.py similarity index 100% rename from examples/quick_start/openai_example_complete.py rename to examples/frontend_language/quick_start/openai_example_complete.py diff --git a/examples/quick_start/openrouter_example_chat.py b/examples/frontend_language/quick_start/openrouter_example_chat.py similarity index 100% rename from examples/quick_start/openrouter_example_chat.py rename to examples/frontend_language/quick_start/openrouter_example_chat.py diff --git a/examples/quick_start/together_example_chat.py b/examples/frontend_language/quick_start/together_example_chat.py similarity index 100% rename from examples/quick_start/together_example_chat.py rename to examples/frontend_language/quick_start/together_example_chat.py diff --git a/examples/quick_start/together_example_complete.py b/examples/frontend_language/quick_start/together_example_complete.py similarity index 100% rename from examples/quick_start/together_example_complete.py rename to examples/frontend_language/quick_start/together_example_complete.py diff --git a/examples/usage/chinese_regex.py b/examples/frontend_language/usage/chinese_regex.py similarity index 100% rename from examples/usage/chinese_regex.py rename to examples/frontend_language/usage/chinese_regex.py diff --git a/examples/usage/choices_logprob.py b/examples/frontend_language/usage/choices_logprob.py similarity index 100% rename from examples/usage/choices_logprob.py rename to examples/frontend_language/usage/choices_logprob.py diff --git a/examples/usage/cot_decoding.py 
b/examples/frontend_language/usage/cot_decoding.py similarity index 100% rename from examples/usage/cot_decoding.py rename to examples/frontend_language/usage/cot_decoding.py diff --git a/examples/usage/json_decode.py b/examples/frontend_language/usage/json_decode.py similarity index 100% rename from examples/usage/json_decode.py rename to examples/frontend_language/usage/json_decode.py diff --git a/examples/usage/json_logprobs.py b/examples/frontend_language/usage/json_logprobs.py similarity index 100% rename from examples/usage/json_logprobs.py rename to examples/frontend_language/usage/json_logprobs.py diff --git a/examples/usage/llava_video/srt_example_llava_v.py b/examples/frontend_language/usage/llava_video/srt_example_llava_v.py similarity index 99% rename from examples/usage/llava_video/srt_example_llava_v.py rename to examples/frontend_language/usage/llava_video/srt_example_llava_v.py index 7421dfcdf..085bcea5a 100644 --- a/examples/usage/llava_video/srt_example_llava_v.py +++ b/examples/frontend_language/usage/llava_video/srt_example_llava_v.py @@ -1,7 +1,8 @@ """ Usage: pip install opencv-python-headless -python3 srt_example_llava.py + +python3 srt_example_llava_v.py """ import argparse @@ -9,6 +10,8 @@ import csv import os import time +import requests + import sglang as sgl diff --git a/examples/usage/llava_video/srt_example_llava_v.sh b/examples/frontend_language/usage/llava_video/srt_example_llava_v.sh similarity index 100% rename from examples/usage/llava_video/srt_example_llava_v.sh rename to examples/frontend_language/usage/llava_video/srt_example_llava_v.sh diff --git a/examples/usage/openai_chat_speculative.py b/examples/frontend_language/usage/openai_chat_speculative.py similarity index 100% rename from examples/usage/openai_chat_speculative.py rename to examples/frontend_language/usage/openai_chat_speculative.py diff --git a/examples/usage/openai_parallel_sample.py b/examples/frontend_language/usage/openai_parallel_sample.py similarity index 
100% rename from examples/usage/openai_parallel_sample.py rename to examples/frontend_language/usage/openai_parallel_sample.py diff --git a/examples/usage/openai_speculative.py b/examples/frontend_language/usage/openai_speculative.py similarity index 100% rename from examples/usage/openai_speculative.py rename to examples/frontend_language/usage/openai_speculative.py diff --git a/examples/usage/parallel_sample.py b/examples/frontend_language/usage/parallel_sample.py similarity index 100% rename from examples/usage/parallel_sample.py rename to examples/frontend_language/usage/parallel_sample.py diff --git a/examples/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb b/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb similarity index 100% rename from examples/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb rename to examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb diff --git a/examples/usage/readme_examples.py b/examples/frontend_language/usage/readme_examples.py similarity index 100% rename from examples/usage/readme_examples.py rename to examples/frontend_language/usage/readme_examples.py diff --git a/examples/usage/streaming.py b/examples/frontend_language/usage/streaming.py similarity index 100% rename from examples/usage/streaming.py rename to examples/frontend_language/usage/streaming.py diff --git a/examples/usage/triton/Dockerfile b/examples/frontend_language/usage/triton/Dockerfile similarity index 100% rename from examples/usage/triton/Dockerfile rename to examples/frontend_language/usage/triton/Dockerfile diff --git a/examples/usage/triton/README.md b/examples/frontend_language/usage/triton/README.md similarity index 100% rename from examples/usage/triton/README.md rename to examples/frontend_language/usage/triton/README.md diff --git a/examples/usage/triton/models/character_generation/1/model.py 
b/examples/frontend_language/usage/triton/models/character_generation/1/model.py similarity index 100% rename from examples/usage/triton/models/character_generation/1/model.py rename to examples/frontend_language/usage/triton/models/character_generation/1/model.py diff --git a/examples/usage/triton/models/character_generation/config.pbtxt b/examples/frontend_language/usage/triton/models/character_generation/config.pbtxt similarity index 100% rename from examples/usage/triton/models/character_generation/config.pbtxt rename to examples/frontend_language/usage/triton/models/character_generation/config.pbtxt diff --git a/examples/quick_start/srt_example_yi_vl.py b/examples/quick_start/srt_example_yi_vl.py deleted file mode 100644 index 66c7d5712..000000000 --- a/examples/quick_start/srt_example_yi_vl.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -Usage: python3 srt_example_yi_vl.py - -Requirements: transformers==4.38 -""" - -import sglang as sgl - - -@sgl.function -def image_qa(s, image_path, question): - s += sgl.user(sgl.image(image_path) + question) - s += sgl.assistant(sgl.gen("answer")) - - -def single(): - state = image_qa.run( - image_path="images/cat.jpeg", - question="What is this?", - max_new_tokens=64, - stop="###", - ) - print(state["answer"], "\n") - - -def stream(): - state = image_qa.run( - image_path="images/cat.jpeg", - question="What is this?", - max_new_tokens=64, - stream=True, - stop="###", - ) - - for out in state.text_iter("answer"): - print(out, end="", flush=True) - print() - - -def batch(): - states = image_qa.run_batch( - [ - {"image_path": "images/cat.jpeg", "question": "What is this?"}, - {"image_path": "images/dog.jpeg", "question": "What is this?"}, - ], - max_new_tokens=64, - stop="###", - ) - for s in states: - print(s["answer"], "\n") - - -if __name__ == "__main__": - runtime = sgl.Runtime(model_path="BabyChou/Yi-VL-6B") - # runtime = sgl.Runtime(model_path="BabyChou/Yi-VL-34B") - sgl.set_default_backend(runtime) - - # Run a single request - 
print("\n========== single ==========\n") - single() - - # Stream output - print("\n========== stream ==========\n") - stream() - - # Run a batch of requests - print("\n========== batch ==========\n") - batch() - - runtime.shutdown() diff --git a/examples/usage/async_io.py b/examples/runtime/async_io_api.py similarity index 100% rename from examples/usage/async_io.py rename to examples/runtime/async_io_api.py diff --git a/examples/usage/llava/http_llama3_llava_test.py b/examples/runtime/llava_onevision/http_llama3_llava_test.py similarity index 94% rename from examples/usage/llava/http_llama3_llava_test.py rename to examples/runtime/llava_onevision/http_llama3_llava_test.py index 813a26af5..a019e214d 100644 --- a/examples/usage/llava/http_llama3_llava_test.py +++ b/examples/runtime/llava_onevision/http_llama3_llava_test.py @@ -4,7 +4,7 @@ Usage: # Installing latest sglang. # Endpoint Service CLI: -# python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --tokenizer-path lmms-lab/llama3-llava-next-8b-tokenizer --port=30000 --host="127.0.0.1" --tp-size=4 +python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 python3 http_llama3_llava_test.py @@ -16,7 +16,6 @@ import argparse import asyncio import copy import json -import time import aiohttp import requests diff --git a/examples/usage/llava/http_llava_onevision_test.py b/examples/runtime/llava_onevision/http_llava_onevision_test.py similarity index 96% rename from examples/usage/llava/http_llava_onevision_test.py rename to examples/runtime/llava_onevision/http_llava_onevision_test.py index c32d52981..40dc27ec2 100644 --- a/examples/usage/llava/http_llava_onevision_test.py +++ b/examples/runtime/llava_onevision/http_llava_onevision_test.py @@ -1,3 +1,11 @@ +""" +Usage: + +python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384 + +python3 
http_llava_onevision_test.py +""" + import base64 import io import os @@ -74,7 +82,6 @@ def video_stream_request_test(client, video_path): print("------------------------Video Stream Request Test----------------------") messages = prepare_video_messages(video_path) - start_time = time.time() video_request = client.chat.completions.create( model="default", messages=messages, diff --git a/examples/usage/llava/http_qwen_llava_test.py b/examples/runtime/llava_onevision/http_qwen_llava_test.py similarity index 95% rename from examples/usage/llava/http_qwen_llava_test.py rename to examples/runtime/llava_onevision/http_qwen_llava_test.py index 1c29658c6..dca56e7a3 100644 --- a/examples/usage/llava/http_qwen_llava_test.py +++ b/examples/runtime/llava_onevision/http_qwen_llava_test.py @@ -4,7 +4,7 @@ Usage: # Installing latest sglang. # Endpoint Service CLI: -# python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --tokenizer-path lmms-lab/llavanext-qwen-tokenizer --port=30000 --host="127.0.0.1" --tp-size=4 +python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 python3 http_qwen_llava_test.py @@ -16,7 +16,6 @@ import argparse import asyncio import copy import json -import time import aiohttp import requests diff --git a/examples/usage/openai_batch_chat.py b/examples/runtime/openai_batch_chat.py similarity index 100% rename from examples/usage/openai_batch_chat.py rename to examples/runtime/openai_batch_chat.py diff --git a/examples/usage/openai_batch_complete.py b/examples/runtime/openai_batch_complete.py similarity index 100% rename from examples/usage/openai_batch_complete.py rename to examples/runtime/openai_batch_complete.py diff --git a/examples/usage/llava/srt_llava_next_test.py b/examples/usage/llava/srt_llava_next_test.py deleted file mode 100644 index 0f9621648..000000000 --- a/examples/usage/llava/srt_llava_next_test.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -Usage: python3 srt_example_llava.py -""" - -from PIL 
import ImageFile - -import sglang as sgl -from sglang.lang.chat_template import get_chat_template -from sglang.srt.utils import load_image - -ImageFile.LOAD_TRUNCATED_IMAGES = True # Allow loading of truncated images - - -@sgl.function -def image_qa(s, image, question): - s += sgl.user(sgl.image(image) + question) - s += sgl.assistant(sgl.gen("answer")) - - -def single(): - image_url = "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg" - pil_image, _ = load_image(image_url) - state = image_qa.run(image=pil_image, question="What is this?", max_new_tokens=512) - print(state["answer"], "\n") - - -def stream(): - image_url = "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg" - pil_image, _ = load_image(image_url) - state = image_qa.run( - image=pil_image, - question="Please generate short caption for this image.", - max_new_tokens=512, - temperature=0, - stream=True, - ) - - for out in state.text_iter("answer"): - print(out, end="", flush=True) - print() - - -def batch(): - image_url = "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg" - pil_image, _ = load_image(image_url) - states = image_qa.run_batch( - [ - {"image": pil_image, "question": "What is this?"}, - {"image": pil_image, "question": "What is this?"}, - ], - max_new_tokens=512, - ) - for s in states: - print(s["answer"], "\n") - - -if __name__ == "__main__": - import multiprocessing as mp - - mp.set_start_method("spawn", force=True) - runtime = sgl.Runtime( - model_path="lmms-lab/llama3-llava-next-8b", - tokenizer_path="lmms-lab/llama3-llava-next-8b-tokenizer", - ) - runtime.endpoint.chat_template = get_chat_template("llama-3-instruct") - # runtime = sgl.Runtime( - # model_path="lmms-lab/llava-next-72b", - # tokenizer_path="lmms-lab/llavanext-qwen-tokenizer", - # ) - # runtime.endpoint.chat_template = get_chat_template("chatml-llava") - sgl.set_default_backend(runtime) - print(f"chat template: {runtime.endpoint.chat_template.name}") - - # Or you can use API models 
- # sgl.set_default_backend(sgl.OpenAI("gpt-4-vision-preview")) - # sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision")) - - # Run a single request - print("\n========== single ==========\n") - single() - - # Stream output - print("\n========== stream ==========\n") - stream() - - # Run a batch of requests - print("\n========== batch ==========\n") - batch() - - runtime.shutdown() diff --git a/examples/usage/rag_using_parea/max-tokens-fixed-rag-trace.png b/examples/usage/rag_using_parea/max-tokens-fixed-rag-trace.png deleted file mode 100644 index 2ea09fdc6..000000000 Binary files a/examples/usage/rag_using_parea/max-tokens-fixed-rag-trace.png and /dev/null differ diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py index ba1a81d54..dea910f57 100644 --- a/python/sglang/bench_latency.py +++ b/python/sglang/bench_latency.py @@ -111,7 +111,11 @@ def load_model(server_args, tp_rank): suppress_other_loggers() rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None - model_config = ModelConfig(path=server_args.model_path) + model_config = ModelConfig( + server_args.model_path, + server_args.trust_remote_code, + context_length=server_args.context_length, + ) model_runner = ModelRunner( model_config=model_config, mem_fraction_static=server_args.mem_fraction_static, diff --git a/python/sglang/lang/chat_template.py b/python/sglang/lang/chat_template.py index 92f717127..fa300b25f 100644 --- a/python/sglang/lang/chat_template.py +++ b/python/sglang/lang/chat_template.py @@ -1,6 +1,6 @@ -from dataclasses import dataclass, field +from dataclasses import dataclass from enum import Enum, auto -from typing import Callable, Dict, List, Optional, Tuple +from typing import Callable, Dict, List, Tuple class ChatTemplateStyle(Enum): diff --git a/python/sglang/launch_server_llavavid.py b/python/sglang/launch_server_llavavid.py deleted file mode 100644 index c34dd2116..000000000 --- a/python/sglang/launch_server_llavavid.py +++ /dev/null @@ -1,29 
+0,0 @@ -"""Launch the inference server for Llava-video model.""" - -import argparse - -from sglang.srt.server import ServerArgs, launch_server - -if __name__ == "__main__": - model_overide_args = {} - - model_overide_args["mm_spatial_pool_stride"] = 2 - model_overide_args["architectures"] = ["LlavaVidForCausalLM"] - model_overide_args["num_frames"] = 16 - model_overide_args["model_type"] = "llavavid" - if model_overide_args["num_frames"] == 32: - model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"} - model_overide_args["max_sequence_length"] = 4096 * 2 - model_overide_args["tokenizer_model_max_length"] = 4096 * 2 - model_overide_args["model_max_length"] = 4096 * 2 - - parser = argparse.ArgumentParser() - ServerArgs.add_cli_args(parser) - args = parser.parse_args() - - if "34b" in args.model_path.lower(): - model_overide_args["image_token_index"] = 64002 - - server_args = ServerArgs.from_cli_args(args) - - launch_server(server_args, model_overide_args, None) diff --git a/python/sglang/srt/layers/decode_attention.py b/python/sglang/srt/layers/decode_attention.py index eef3c0009..dc92a6548 100644 --- a/python/sglang/srt/layers/decode_attention.py +++ b/python/sglang/srt/layers/decode_attention.py @@ -26,7 +26,7 @@ import triton.language as tl from sglang.srt.managers.schedule_batch import global_server_args_dict -if global_server_args_dict.get("attention_reduce_in_fp32", False): +if global_server_args_dict.get("triton_attention_reduce_in_fp32", False): REDUCE_TRITON_TYPE = tl.float32 REDUCE_TORCH_TYPE = torch.float32 else: diff --git a/python/sglang/srt/layers/fused_moe/layer.py b/python/sglang/srt/layers/fused_moe/layer.py index 0b17c14ff..e08ec5c58 100644 --- a/python/sglang/srt/layers/fused_moe/layer.py +++ b/python/sglang/srt/layers/fused_moe/layer.py @@ -239,7 +239,7 @@ class FusedMoE(torch.nn.Module): weight_name: str, shard_id: int, expert_id: int, - pre_sharded: bool, + use_presharded_weights: bool = False, ): param_data = param.data @@ 
-273,7 +273,7 @@ class FusedMoE(torch.nn.Module): else: tp_rank = get_tensor_model_parallel_rank() shard_size = self.intermediate_size_per_partition - if pre_sharded: + if use_presharded_weights: shard = slice(None) else: shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py index a5ba06de0..63f74d8b0 100644 --- a/python/sglang/srt/layers/logits_processor.py +++ b/python/sglang/srt/layers/logits_processor.py @@ -180,7 +180,7 @@ class LogitsProcessor(nn.Module): if hasattr(self.config, "final_logit_softcapping"): last_logits.div_(self.config.final_logit_softcapping) - last_logits = torch.tanh(last_logits) + torch.tanh(last_logits, out=last_logits) last_logits.mul_(self.config.final_logit_softcapping) # Return only last_logits if logprob is not requested @@ -241,7 +241,7 @@ class LogitsProcessor(nn.Module): if hasattr(self.config, "final_logit_softcapping"): all_logits.div_(self.config.final_logit_softcapping) - all_logits = torch.tanh(all_logits) + torch.tanh(all_logits, out=all_logits) all_logits.mul_(self.config.final_logit_softcapping) all_logprobs = all_logits diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 75c33bb8b..e61f13cb9 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -35,7 +35,7 @@ INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5 global_server_args_dict = { "disable_flashinfer": False, "disable_flashinfer_sampling": False, - "attention_reduce_in_fp32": False, + "triton_attention_reduce_in_fp32": False, "enable_mla": False, } diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 2d604d287..8420f20dd 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -606,6 +606,9 @@ class TokenizerManager: return 
background_tasks def create_handle_loop(self): + if not self.to_create_loop: + return + self.to_create_loop = False loop = asyncio.get_event_loop() loop.create_task(self.handle_loop()) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index b91191c5d..661660281 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -20,7 +20,6 @@ import importlib import importlib.resources import logging import pkgutil -import warnings from functools import lru_cache from typing import Optional, Type @@ -91,23 +90,35 @@ class ModelRunner: { "disable_flashinfer": server_args.disable_flashinfer, "disable_flashinfer_sampling": server_args.disable_flashinfer_sampling, - "attention_reduce_in_fp32": server_args.attention_reduce_in_fp32, + "triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32, "enable_mla": server_args.enable_mla, } ) + min_per_gpu_memory = self.init_torch_distributed() + self.load_model() + self.init_memory_pool( + min_per_gpu_memory, + server_args.max_num_reqs, + server_args.max_total_tokens, + ) + self.init_cublas() + self.init_flashinfer() + self.init_cuda_graphs() + + def init_torch_distributed(self): # Init torch distributed torch.cuda.set_device(self.gpu_id) logger.info(f"[gpu={self.gpu_id}] Init nccl begin.") - if not server_args.enable_p2p_check: + if not self.server_args.enable_p2p_check: monkey_patch_vllm_p2p_access_check(self.gpu_id) - if server_args.nccl_init_addr: - nccl_init_method = f"tcp://{server_args.nccl_init_addr}" + if self.server_args.nccl_init_addr: + nccl_init_method = f"tcp://{self.server_args.nccl_init_addr}" else: nccl_init_method = f"tcp://127.0.0.1:{self.nccl_port}" - set_custom_all_reduce(not server_args.disable_custom_all_reduce) + set_custom_all_reduce(not self.server_args.disable_custom_all_reduce) init_distributed_environment( backend="nccl", world_size=self.tp_size, @@ -116,32 +127,28 
@@ class ModelRunner: distributed_init_method=nccl_init_method, ) initialize_model_parallel(tensor_model_parallel_size=self.tp_size) - total_gpu_memory = get_available_gpu_memory( + min_per_gpu_memory = get_available_gpu_memory( self.gpu_id, distributed=self.tp_size > 1 ) self.tp_group = get_tp_group() + # Currently, there is a bug with mulit-node tensor parallelsim + padded cuda graph, + # so we disable padding in cuda graph. + if not all(in_the_same_node_as(self.tp_group.cpu_group, source_rank=0)): + self.server_args.disable_cuda_graph_padding = True + logger.info( + "Setting disable_cuda_graph_padding to True because of multi-node tensor parallelism." + ) + + # Check memory for tensor parallelism if self.tp_size > 1: - total_local_gpu_memory = get_available_gpu_memory(self.gpu_id) - if total_local_gpu_memory < total_gpu_memory * 0.9: + local_gpu_memory = get_available_gpu_memory(self.gpu_id) + if min_per_gpu_memory < local_gpu_memory * 0.9: raise ValueError( "The memory capacity is unbalanced. Some GPUs may be occupied by other processes." ) - # Load the model and create memory pool - self.load_model() - self.init_memory_pool( - total_gpu_memory, - server_args.max_num_reqs, - server_args.max_total_tokens, - ) - self.init_cublas() - self.init_flashinfer() - - if self.is_generation: - # FIXME Currently, cuda graph only capture decode steps, which only exists in causal models - # Capture cuda graphs - self.init_cuda_graphs() + return min_per_gpu_memory def load_model(self): logger.info( @@ -150,7 +157,7 @@ class ModelRunner: ) if torch.cuda.get_device_capability()[0] < 8: logger.info( - "Compute capability below sm80 use float16 due to lack of bfloat16 support." + "Compute capability below sm80. Use float16 due to lack of bfloat16 support." ) self.server_args.dtype = "float16" @@ -168,8 +175,9 @@ class ModelRunner: skip_tokenizer_init=True, ) + # A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints + # Drop this after Sept, 2024. 
if is_llama3_405b_fp8_head_16(self.model_config) and self.tp_size <= 8: - # A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints self.model_config.hf_config.num_key_value_heads = 8 self.vllm_model_config.hf_config.num_key_value_heads = 8 monkey_patch_vllm_qvk_linear_loader() @@ -191,8 +199,8 @@ class ModelRunner: cache_config=None, ) self.sliding_window_size = ( - self.model.get_window_size() - if hasattr(self.model, "get_window_size") + self.model.get_attention_sliding_window_size() + if hasattr(self.model, "get_attention_sliding_window_size") else None ) self.is_generation = is_generation_model( @@ -206,7 +214,8 @@ class ModelRunner: f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB" ) - def update_weights(self, model_path, load_format): + def update_weights(self, model_path: str, load_format: str): + """Update weights in-place.""" from vllm.model_executor.model_loader.loader import ( DefaultModelLoader, device_loading_context, @@ -222,6 +231,7 @@ class ModelRunner: target_device = torch.device(self.device_config.device) try: + # TODO: Use a better method to check this vllm_model_config = VllmModelConfig( model=model_path, quantization=self.server_args.quantization, @@ -291,7 +301,7 @@ class ModelRunner: logger.info(f"[gpu={self.gpu_id}] Update weights end.") return True, "Succeeded to update model weights" - def profile_max_num_token(self, total_gpu_memory): + def profile_max_num_token(self, total_gpu_memory: int): available_gpu_memory = get_available_gpu_memory( self.gpu_id, distributed=self.tp_size > 1 ) @@ -319,7 +329,10 @@ class ModelRunner: return max_num_token def init_memory_pool( - self, total_gpu_memory, max_num_reqs=None, max_total_tokens=None + self, + total_gpu_memory: int, + max_num_reqs: int = None, + max_total_tokens: int = None, ): self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory) if max_total_tokens is not None: @@ -388,6 +401,7 @@ class ModelRunner: return c def 
init_flashinfer(self): + """Init flashinfer attention kernel wrappers.""" if self.server_args.disable_flashinfer: assert ( self.sliding_window_size is None @@ -448,6 +462,11 @@ class ModelRunner: ) def init_cuda_graphs(self): + """Capture cuda graphs.""" + if not self.is_generation: + # TODO: Currently, cuda graph only captures decode steps, which only exists for generation models + return + from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner if self.server_args.disable_cuda_graph or self.server_args.disable_flashinfer: @@ -457,7 +476,12 @@ class ModelRunner: logger.info( f"[gpu={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes." ) - batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 17)] + + if self.server_args.disable_cuda_graph_padding: + batch_size_list = list(range(1, 32)) + [64, 128] + else: + batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 21)] + self.cuda_graph_runner = CudaGraphRunner( self, max_batch_size_to_capture=max(batch_size_list), diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py index 37d926c34..c6dbc7e55 100644 --- a/python/sglang/srt/models/gemma2.py +++ b/python/sglang/srt/models/gemma2.py @@ -46,7 +46,7 @@ from sglang.srt.model_executor.forward_batch_info import InputMetadata # Aligned with HF's implementation, using sliding window inclusive with the last token # SGLang assumes exclusive -def get_window_size(config): +def get_attention_sliding_window_size(config): return config.sliding_window - 1 @@ -213,7 +213,11 @@ class Gemma2Attention(nn.Module): self.scaling, num_kv_heads=self.num_kv_heads, layer_id=layer_idx, - sliding_window_size=get_window_size(config) if use_sliding_window else None, + sliding_window_size=( + get_attention_sliding_window_size(config) + if use_sliding_window + else None + ), logit_cap=self.config.attn_logit_softcapping, ) @@ -406,8 +410,8 @@ class Gemma2ForCausalLM(nn.Module): input_ids, hidden_states, 
self.model.embed_tokens.weight, input_metadata ) - def get_window_size(self): - return get_window_size(self.config) + def get_attention_sliding_window_size(self): + return get_attention_sliding_window_size(self.config) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 75b086fd6..4a0a08bf8 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -295,12 +295,14 @@ class Grok1ModelForCausalLM(nn.Module): self.config = config self.quant_config = quant_config self.model = Grok1Model(config, quant_config=quant_config) - # self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.lm_head = ReplicatedLinear(config.hidden_size, config.vocab_size) - self.logits_processor = LogitsProcessor(config, skip_all_gather=True) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.logits_processor = LogitsProcessor(config) # Monkey patch _prepare_weights to load pre-sharded weights setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights) + + self.use_presharded_weights = True + warnings.filterwarnings("ignore", category=FutureWarning) def forward( @@ -356,6 +358,13 @@ class Grok1ModelForCausalLM(nn.Module): continue name = name.replace(weight_name, param_name) + if self.use_presharded_weights: + extra_kwargs = { + "use_presharded_weights": self.use_presharded_weights + } + else: + extra_kwargs = {} + param = params_dict[name] weight_loader = param.weight_loader weight_loader( @@ -364,7 +373,7 @@ class Grok1ModelForCausalLM(nn.Module): weight_name, shard_id=shard_id, expert_id=expert_id, - pre_sharded=get_tensor_model_parallel_world_size() > 1, + **extra_kwargs, ) break else: diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 33451d645..870169c6d 100644 --- a/python/sglang/srt/server_args.py +++ 
b/python/sglang/srt/server_args.py @@ -81,13 +81,12 @@ class ServerArgs: disable_cuda_graph: bool = False disable_cuda_graph_padding: bool = False disable_disk_cache: bool = False + disable_custom_all_reduce: bool = False enable_mixed_chunk: bool = False enable_torch_compile: bool = False enable_p2p_check: bool = False enable_mla: bool = False - attention_reduce_in_fp32: bool = False - efficient_weight_load: bool = False - disable_custom_all_reduce: bool = False + triton_attention_reduce_in_fp32: bool = False # Distributed args nccl_init_addr: Optional[str] = None @@ -404,6 +403,12 @@ class ServerArgs: action="store_true", help="Disable disk cache to avoid possible crashes related to file system or high concurrency.", ) + parser.add_argument( + "--disable-custom-all-reduce", + action="store_true", + default=False, + help="Disable the custom all-reduce kernel and fall back to NCCL.", + ) parser.add_argument( "--enable-mixed-chunk", action="store_true", @@ -425,7 +430,7 @@ class ServerArgs: help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.", ) parser.add_argument( - "--attention-reduce-in-fp32", + "--triton-attention-reduce-in-fp32", action="store_true", help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16." 
"This only affects Triton attention kernels.", @@ -435,12 +440,6 @@ class ServerArgs: action="store_true", help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).", ) - parser.add_argument( - "--disable-custom-all-reduce", - action="store_true", - default=False, - help="Disable the custom all-reduce kernel and fall back to NCCL.", - ) @classmethod def from_cli_args(cls, args: argparse.Namespace): diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index a15ea1630..93c54782a 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -347,7 +347,7 @@ def suppress_other_loggers(): logging.WARN ) logging.getLogger("vllm.selector").setLevel(logging.WARN) - logging.getLogger("vllm.utils").setLevel(logging.WARN) + logging.getLogger("vllm.utils").setLevel(logging.ERROR) def assert_pkg_version(pkg: str, min_version: str, message: str): @@ -451,10 +451,6 @@ def monkey_patch_vllm_dummy_weight_loader(): quant_method = getattr(module, "quant_method", None) if quant_method is not None: quant_method.process_weights_after_loading(module) - # FIXME: Remove this after Mixtral is updated - # to use quant_method. - if hasattr(module, "process_weights_after_loading"): - module.process_weights_after_loading() # NOTE(woosuk): For accurate performance evaluation, we assign # random values to the weights. 
diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index 9386d7f7a..e519c9282 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -24,7 +24,6 @@ import torch.nn.functional as F from transformers import AutoModelForCausalLM, AutoTokenizer from sglang.srt.server import Runtime -from sglang.srt.utils import is_generation_model DEFAULT_PROMPTS = [ # the output of gemma-2-2b from SRT is unstable on the commented prompt @@ -63,8 +62,8 @@ class HFRunner: def __init__( self, model_path, - torch_dtype=torch.float16, - is_generation_model=None, + torch_dtype, + is_generation_model, ): self.in_queue = multiprocessing.Queue() self.out_queue = multiprocessing.Queue() @@ -90,11 +89,8 @@ class HFRunner: trust_remote_code=True, ) - self.is_generation_model = ( - is_generation_model(model_path) - if is_generation_model is None - else is_generation_model - ) + self.is_generation_model = is_generation_model + if self.is_generation_model: self.model = AutoModelForCausalLM.from_pretrained( model_path, @@ -176,16 +172,12 @@ class SRTRunner: def __init__( self, model_path, + torch_dtype, + is_generation_model, tp_size=1, - torch_dtype=torch.float16, - is_generation_model=None, port=5157, ): - self.is_generation_model = ( - is_generation_model(model_path) - if is_generation_model is None - else is_generation_model - ) + self.is_generation_model = is_generation_model self.runtime = Runtime( model_path=model_path, tp_size=tp_size, diff --git a/scripts/convert_yi_vl.py b/scripts/deprecated/convert_yi_vl.py similarity index 100% rename from scripts/convert_yi_vl.py rename to scripts/deprecated/convert_yi_vl.py diff --git a/scripts/convert_yi_vl.sh b/scripts/deprecated/convert_yi_vl.sh similarity index 100% rename from scripts/convert_yi_vl.sh rename to scripts/deprecated/convert_yi_vl.sh diff --git a/test/srt/models/test_embedding_models.py b/test/srt/models/test_embedding_models.py index 67e47d90d..44fed2ad0 100644 --- 
a/test/srt/models/test_embedding_models.py +++ b/test/srt/models/test_embedding_models.py @@ -59,7 +59,7 @@ class TestEmbeddingModels(unittest.TestCase): tolerance = 1e-2 assert torch.all( abs(similarities - 1) < tolerance - ), f"embeddings not all close" + ), "embeddings are not all close" def test_prefill_logits(self): for model, tp_size in MODELS: diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py index bb56ebdad..ba64907ea 100644 --- a/test/srt/models/test_generation_models.py +++ b/test/srt/models/test_generation_models.py @@ -59,7 +59,7 @@ class TestGenerationModels(unittest.TestCase): tolerance = 3e-2 assert torch.all( abs(hf_logprobs - srt_logprobs) < tolerance - ), f"prefill logprobs not all close" + ), "prefill logprobs are not all close" print(hf_outputs.output_strs) print(srt_outputs.output_strs) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 4d3f7de30..8a887912a 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -14,7 +14,7 @@ suites = { "test_torch_compile.py", "test_triton_attn_backend.py", "test_vision_openai_server.py", - "test_large_max_new_tokens.py", + "test_update_weights.py", "models/test_generation_models.py", "models/test_embedding_models.py", "sampling/penaltylib", diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index 3e858dfa7..0a477a92a 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -2,8 +2,6 @@ import base64 import io import json import os -import sys -import time import unittest import numpy as np @@ -12,12 +10,10 @@ import requests from decord import VideoReader, cpu from PIL import Image -from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_child_process from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server -# python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov 
--tokenizer-path lmms-lab/llavanext-qwen-siglip-tokenizer --port=30000 --host=127.0.0.1 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384 class TestOpenAIVisionServer(unittest.TestCase): @classmethod def setUpClass(cls): @@ -32,11 +28,9 @@ class TestOpenAIVisionServer(unittest.TestCase): other_args=[ "--chat-template", "chatml-llava", - "--tokenizer-path", - "lmms-lab/llavanext-qwen-siglip-tokenizer", "--chunked-prefill-size", "16384", - "--log-requests", + # "--log-requests", ], ) cls.base_url += "/v1" @@ -132,7 +126,6 @@ class TestOpenAIVisionServer(unittest.TestCase): messages = self.prepare_video_messages(file_path) - start_time = time.time() video_request = client.chat.completions.create( model="default", messages=messages, @@ -140,15 +133,14 @@ class TestOpenAIVisionServer(unittest.TestCase): max_tokens=1024, stream=True, ) + print("-" * 30) video_response = "" - for chunk in video_request: if chunk.choices[0].delta.content is not None: content = chunk.choices[0].delta.content video_response += content - sys.stdout.write(content) - sys.stdout.flush() + print(content, end="", flush=True) print("-" * 30) # Add assertions to validate the video response