diff --git a/README.md b/README.md index 9c6f1bd7c..e7a48ec24 100644 --- a/README.md +++ b/README.md @@ -154,6 +154,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1 ``` - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md). +- To enable fp8 quantization, you can add `--quantization fp8` on an fp16 checkpoint or directly load an fp8 checkpoint without specifying any arguments. ### Supported Models diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index b1abcd35e..7cf19f249 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -143,7 +143,7 @@ async def async_request_openai_completions( "temperature": 0.0, "best_of": 1, "max_tokens": request_func_input.output_len, - "stream": True, + "stream": not args.disable_stream, "ignore_eos": True, } headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} @@ -166,8 +166,9 @@ async def async_request_openai_completions( continue chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ") + latency = time.perf_counter() - st if chunk == "[DONE]": - latency = time.perf_counter() - st + pass else: data = json.loads(chunk) @@ -897,6 +898,11 @@ if __name__ == "__main__": help="Range of request rates in the format start,stop,step. 
Default is 2,34,2", ) parser.add_argument("--output-file", type=str, help="Output JSONL file name.") + parser.add_argument( + "--disable-stream", + action="store_true", + help="Disable streaming mode.", + ) set_ulimit() diff --git a/python/sglang/srt/managers/controller/schedule_heuristic.py b/python/sglang/srt/managers/controller/schedule_heuristic.py index aae6cfb86..46a5bf239 100644 --- a/python/sglang/srt/managers/controller/schedule_heuristic.py +++ b/python/sglang/srt/managers/controller/schedule_heuristic.py @@ -28,11 +28,16 @@ class ScheduleHeuristic: # longest prefix match forward_queue.sort(key=lambda x: -len(x.prefix_indices)) return forward_queue + elif self.schedule_heuristic == "fcfs": + # first come first serve + return forward_queue + elif self.schedule_heuristic == "lof": + # longest output first + forward_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens) + return forward_queue elif self.schedule_heuristic == "random": random.shuffle(forward_queue) return forward_queue - elif self.schedule_heuristic == "fcfs": - return forward_queue elif self.schedule_heuristic == "dfs-weight": last_node_to_reqs = defaultdict(list) for req in forward_queue: