diff --git a/README.md b/README.md
index a7b32628b..9c15f7b64 100644
--- a/README.md
+++ b/README.md
@@ -32,9 +32,10 @@ The core features include:
 ### Method 1: With pip
 ```
 pip install "sglang[all]"
-```
 
-Next, [install FlashInfer](https://docs.flashinfer.ai/installation.html) for attention CUDA kernels.
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+```
 
 ### Method 2: From source
 ```
@@ -43,14 +44,15 @@ cd sglang
 
 pip install --upgrade pip
 pip install -e "python[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
-Next, [install FlashInfer](https://docs.flashinfer.ai/installation.html) for attention CUDA kernels.
-
 ### Notes
-- If you see triton errors, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
-- If you cannot install FlashInfer, you can use the slower triton kernels by adding `--disable-flashinfer` when launching the server.
-- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
+- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Quick Start
 The example below shows how to use sglang to answer a multi-turn question.
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 3beacacbb..4a9029706 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -20,8 +20,8 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
-       "zmq", "vllm==0.5.0", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.44"]
+srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
+       "psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.0", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 18305a2eb..c47b90cc6 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -291,11 +291,6 @@ class ServerArgs:
             action="store_true",
             help="Disable flashinfer inference kernels",
         )
-        parser.add_argument(
-            "--attention-reduce-in-fp32",
-            action="store_true",
-            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16.",
-        )
         parser.add_argument(
             "--disable-radix-cache",
             action="store_true",
@@ -311,6 +306,12 @@
             action="store_true",
             help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
         )
+        parser.add_argument(
+            "--attention-reduce-in-fp32",
+            action="store_true",
+            help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16. "
+            "This only affects Triton attention kernels.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
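
A note on how the flags touched by the `server_args.py` hunks are consumed. The sketch below is a minimal, self-contained model of the `add_cli_args` / `from_cli_args` pattern these hunks suggest; it is not the real `ServerArgs` class (which defines many more fields), and the name of the enclosing static method is an assumption based on the hunk context. It also illustrates why the trailing space in the split `help=` string matters: Python concatenates adjacent string literals with no separator.

```python
# Minimal sketch (not the actual sglang implementation) of the
# add_cli_args / from_cli_args pattern visible in the diff above.
import argparse
import dataclasses


@dataclasses.dataclass
class ServerArgs:
    # Only the two kernel-related flags touched by this diff.
    disable_flashinfer: bool = False
    attention_reduce_in_fp32: bool = False

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        parser.add_argument(
            "--disable-flashinfer",
            action="store_true",
            help="Disable flashinfer inference kernels",
        )
        parser.add_argument(
            "--attention-reduce-in-fp32",
            action="store_true",
            # Adjacent literals are concatenated verbatim, so without the
            # trailing space the two sentences would run together.
            help="Cast the intermediate attention results to fp32 to avoid "
            "possible crashes related to fp16. This only affects Triton "
            "attention kernels.",
        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        # argparse stores "--attention-reduce-in-fp32" as the attribute
        # "attention_reduce_in_fp32", so dataclass field names map directly.
        attrs = [field.name for field in dataclasses.fields(cls)]
        return cls(**{attr: getattr(args, attr) for attr in attrs})


parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args(["--attention-reduce-in-fp32"])
print(ServerArgs.from_cli_args(args))
# ServerArgs(disable_flashinfer=False, attention_reduce_in_fp32=True)
```

Assuming the usual launch entrypoint (`python -m sglang.launch_server`), `--disable-flashinfer` switches the server to the slower Triton kernels, and after this change `--attention-reduce-in-fp32` only affects that Triton path, which is why the diff regroups it with the other opt-out style toggles.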