diff --git a/README.md b/README.md
index a7b32628b..9c15f7b64 100644
--- a/README.md
+++ b/README.md
@@ -32,9 +32,10 @@ The core features include:
 ### Method 1: With pip
 ```
 pip install "sglang[all]"
-```
 
-Next, [install FlashInfer](https://docs.flashinfer.ai/installation.html) for attention CUDA kernels.
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+```
 
 ### Method 2: From source
 ```
@@ -43,14 +44,15 @@ cd sglang
 
 pip install --upgrade pip
 pip install -e "python[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
-Next, [install FlashInfer](https://docs.flashinfer.ai/installation.html) for attention CUDA kernels.
-
 ### Notes
-- If you see triton errors, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
-- If you cannot install FlashInfer, you can use the slower triton kernels by adding `--disable-flashinfer` when launching the server.
-- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
+- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Quick Start
 The example below shows how to use sglang to answer a multi-turn question.
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 3beacacbb..4a9029706 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -20,8 +20,8 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
-       "zmq", "vllm==0.5.0", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.44"]
+srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
+       "psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.0", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 18305a2eb..c47b90cc6 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -291,11 +291,6 @@ class ServerArgs:
             action="store_true",
             help="Disable flashinfer inference kernels",
         )
-        parser.add_argument(
-            "--attention-reduce-in-fp32",
-            action="store_true",
-            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16.",
-        )
         parser.add_argument(
             "--disable-radix-cache",
             action="store_true",
@@ -311,6 +306,12 @@
             action="store_true",
             help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
         )
+        parser.add_argument(
+            "--attention-reduce-in-fp32",
+            action="store_true",
+            help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16. "
+            "This only affects Triton attention kernels.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
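
A note on how the flags touched by the `server_args.py` hunks are consumed. The sketch below is a minimal, self-contained model of the `add_cli_args` / `from_cli_args` pattern these hunks suggest; it is not the real `ServerArgs` class (which defines many more fields), and the name of the enclosing static method is an assumption based on the hunk context. It also illustrates why the trailing space in the split `help=` string matters: Python concatenates adjacent string literals with no separator.

```python
# Minimal sketch (not the actual sglang implementation) of the
# add_cli_args / from_cli_args pattern visible in the diff above.
import argparse
import dataclasses


@dataclasses.dataclass
class ServerArgs:
    # Only the two kernel-related flags touched by this diff.
    disable_flashinfer: bool = False
    attention_reduce_in_fp32: bool = False

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        parser.add_argument(
            "--disable-flashinfer",
            action="store_true",
            help="Disable flashinfer inference kernels",
        )
        parser.add_argument(
            "--attention-reduce-in-fp32",
            action="store_true",
            # Adjacent literals are concatenated verbatim, so without the
            # trailing space the two sentences would run together.
            help="Cast the intermediate attention results to fp32 to avoid "
            "possible crashes related to fp16. This only affects Triton "
            "attention kernels.",
        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        # argparse stores "--attention-reduce-in-fp32" as the attribute
        # "attention_reduce_in_fp32", so dataclass field names map directly.
        attrs = [field.name for field in dataclasses.fields(cls)]
        return cls(**{attr: getattr(args, attr) for attr in attrs})


parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args(["--attention-reduce-in-fp32"])
print(ServerArgs.from_cli_args(args))
# ServerArgs(disable_flashinfer=False, attention_reduce_in_fp32=True)
```

Assuming the usual launch entrypoint (`python -m sglang.launch_server`), `--disable-flashinfer` switches the server to the slower Triton kernels, and after this change `--attention-reduce-in-fp32` only affects that Triton path, which is why the diff regroups it with the other opt-out style toggles.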