Update install commands (#583)
This commit is contained in:
16
README.md
16
README.md
@@ -32,9 +32,10 @@ The core features include:
|
|||||||
### Method 1: With pip
|
### Method 1: With pip
|
||||||
```
|
```
|
||||||
pip install "sglang[all]"
|
pip install "sglang[all]"
|
||||||
```
|
|
||||||
|
|
||||||
Next, [install FlashInfer](https://docs.flashinfer.ai/installation.html) for attention CUDA kernels.
|
# Install FlashInfer CUDA kernels
|
||||||
|
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
|
||||||
|
```
|
||||||
|
|
||||||
### Method 2: From source
|
### Method 2: From source
|
||||||
```
|
```
|
||||||
@@ -43,14 +44,15 @@ cd sglang
|
|||||||
|
|
||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
pip install -e "python[all]"
|
pip install -e "python[all]"
|
||||||
|
|
||||||
|
# Install FlashInfer CUDA kernels
|
||||||
|
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
|
||||||
```
|
```
|
||||||
|
|
||||||
Next, [install FlashInfer](https://docs.flashinfer.ai/installation.html) for attention CUDA kernels.
|
|
||||||
|
|
||||||
### Notes
|
### Notes
|
||||||
- If you see triton errors, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
|
- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
|
||||||
- If you cannot install FlashInfer, you can use the slower triton kernels by adding `--disable-flashinfer` when launching the server.
|
- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
|
||||||
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
|
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
The example below shows how to use sglang to answer a multi-turn question.
|
The example below shows how to use sglang to answer a multi-turn question.
|
||||||
|
|||||||
@@ -20,8 +20,8 @@ dependencies = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
|
srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
|
||||||
"zmq", "vllm==0.5.0", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.44"]
|
"psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.0", "outlines>=0.0.44"]
|
||||||
openai = ["openai>=1.0", "tiktoken"]
|
openai = ["openai>=1.0", "tiktoken"]
|
||||||
anthropic = ["anthropic>=0.20.0"]
|
anthropic = ["anthropic>=0.20.0"]
|
||||||
litellm = ["litellm>=1.0.0"]
|
litellm = ["litellm>=1.0.0"]
|
||||||
|
|||||||
@@ -291,11 +291,6 @@ class ServerArgs:
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Disable flashinfer inference kernels",
|
help="Disable flashinfer inference kernels",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
|
||||||
"--attention-reduce-in-fp32",
|
|
||||||
action="store_true",
|
|
||||||
help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--disable-radix-cache",
|
"--disable-radix-cache",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
@@ -311,6 +306,12 @@ class ServerArgs:
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
|
help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--attention-reduce-in-fp32",
|
||||||
|
action="store_true",
|
||||||
|
help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
|
||||||
|
"This only affects Triton attention kernels",
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_cli_args(cls, args: argparse.Namespace):
|
def from_cli_args(cls, args: argparse.Namespace):
|
||||||
|
|||||||
Reference in New Issue
Block a user