Multiple minor fixes (#1530)
This commit is contained in:
@@ -163,7 +163,8 @@ curl http://localhost:30000/generate \
|
|||||||
}
|
}
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
Learn more about the argument format [here](docs/en/sampling_params.md).
|
|
||||||
|
Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).
|
||||||
|
|
||||||
### OpenAI Compatible API
|
### OpenAI Compatible API
|
||||||
In addition, the server supports OpenAI-compatible APIs.
|
In addition, the server supports OpenAI-compatible APIs.
|
||||||
@@ -202,7 +203,7 @@ response = client.embeddings.create(
|
|||||||
print(response)
|
print(response)
|
||||||
```
|
```
|
||||||
|
|
||||||
It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
|
It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
|
||||||
|
|
||||||
### Additional Server Arguments
|
### Additional Server Arguments
|
||||||
- To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
|
- To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
|
||||||
@@ -223,6 +224,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|||||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
|
||||||
```
|
```
|
||||||
- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
|
- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
|
||||||
|
- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
|
||||||
- To enable fp8 weight quantization, add `--quantization fp8` on an fp16 checkpoint or directly load an fp8 checkpoint without specifying any arguments.
|
- To enable fp8 weight quantization, add `--quantization fp8` on an fp16 checkpoint or directly load an fp8 checkpoint without specifying any arguments.
|
||||||
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
|
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
|
||||||
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
|
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
|
||||||
@@ -241,9 +243,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|||||||
- Llama / Llama 2 / Llama 3 / Llama 3.1
|
- Llama / Llama 2 / Llama 3 / Llama 3.1
|
||||||
- Mistral / Mixtral / Mistral NeMo
|
- Mistral / Mixtral / Mistral NeMo
|
||||||
- Gemma / Gemma 2
|
- Gemma / Gemma 2
|
||||||
- OLMoE
|
|
||||||
- Qwen / Qwen 2 / Qwen 2 MoE
|
- Qwen / Qwen 2 / Qwen 2 MoE
|
||||||
- DeepSeek / DeepSeek 2
|
- DeepSeek / DeepSeek 2
|
||||||
|
- OLMoE
|
||||||
- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
|
- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
|
||||||
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
|
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
|
||||||
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
|
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
|
||||||
@@ -265,7 +267,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|||||||
- XVERSE / XVERSE MoE
|
- XVERSE / XVERSE MoE
|
||||||
- SmolLM
|
- SmolLM
|
||||||
|
|
||||||
|
|
||||||
**Embedding Models**
|
**Embedding Models**
|
||||||
|
|
||||||
- e5-mistral
|
- e5-mistral
|
||||||
|
|||||||
@@ -19,7 +19,8 @@ curl http://localhost:30000/generate \
|
|||||||
}
|
}
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
Learn more about the argument format [here](https://sglang.readthedocs.io/en/latest/sampling_params.html).
|
|
||||||
|
Learn more about the argument specification, streaming, and multi-modal support [here](https://sglang.readthedocs.io/en/latest/sampling_params.html).
|
||||||
|
|
||||||
### OpenAI Compatible API
|
### OpenAI Compatible API
|
||||||
In addition, the server supports OpenAI-compatible APIs.
|
In addition, the server supports OpenAI-compatible APIs.
|
||||||
@@ -58,7 +59,7 @@ response = client.embeddings.create(
|
|||||||
print(response)
|
print(response)
|
||||||
```
|
```
|
||||||
|
|
||||||
It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
|
It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
|
||||||
|
|
||||||
### Additional Server Arguments
|
### Additional Server Arguments
|
||||||
- To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
|
- To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
|
||||||
@@ -79,6 +80,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|||||||
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
|
||||||
```
|
```
|
||||||
- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
|
- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
|
||||||
|
- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
|
||||||
- To enable fp8 weight quantization, add `--quantization fp8` on an fp16 checkpoint or directly load an fp8 checkpoint without specifying any arguments.
|
- To enable fp8 weight quantization, add `--quantization fp8` on an fp16 checkpoint or directly load an fp8 checkpoint without specifying any arguments.
|
||||||
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
|
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
|
||||||
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sglang.readthedocs.io/en/latest/custom_chat_template.html).
|
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sglang.readthedocs.io/en/latest/custom_chat_template.html).
|
||||||
@@ -99,6 +101,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|||||||
- Gemma / Gemma 2
|
- Gemma / Gemma 2
|
||||||
- Qwen / Qwen 2 / Qwen 2 MoE
|
- Qwen / Qwen 2 / Qwen 2 MoE
|
||||||
- DeepSeek / DeepSeek 2
|
- DeepSeek / DeepSeek 2
|
||||||
|
- OLMoE
|
||||||
- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
|
- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
|
||||||
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
|
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
|
||||||
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
|
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
|
||||||
@@ -115,6 +118,10 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|||||||
- ChatGLM
|
- ChatGLM
|
||||||
- InternLM 2
|
- InternLM 2
|
||||||
- Exaone 3
|
- Exaone 3
|
||||||
|
- BaiChuan2
|
||||||
|
- MiniCPM / MiniCPM 3
|
||||||
|
- XVERSE / XVERSE MoE
|
||||||
|
- SmolLM
|
||||||
|
|
||||||
**Embedding Models**
|
**Embedding Models**
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
"""
|
"""
|
||||||
Usage:
|
Usage:
|
||||||
|
|
||||||
python3 async_io.py
|
python3 async_io.py
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,9 @@
|
|||||||
"""
|
"""
|
||||||
Usage:
|
Usage:
|
||||||
|
|
||||||
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
||||||
python openai_batch_chat.py
|
python openai_batch_chat.py
|
||||||
|
|
||||||
Note: Before running this script,
|
Note: Before running this script,
|
||||||
you should create the input.jsonl file with the following content:
|
you should create the input.jsonl file with the following content:
|
||||||
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world! List 3 NBA players and tell a story"}],"max_tokens": 300}}
|
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world! List 3 NBA players and tell a story"}],"max_tokens": 300}}
|
||||||
@@ -13,12 +15,10 @@ import os
|
|||||||
import time
|
import time
|
||||||
|
|
||||||
import openai
|
import openai
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
|
|
||||||
class OpenAIBatchProcessor:
|
class OpenAIBatchProcessor:
|
||||||
def __init__(self, api_key):
|
def __init__(self):
|
||||||
# client = OpenAI(api_key=api_key)
|
|
||||||
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
|
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
|
||||||
|
|
||||||
self.client = client
|
self.client = client
|
||||||
@@ -81,8 +81,7 @@ class OpenAIBatchProcessor:
|
|||||||
|
|
||||||
|
|
||||||
# Initialize the OpenAIBatchProcessor
|
# Initialize the OpenAIBatchProcessor
|
||||||
api_key = os.environ.get("OPENAI_API_KEY")
|
processor = OpenAIBatchProcessor()
|
||||||
processor = OpenAIBatchProcessor(api_key)
|
|
||||||
|
|
||||||
# Process the batch job
|
# Process the batch job
|
||||||
input_file_path = "input.jsonl"
|
input_file_path = "input.jsonl"
|
||||||
|
|||||||
@@ -10,16 +10,13 @@ you should create the input.jsonl file with the following content:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
import openai
|
import openai
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
|
|
||||||
class OpenAIBatchProcessor:
|
class OpenAIBatchProcessor:
|
||||||
def __init__(self, api_key):
|
def __init__(self):
|
||||||
# client = OpenAI(api_key=api_key)
|
|
||||||
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
|
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
|
||||||
|
|
||||||
self.client = client
|
self.client = client
|
||||||
@@ -82,11 +79,10 @@ class OpenAIBatchProcessor:
|
|||||||
|
|
||||||
|
|
||||||
# Initialize the OpenAIBatchProcessor
|
# Initialize the OpenAIBatchProcessor
|
||||||
api_key = os.environ.get("OPENAI_API_KEY")
|
processor = OpenAIBatchProcessor()
|
||||||
processor = OpenAIBatchProcessor(api_key)
|
|
||||||
|
|
||||||
# Process the batch job
|
# Process the batch job
|
||||||
input_file_path = "input_complete.jsonl"
|
input_file_path = "input.jsonl"
|
||||||
endpoint = "/v1/completions"
|
endpoint = "/v1/completions"
|
||||||
completion_window = "24h"
|
completion_window = "24h"
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,6 @@
|
|||||||
# launch server
|
# launch server
|
||||||
# python -m sglang.launch_server --model LxzGordon/URM-LLaMa-3.1-8B --is-embedding
|
# python -m sglang.launch_server --model LxzGordon/URM-LLaMa-3.1-8B --is-embedding
|
||||||
|
|
||||||
import json
|
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
url = "http://127.0.0.1:30000"
|
url = "http://127.0.0.1:30000"
|
||||||
|
|||||||
@@ -235,6 +235,7 @@ class RuntimeEndpoint(BaseBackend):
|
|||||||
data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
|
data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
|
||||||
obj = self._generate_http_request(s, data)
|
obj = self._generate_http_request(s, data)
|
||||||
prompt_len = obj["meta_info"]["prompt_tokens"]
|
prompt_len = obj["meta_info"]["prompt_tokens"]
|
||||||
|
logprob_start_len = max(prompt_len - 2, 0) # For token healing
|
||||||
|
|
||||||
# Compute logprob
|
# Compute logprob
|
||||||
data = {
|
data = {
|
||||||
@@ -245,7 +246,7 @@ class RuntimeEndpoint(BaseBackend):
|
|||||||
},
|
},
|
||||||
"return_logprob": True,
|
"return_logprob": True,
|
||||||
"return_text_in_logprobs": True,
|
"return_text_in_logprobs": True,
|
||||||
"logprob_start_len": prompt_len - 2, # For token healing
|
"logprob_start_len": logprob_start_len,
|
||||||
}
|
}
|
||||||
obj = self._generate_http_request(s, data)
|
obj = self._generate_http_request(s, data)
|
||||||
|
|
||||||
@@ -258,8 +259,8 @@ class RuntimeEndpoint(BaseBackend):
|
|||||||
# Remove extra token if no token healing occurred
|
# Remove extra token if no token healing occurred
|
||||||
for i in range(len(input_token_logprobs)):
|
for i in range(len(input_token_logprobs)):
|
||||||
healed_token_str = input_token_logprobs[i][0][-1]
|
healed_token_str = input_token_logprobs[i][0][-1]
|
||||||
healed_token_logprob = input_token_logprobs[i][0][0]
|
|
||||||
if s.text_.endswith(healed_token_str):
|
if s.text_.endswith(healed_token_str):
|
||||||
|
healed_token_logprob = input_token_logprobs[i][0][0]
|
||||||
normalized_prompt_logprobs[i] = (
|
normalized_prompt_logprobs[i] = (
|
||||||
normalized_prompt_logprobs[i] * len(input_token_logprobs[i])
|
normalized_prompt_logprobs[i] * len(input_token_logprobs[i])
|
||||||
- healed_token_logprob
|
- healed_token_logprob
|
||||||
|
|||||||
@@ -615,7 +615,7 @@ class Runtime:
|
|||||||
if chunk == "data: [DONE]\n\n":
|
if chunk == "data: [DONE]\n\n":
|
||||||
break
|
break
|
||||||
data = json.loads(chunk[5:].strip("\n"))
|
data = json.loads(chunk[5:].strip("\n"))
|
||||||
if hasattr(data, "text"):
|
if "text" in data:
|
||||||
cur = data["text"][pos:]
|
cur = data["text"][pos:]
|
||||||
if cur:
|
if cur:
|
||||||
yield cur
|
yield cur
|
||||||
|
|||||||
Reference in New Issue
Block a user