minor: Add basic editorconfig and pre-commit hooks to enforce style for whitespaces (#1926)

This commit is contained in:
Xuehai Pan
2024-11-06 21:46:04 +08:00
committed by GitHub
parent 96766101b4
commit a5e0defb5a
77 changed files with 209 additions and 172 deletions

25
.editorconfig Normal file
View File

@@ -0,0 +1,25 @@
# https://editorconfig.org/
# Repository-wide editor settings: defaults for every file come first,
# followed by per-file-type overrides in the later sections.
# Marks this as the top-most EditorConfig file, so editors stop looking in
# parent directories for further .editorconfig files.
root = true
# Defaults for all files: UTF-8, LF line endings, 4-space indentation,
# no trailing whitespace, and a newline at end of file.
[*]
charset = utf-8
end_of_line = lf
indent_style = space
indent_size = 4
trim_trailing_whitespace = true
insert_final_newline = true
# JSON and YAML files override the default indent with 2 spaces.
[*.{json,yaml,yml}]
indent_size = 2
# Markdown files use a 2-space indent.
# NOTE(review): x-soft-wrap-text is not a core EditorConfig property;
# presumably only editors/plugins that recognize it act on it - confirm.
[*.md]
indent_size = 2
x-soft-wrap-text = true
# reStructuredText files restate the 4-space indent already set in [*]
# and set the same x-soft-wrap-text extension property as Markdown.
[*.rst]
indent_size = 4
x-soft-wrap-text = true
# Makefiles are the one section here that switches indentation to tabs.
[Makefile]
indent_style = tab

View File

@@ -12,4 +12,4 @@
- [ ] Format your code according to the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/contributor_guide.md). - [ ] Format your code according to the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/contributor_guide.md).
- [ ] Add unit tests as outlined in the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/contributor_guide.md). - [ ] Add unit tests as outlined in the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/contributor_guide.md).
- [ ] Update documentation as needed, including docstrings or example tutorials. - [ ] Update documentation as needed, including docstrings or example tutorials.

View File

@@ -20,10 +20,10 @@ jobs:
github-token: ${{secrets.GITHUB_TOKEN}} github-token: ${{secrets.GITHUB_TOKEN}}
script: | script: |
const sixtyDaysAgo = new Date(Date.now() - 60 * 24 * 60 * 60 * 1000); const sixtyDaysAgo = new Date(Date.now() - 60 * 24 * 60 * 60 * 1000);
const [owner, repo] = process.env.GITHUB_REPOSITORY.split('/'); const [owner, repo] = process.env.GITHUB_REPOSITORY.split('/');
console.log(`Owner: ${owner}, Repo: ${repo}`); console.log(`Owner: ${owner}, Repo: ${repo}`);
async function fetchIssues(page = 1) { async function fetchIssues(page = 1) {
console.log(`Fetching issues for ${owner}/${repo}, page ${page}`); console.log(`Fetching issues for ${owner}/${repo}, page ${page}`);
return await github.rest.issues.listForRepo({ return await github.rest.issues.listForRepo({
@@ -36,23 +36,23 @@ jobs:
page: page page: page
}); });
} }
async function processIssues() { async function processIssues() {
console.log('Starting to process issues'); console.log('Starting to process issues');
console.log(`Repository: ${owner}/${repo}`); console.log(`Repository: ${owner}/${repo}`);
let page = 1; let page = 1;
let hasMoreIssues = true; let hasMoreIssues = true;
while (hasMoreIssues) { while (hasMoreIssues) {
try { try {
const issues = await fetchIssues(page); const issues = await fetchIssues(page);
console.log(`Fetched ${issues.data.length} issues on page ${page}`); console.log(`Fetched ${issues.data.length} issues on page ${page}`);
if (issues.data.length === 0) { if (issues.data.length === 0) {
hasMoreIssues = false; hasMoreIssues = false;
break; break;
} }
for (const issue of issues.data) { for (const issue of issues.data) {
if (new Date(issue.updated_at) < sixtyDaysAgo) { if (new Date(issue.updated_at) < sixtyDaysAgo) {
try { try {
@@ -87,5 +87,5 @@ jobs:
} }
console.log('Finished processing issues'); console.log('Finished processing issues');
} }
await processIssues(); await processIssues();

View File

@@ -18,7 +18,7 @@ concurrency:
group: execute-notebook-${{ github.ref }} group: execute-notebook-${{ github.ref }}
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
run-all-notebooks: run-all-notebooks:
runs-on: 1-gpu-runner runs-on: 1-gpu-runner
@@ -45,4 +45,4 @@ jobs:
run: | run: |
cd docs cd docs
make clean make clean
make compile make compile

View File

@@ -36,4 +36,4 @@ jobs:
run: | run: |
source "$HOME/.cargo/env" source "$HOME/.cargo/env"
cd rust/ cd rust/
cargo test cargo test

View File

@@ -237,7 +237,7 @@ jobs:
run: | run: |
cd test/srt cd test/srt
python3 test_moe_eval_accuracy_large.py python3 test_moe_eval_accuracy_large.py
- name: Evaluate MLA Accuracy (TP=2) - name: Evaluate MLA Accuracy (TP=2)
timeout-minutes: 10 timeout-minutes: 10
run: | run: |

View File

@@ -47,7 +47,7 @@ jobs:
make html make html
cd _build/html cd _build/html
git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1 git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1
rm -rf ../sgl-project.github.io/* rm -rf ../sgl-project.github.io/*
cp -r * ../sgl-project.github.io cp -r * ../sgl-project.github.io

2
.gitignore vendored
View File

@@ -185,4 +185,4 @@ tmp*.txt
work_dirs/ work_dirs/
*.csv *.csv
!logo.png !logo.png

View File

@@ -1,3 +1,3 @@
[settings] [settings]
profile=black profile=black
known_first_party=sglang known_first_party=sglang

View File

@@ -1,7 +1,27 @@
default_language_version: default_language_version:
python: python3.9 python: python3.9
default_stages: [pre-commit, pre-push, manual]
repos: repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: check-symlinks
- id: destroyed-symlinks
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
args: [--allow-multiple-documents]
- id: check-toml
- id: check-ast
- id: check-added-large-files
- id: check-merge-conflict
- id: check-executables-have-shebangs
- id: check-shebang-scripts-are-executable
- id: detect-private-key
- id: debug-statements
- id: no-commit-to-branch
- repo: https://github.com/PyCQA/isort - repo: https://github.com/PyCQA/isort
rev: 5.13.2 rev: 5.13.2
hooks: hooks:
@@ -13,8 +33,3 @@ repos:
additional_dependencies: ['.[jupyter]'] additional_dependencies: ['.[jupyter]']
types: [python, jupyter] types: [python, jupyter]
types_or: [python, jupyter] types_or: [python, jupyter]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: no-commit-to-branch

View File

@@ -6,5 +6,3 @@ Two primary methods are covered:
- [Torch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) - [Torch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)

View File

@@ -29,18 +29,18 @@ def _triton_kernel_funtion():
... ...
``` ```
## 2. Torch Tunable Operations ## 2. Torch Tunable Operations
**TunableOp** is a feature in PyTorch that allows for the definition and optimization of custom kernels with tunable parameters. This feature is particularly useful for enhancing the performance of kernels by experimenting with different configurations. **TunableOp** is a feature in PyTorch that allows for the definition and optimization of custom kernels with tunable parameters. This feature is particularly useful for enhancing the performance of kernels by experimenting with different configurations.
### Key Environment Variables: ### Key Environment Variables:
1. **PYTORCH_TUNABLEOP_ENABLED**: 1. **PYTORCH_TUNABLEOP_ENABLED**:
- Default: `0` - Default: `0`
- Set to `1` to enable TunableOp. - Set to `1` to enable TunableOp.
2. **PYTORCH_TUNABLEOP_TUNING**: 2. **PYTORCH_TUNABLEOP_TUNING**:
- Default: `1` - Default: `1`
- Set to `0` to disable tuning. If a tuned entry is not found, it will run the tuning step and record the entry when PYTORCH_TUNABLEOP_ENABLED is enabled. - Set to `0` to disable tuning. If a tuned entry is not found, it will run the tuning step and record the entry when PYTORCH_TUNABLEOP_ENABLED is enabled.
3. **PYTORCH_TUNABLEOP_VERBOSE**: 3. **PYTORCH_TUNABLEOP_VERBOSE**:
- Default: `0` - Default: `0`
- Set to `1` to enable verbose output for TunableOp. - Set to `1` to enable verbose output for TunableOp.
@@ -66,20 +66,20 @@ The following are suggestions for optimizing matrix multiplication (GEMM) and co
To tune Triton kernels with GEMM and convolution ops (conv), use the `torch.compile` function with the max-autotune mode. This benchmarks a predefined list of Triton configurations and selects the fastest one for each shape. To tune Triton kernels with GEMM and convolution ops (conv), use the `torch.compile` function with the max-autotune mode. This benchmarks a predefined list of Triton configurations and selects the fastest one for each shape.
### Key Configurations: ### Key Configurations:
1. **Max Autotune**: 1. **Max Autotune**:
- Set `torch._inductor.config.max_autotune = True` or `TORCHINDUCTOR_MAX_AUTOTUNE=1`. - Set `torch._inductor.config.max_autotune = True` or `TORCHINDUCTOR_MAX_AUTOTUNE=1`.
2. **Fine-Grained Control**: 2. **Fine-Grained Control**:
- Enable GEMM tuning: `torch._inductor.config.max_autotune_gemm = True`. - Enable GEMM tuning: `torch._inductor.config.max_autotune_gemm = True`.
- Enable tuning for pointwise/reduction ops: `torch._inductor.config.max_autotune_pointwise = True`. - Enable tuning for pointwise/reduction ops: `torch._inductor.config.max_autotune_pointwise = True`.
3. **Backend Selection**: 3. **Backend Selection**:
- Use `torch._inductor.config.max_autotune_gemm_backends` to limit backends to TRITON for better performance. - Use `torch._inductor.config.max_autotune_gemm_backends` to limit backends to TRITON for better performance.
4. **Freezing for Inference**: 4. **Freezing for Inference**:
- Use `torch._inductor.config.freezing=True` to enable constant folding optimizations. - Use `torch._inductor.config.freezing=True` to enable constant folding optimizations.
5. **Debugging**: 5. **Debugging**:
- Set `TORCH_COMPILE_DEBUG=1` to extract Triton kernels generated by Inductor. - Set `TORCH_COMPILE_DEBUG=1` to extract Triton kernels generated by Inductor.
### Example Code Block: ### Example Code Block:
@@ -98,4 +98,4 @@ TORCHINDUCTOR_FREEZING=1 your_script.sh
For more detailed information on tuning SGLang performance with AMD GPUs, please refer to the following link: For more detailed information on tuning SGLang performance with AMD GPUs, please refer to the following link:
[ROCm Documentation: Triton Kernel Performance Optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#triton-kernel-performance-optimization) [ROCm Documentation: Triton Kernel Performance Optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#triton-kernel-performance-optimization)

View File

@@ -21,4 +21,4 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32 python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33 python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34 python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35 python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35

View File

@@ -21,4 +21,4 @@ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name rando
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > vllm_log32 python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > vllm_log32
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > vllm_log33 python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > vllm_log33
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > vllm_log34 python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > vllm_log34
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > vllm_log35 python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > vllm_log35

View File

@@ -30,22 +30,22 @@ def poignancy_event_prompt(persona_name, persona_iss, event):
@sgl.function @sgl.function
def generate_event_triple(s, persona_name, action): def generate_event_triple(s, persona_name, action):
s += """Task: Turn the input into (subject, predicate, object). s += """Task: Turn the input into (subject, predicate, object).
Input: Sam Johnson is eating breakfast. Input: Sam Johnson is eating breakfast.
Output: (Dolores Murphy, eat, breakfast) Output: (Dolores Murphy, eat, breakfast)
--- ---
Input: Joon Park is brewing coffee. Input: Joon Park is brewing coffee.
Output: (Joon Park, brew, coffee) Output: (Joon Park, brew, coffee)
--- ---
Input: Jane Cook is sleeping. Input: Jane Cook is sleeping.
Output: (Jane Cook, is, sleep) Output: (Jane Cook, is, sleep)
--- ---
Input: Michael Bernstein is writing email on a computer. Input: Michael Bernstein is writing email on a computer.
Output: (Michael Bernstein, write, email) Output: (Michael Bernstein, write, email)
--- ---
Input: Percy Liang is teaching students in a classroom. Input: Percy Liang is teaching students in a classroom.
Output: (Percy Liang, teach, students) Output: (Percy Liang, teach, students)
--- ---
Input: Merrie Morris is running on a treadmill. Input: Merrie Morris is running on a treadmill.
Output: (Merrie Morris, run, treadmill) Output: (Merrie Morris, run, treadmill)
---""" ---"""
s += persona_name + "is" + action + ".\n" s += persona_name + "is" + action + ".\n"
@@ -56,22 +56,22 @@ Output: (Merrie Morris, run, treadmill)
def generate_event_triple_prompt(persona_name, action): def generate_event_triple_prompt(persona_name, action):
s = "" s = ""
s += """Task: Turn the input into (subject, predicate, object). s += """Task: Turn the input into (subject, predicate, object).
Input: Sam Johnson is eating breakfast. Input: Sam Johnson is eating breakfast.
Output: (Dolores Murphy, eat, breakfast) Output: (Dolores Murphy, eat, breakfast)
--- ---
Input: Joon Park is brewing coffee. Input: Joon Park is brewing coffee.
Output: (Joon Park, brew, coffee) Output: (Joon Park, brew, coffee)
--- ---
Input: Jane Cook is sleeping. Input: Jane Cook is sleeping.
Output: (Jane Cook, is, sleep) Output: (Jane Cook, is, sleep)
--- ---
Input: Michael Bernstein is writing email on a computer. Input: Michael Bernstein is writing email on a computer.
Output: (Michael Bernstein, write, email) Output: (Michael Bernstein, write, email)
--- ---
Input: Percy Liang is teaching students in a classroom. Input: Percy Liang is teaching students in a classroom.
Output: (Percy Liang, teach, students) Output: (Percy Liang, teach, students)
--- ---
Input: Merrie Morris is running on a treadmill. Input: Merrie Morris is running on a treadmill.
Output: (Merrie Morris, run, treadmill) Output: (Merrie Morris, run, treadmill)
---""" ---"""
s += persona_name + "is" + action + ".\n" s += persona_name + "is" + action + ".\n"
@@ -107,9 +107,9 @@ def action_location_sector(
current_action, current_action,
next_action, next_action,
): ):
s += """Task -- choose an appropriate area from the area options for a task at hand. s += """Task -- choose an appropriate area from the area options for a task at hand.
Sam Kim lives in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen. Sam Kim lives in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
Sam Kim is currently in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen. Sam Kim is currently in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
Area options: {Sam Kim's house, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}. Area options: {Sam Kim's house, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place. * Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim. * Must be one of the "Area options," verbatim.
@@ -117,7 +117,7 @@ For taking a walk, Sam Kim should go to the following area: {Johnson Park}
--- ---
Jane Anderson lives in {Oak Hill College Student Dormatory} that has Jane Anderson's room. Jane Anderson lives in {Oak Hill College Student Dormatory} that has Jane Anderson's room.
Jane Anderson is currently in {Oak Hill College} that has a classroom, library Jane Anderson is currently in {Oak Hill College} that has a classroom, library
Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}. Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place. * Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim. * Must be one of the "Area options," verbatim.
For eating dinner, Jane Anderson should go to the following area: {Hobbs Cafe} For eating dinner, Jane Anderson should go to the following area: {Hobbs Cafe}
@@ -167,9 +167,9 @@ def action_location_sector_prompt(
next_action, next_action,
): ):
s = "" s = ""
s += """Task -- choose an appropriate area from the area options for a task at hand. s += """Task -- choose an appropriate area from the area options for a task at hand.
Sam Kim lives in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen. Sam Kim lives in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
Sam Kim is currently in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen. Sam Kim is currently in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
Area options: {Sam Kim's house, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}. Area options: {Sam Kim's house, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place. * Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim. * Must be one of the "Area options," verbatim.
@@ -177,7 +177,7 @@ For taking a walk, Sam Kim should go to the following area: {Johnson Park}
--- ---
Jane Anderson lives in {Oak Hill College Student Dormatory} that has Jane Anderson's room. Jane Anderson lives in {Oak Hill College Student Dormatory} that has Jane Anderson's room.
Jane Anderson is currently in {Oak Hill College} that has a classroom, library Jane Anderson is currently in {Oak Hill College} that has a classroom, library
Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}. Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place. * Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim. * Must be one of the "Area options," verbatim.
For eating dinner, Jane Anderson should go to the following area: {Hobbs Cafe} For eating dinner, Jane Anderson should go to the following area: {Hobbs Cafe}
@@ -226,7 +226,7 @@ Stay in the current area if the activity can be done there. Never go into other
For cooking, Jane Anderson should go to the following area in Jane Anderson's house: For cooking, Jane Anderson should go to the following area in Jane Anderson's house:
Answer: {kitchen} Answer: {kitchen}
--- ---
Tom Watson is in common room in Tom Watson's apartment. Tom Watson is in common room in Tom Watson's apartment.
Tom Watson is going to Hobbs Cafe that has the following areas: {cafe} Tom Watson is going to Hobbs Cafe that has the following areas: {cafe}
Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary. Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
For getting coffee, Tom Watson should go to the following area in Hobbs Cafe: For getting coffee, Tom Watson should go to the following area in Hobbs Cafe:
@@ -240,7 +240,7 @@ Answer: {cafe}
+ target_sector_areas + target_sector_areas
+ "}\n" + "}\n"
) )
s += """* Stay in the current area if the activity can be done there. s += """* Stay in the current area if the activity can be done there.
* NEVER go into other people's rooms unless necessary.""" * NEVER go into other people's rooms unless necessary."""
s += ( s += (
persona_name persona_name
@@ -268,7 +268,7 @@ Stay in the current area if the activity can be done there. Never go into other
For cooking, Jane Anderson should go to the following area in Jane Anderson's house: For cooking, Jane Anderson should go to the following area in Jane Anderson's house:
Answer: {kitchen} Answer: {kitchen}
--- ---
Tom Watson is in common room in Tom Watson's apartment. Tom Watson is in common room in Tom Watson's apartment.
Tom Watson is going to Hobbs Cafe that has the following areas: {cafe} Tom Watson is going to Hobbs Cafe that has the following areas: {cafe}
Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary. Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
For getting coffee, Tom Watson should go to the following area in Hobbs Cafe: For getting coffee, Tom Watson should go to the following area in Hobbs Cafe:
@@ -282,7 +282,7 @@ Answer: {cafe}
+ target_sector_areas + target_sector_areas
+ "}\n" + "}\n"
) )
s += """* Stay in the current area if the activity can be done there. s += """* Stay in the current area if the activity can be done there.
* NEVER go into other people's rooms unless necessary.""" * NEVER go into other people's rooms unless necessary."""
s += ( s += (
persona_name persona_name

View File

@@ -20,7 +20,7 @@ outlines 0.0.22
Run Llama-7B Run Llama-7B
``` ```
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
``` ```
Run Mixtral-8x7B Run Mixtral-8x7B

View File

@@ -23,7 +23,7 @@ python3 build_dataset.py
Run Llama-7B Run Llama-7B
```bash ```bash
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
``` ```
Benchmark Character Generation Benchmark Character Generation

View File

@@ -47,4 +47,4 @@ Quirinus Quirrell
Nearly Headless Nick Nearly Headless Nick
Aunt Marge Aunt Marge
Griphook Griphook
Ludo Bagman Ludo Bagman

View File

@@ -4,7 +4,7 @@
python3 download_images.py python3 download_images.py
``` ```
image benchmark source: https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild image benchmark source: https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild
### Other Dependency ### Other Dependency
``` ```

0
benchmark/llava_bench/bench_hf_llava_bench.sh Normal file → Executable file
View File

0
benchmark/llava_bench/bench_hf_mme.sh Normal file → Executable file
View File

View File

@@ -30,4 +30,4 @@ python3 bench_other.py --backend guidance --num-questions 25 --parallel 1 --n-ct
``` ```
python3 bench_other.py --backend lmql --num-questions 25 --parallel 1 python3 bench_other.py --backend lmql --num-questions 25 --parallel 1
``` ```

View File

@@ -1,2 +1,2 @@
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
tar xf data.tar tar xf data.tar

View File

@@ -43,7 +43,7 @@ python3 bench_other.py --num-questions 8 --backend guidance --parallel 1 --n-ctx
``` ```
### Benchmark lmql ### Benchmark lmql
``` ```
python3 bench_other.py --num-questions 64 --backend lmql --parallel 1 python3 bench_other.py --num-questions 64 --backend lmql --parallel 1
``` ```

View File

@@ -31,4 +31,4 @@ python3 bench_other.py --num-questions 100 --backend guidance --parallel 1 --n-c
``` ```
python3 bench_other.py --num-questions 100 --backend lmql --parallel 1 python3 bench_other.py --num-questions 100 --backend lmql --parallel 1
``` ```

View File

@@ -11,7 +11,7 @@ from sglang.utils import dump_state_text, read_jsonl
def get_prompt(question): def get_prompt(question):
prompt = ( prompt = (
"""Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types: """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search. (1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage. (2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
(3) Finish[answer], which returns the answer and finishes the task. (3) Finish[answer], which returns the answer and finishes the task.
@@ -37,7 +37,7 @@ Action 1: Search[Milhouse]
Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening. Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.
Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up "named after". Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up "named after".
Action 2: Lookup[named after] Action 2: Lookup[named after]
Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous. Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous.
Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon. Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.
Action 3: Finish[Richard Nixon] Action 3: Finish[Richard Nixon]
Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture? Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?
@@ -62,10 +62,10 @@ Action 3: Finish[director, screenwriter, actor]
Question: Which magazine was started first Arthur's Magazine or First for Women? Question: Which magazine was started first Arthur's Magazine or First for Women?
Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first. Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first.
Action 1: Search[Arthur's Magazine] Action 1: Search[Arthur's Magazine]
Observation 1: Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century. Observation 1: Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century.
Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next. Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next.
Action 2: Search[First for Women] Action 2: Search[First for Women]
Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989.
Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first. Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.
Action 3: Finish[Arthur's Magazine] Action 3: Finish[Arthur's Magazine]
Question: Were Pavel Urysohn and Leonid Levin known for the same type of work? Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?
@@ -74,8 +74,8 @@ Action 1: Search[Pavel Urysohn]
Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 â August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory. Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 â August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.
Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work. Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.
Action 2: Search[Leonid Levin] Action 2: Search[Leonid Levin]
Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist. Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work. Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
Action 3: Finish[yes] Action 3: Finish[yes]
""" """
+ question + question

View File

@@ -13,7 +13,7 @@ from sglang.utils import dump_state_text, read_jsonl
@sgl.function @sgl.function
def webthink(s, question, triplets): def webthink(s, question, triplets):
s += ( s += (
"""Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types: """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search. (1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage. (2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
(3) Finish[answer], which returns the answer and finishes the task. (3) Finish[answer], which returns the answer and finishes the task.
@@ -39,7 +39,7 @@ Action 1: Search[Milhouse]
Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening. Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.
Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up "named after". Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up "named after".
Action 2: Lookup[named after] Action 2: Lookup[named after]
Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous. Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous.
Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon. Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.
Action 3: Finish[Richard Nixon] Action 3: Finish[Richard Nixon]
Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture? Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?
@@ -64,10 +64,10 @@ Action 3: Finish[director, screenwriter, actor]
Question: Which magazine was started first Arthur's Magazine or First for Women? Question: Which magazine was started first Arthur's Magazine or First for Women?
Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first. Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first.
Action 1: Search[Arthur's Magazine] Action 1: Search[Arthur's Magazine]
Observation 1: Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century. Observation 1: Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century.
Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next. Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next.
Action 2: Search[First for Women] Action 2: Search[First for Women]
Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989.
Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first. Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.
Action 3: Finish[Arthur's Magazine] Action 3: Finish[Arthur's Magazine]
Question: Were Pavel Urysohn and Leonid Levin known for the same type of work? Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?
@@ -76,8 +76,8 @@ Action 1: Search[Pavel Urysohn]
Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 – August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory. Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 – August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.
Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work. Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.
Action 2: Search[Leonid Levin] Action 2: Search[Leonid Levin]
Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist. Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work. Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
Action 3: Finish[yes] Action 3: Finish[yes]
""" """
+ question + question

View File

@@ -1 +1 @@
!topic.jsonl !topic.jsonl

View File

@@ -30,4 +30,4 @@ python3 bench_other.py --backend guidance --num-questions 32 --parallel 1 --n-ct
``` ```
python3 bench_other.py --backend lmql --num-questions 32 --parallel 1 python3 bench_other.py --backend lmql --num-questions 32 --parallel 1
``` ```

View File

@@ -47,4 +47,4 @@
{"topic": "self-publishing a book", "number": 7} {"topic": "self-publishing a book", "number": 7}
{"topic": "starting an urban farm", "number": 6} {"topic": "starting an urban farm", "number": 6}
{"topic": "improving your memory", "number": 8} {"topic": "improving your memory", "number": 8}
{"topic": "creating a personal brand online", "number": 9} {"topic": "creating a personal brand online", "number": 9}

View File

@@ -31,4 +31,4 @@ Clone [sgl-project.github.io](https://github.com/sgl-project/sgl-project.github.
```bash ```bash
export DOC_SITE_PATH=../../sgl-project.github.io # update this with your path export DOC_SITE_PATH=../../sgl-project.github.io # update this with your path
python3 deploy.py python3 deploy.py
``` ```

View File

@@ -5,13 +5,13 @@
table.autosummary td { table.autosummary td {
width: 50% width: 50%
} }
img.align-center { img.align-center {
display: block; display: block;
margin-left: auto; margin-left: auto;
margin-right: auto; margin-right: auto;
} }
.output_area.stderr { .output_area.stderr {
color: #d3d3d3 !important; color: #d3d3d3 !important;
} }
@@ -26,4 +26,4 @@ div.output_area.stderr {
div.output_area.stdout { div.output_area.stdout {
color: #d3d3d3 !important; color: #d3d3d3 !important;
} }

View File

@@ -147,7 +147,7 @@ docker run --gpus all \
lmsysorg/sglang:latest \ lmsysorg/sglang:latest \
python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000 python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
``` ```
</details> </details>
## Example: Run Llama 3.1 405B ## Example: Run Llama 3.1 405B

View File

@@ -198,4 +198,4 @@ nbsphinx_prolog = """
color: #d3d3d3 !important; /* light gray */ color: #d3d3d3 !important; /* light gray */
} }
</style> </style>
""" """

View File

@@ -1,22 +1,22 @@
# Deploy the documents # Deploy the documents
import os import os
from datetime import datetime from datetime import datetime
def run_cmd(cmd): def run_cmd(cmd):
print(cmd) print(cmd)
os.system(cmd) os.system(cmd)
run_cmd("cd $DOC_SITE_PATH; git pull") run_cmd("cd $DOC_SITE_PATH; git pull")
# (Optional) Remove old files # (Optional) Remove old files
# run_cmd("rm -rf $ALPA_SITE_PATH/*") # run_cmd("rm -rf $ALPA_SITE_PATH/*")
run_cmd("cp -r _build/html/* $DOC_SITE_PATH") run_cmd("cp -r _build/html/* $DOC_SITE_PATH")
cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
run_cmd( run_cmd(
f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main" f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main"
) )

View File

@@ -74,4 +74,4 @@ def example(s):
choices_method=sgl.unconditional_likelihood_normalized, choices_method=sgl.unconditional_likelihood_normalized,
) )
) )
``` ```

View File

@@ -37,4 +37,4 @@ You can also use the Jinja template format, defined by Hugging Face transformers
``` ```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template ./my_model_template.jinja python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template ./my_model_template.jinja
``` ```

View File

@@ -25,9 +25,9 @@ If you see `decode out of memory happened` occasionally but not frequently, it i
Data parallelism is better for throughput. When there is enough GPU memory, always favor data parallelism for throughput. Data parallelism is better for throughput. When there is enough GPU memory, always favor data parallelism for throughput.
### Avoid out-of-memory by Tuning `--chunked-prefill-size`, `--mem-fraction-static`, `--max-running-requests` ### Avoid out-of-memory by Tuning `--chunked-prefill-size`, `--mem-fraction-static`, `--max-running-requests`
If you see out of memory (OOM) errors, you can try to tune the following parameters. If you see out of memory (OOM) errors, you can try to tune the following parameters.
If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`. If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`.
If OOM happens during decoding, try to decrease `--max-running-requests`. If OOM happens during decoding, try to decrease `--max-running-requests`.
You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding. You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding.
### Try Advanced Options ### Try Advanced Options

View File

@@ -1,3 +1,3 @@
# Learn more # Learn more
You can find more blogs, slides, and videos about SGLang at [https://github.com/sgl-project/sgl-learning-materials](https://github.com/sgl-project/sgl-learning-materials). You can find more blogs, slides, and videos about SGLang at [https://github.com/sgl-project/sgl-learning-materials](https://github.com/sgl-project/sgl-learning-materials).

View File

@@ -223,4 +223,4 @@ response = requests.post(
}, },
) )
print(response.json()) print(response.json())
``` ```

View File

@@ -3,9 +3,9 @@
This page lists some common errors and tips for fixing them. This page lists some common errors and tips for fixing them.
## CUDA out of memory ## CUDA out of memory
If you see out of memory (OOM) errors, you can try to tune the following parameters. If you see out of memory (OOM) errors, you can try to tune the following parameters.
If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`. If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`.
If OOM happens during decoding, try to decrease `--max-running-requests`. If OOM happens during decoding, try to decrease `--max-running-requests`.
You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding. You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding.
## CUDA error: an illegal memory access was encountered ## CUDA error: an illegal memory access was encountered

View File

@@ -14,4 +14,4 @@ sphinx-book-theme
sphinx-copybutton sphinx-copybutton
sphinx-tabs sphinx-tabs
sphinxcontrib-mermaid sphinxcontrib-mermaid
urllib3<2.0.0 urllib3<2.0.0

View File

@@ -33,7 +33,7 @@ CUR_NODES_IDX=$2
VIDEO_DIR=$3 VIDEO_DIR=$3
MODEL_PATH=$4 MODEL_PATH=$4
NUM_FRAMES=$5 NUM_FRAMES=$5
@@ -73,16 +73,16 @@ for IDX in $(seq 1 $LOCAL_CHUNKS); do
( (
START=$(((IDX-1) * GPUS_PER_CHUNK)) START=$(((IDX-1) * GPUS_PER_CHUNK))
LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index
CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH}) CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH})
# Convert the chunk GPUs array to a comma-separated string # Convert the chunk GPUs array to a comma-separated string
CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}") CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}")
LOCAL_IDX=$((CUR_NODES_IDX * LOCAL_CHUNKS + IDX)) LOCAL_IDX=$((CUR_NODES_IDX * LOCAL_CHUNKS + IDX))
echo "Chunk $(($LOCAL_IDX - 1)) will run on GPUs $CHUNK_GPUS_STR" echo "Chunk $(($LOCAL_IDX - 1)) will run on GPUs $CHUNK_GPUS_STR"
# Pick a port for this chunk: 10000 plus a random offset, so each chunk gets its own randomly chosen port. # Pick a port for this chunk: 10000 plus a random offset, so each chunk gets its own randomly chosen port.
PORT=$((10000 + RANDOM % 55536)) PORT=$((10000 + RANDOM % 55536))
@@ -92,7 +92,7 @@ for IDX in $(seq 1 $LOCAL_CHUNKS); do
while [ $RETRY_COUNT -lt $MAX_RETRIES ] && [ $COMMAND_STATUS -ne 0 ]; do while [ $RETRY_COUNT -lt $MAX_RETRIES ] && [ $COMMAND_STATUS -ne 0 ]; do
echo "Running chunk $(($LOCAL_IDX - 1)) on GPUs $CHUNK_GPUS_STR with port $PORT. Attempt $(($RETRY_COUNT + 1))" echo "Running chunk $(($LOCAL_IDX - 1)) on GPUs $CHUNK_GPUS_STR with port $PORT. Attempt $(($RETRY_COUNT + 1))"
#!/bin/bash #!/bin/bash
CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 srt_example_llava_v.py \ CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 srt_example_llava_v.py \
--port $PORT \ --port $PORT \
@@ -102,10 +102,10 @@ for IDX in $(seq 1 $LOCAL_CHUNKS); do
--video-dir $VIDEO_DIR \ --video-dir $VIDEO_DIR \
--model-path $MODEL_PATH \ --model-path $MODEL_PATH \
--num-frames $NUM_FRAMES #& --num-frames $NUM_FRAMES #&
wait $! # Wait for the process to finish and capture its exit status wait $! # Wait for the process to finish and capture its exit status
COMMAND_STATUS=$? COMMAND_STATUS=$?
if [ $COMMAND_STATUS -ne 0 ]; then if [ $COMMAND_STATUS -ne 0 ]; then
echo "Execution failed for chunk $(($LOCAL_IDX - 1)), attempt $(($RETRY_COUNT + 1)). Retrying..." echo "Execution failed for chunk $(($LOCAL_IDX - 1)), attempt $(($RETRY_COUNT + 1)). Retrying..."
RETRY_COUNT=$(($RETRY_COUNT + 1)) RETRY_COUNT=$(($RETRY_COUNT + 1))
@@ -124,8 +124,8 @@ done
wait wait
cat work_dirs/llava_next_video_inference_results/final_results_chunk_*.csv > work_dirs/llava_next_video_inference_results/final_results_node_${CUR_NODES_IDX}.csv cat work_dirs/llava_next_video_inference_results/final_results_chunk_*.csv > work_dirs/llava_next_video_inference_results/final_results_node_${CUR_NODES_IDX}.csv
END_TIME=$(date +%s) # Capture end time END_TIME=$(date +%s) # Capture end time
ELAPSED_TIME=$(($END_TIME - $START_TIME)) ELAPSED_TIME=$(($END_TIME - $START_TIME))
echo "Total execution time: $ELAPSED_TIME seconds." echo "Total execution time: $ELAPSED_TIME seconds."

View File

@@ -4,8 +4,8 @@ Usage:
Show in "assistant" the desired answer format. Each "gen" term should have a stop token. Show in "assistant" the desired answer format. Each "gen" term should have a stop token.
The stream mode is not supported in speculative execution. The stream mode is not supported in speculative execution.
E.g. E.g.
correct: correct:
sgl.assistant("\nName:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n") + "\nJob:" + sgl.gen("job", stop="\n")) sgl.assistant("\nName:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n") + "\nJob:" + sgl.gen("job", stop="\n"))
incorrect: incorrect:
s += sgl.assistant("\nName:" + sgl.gen("name", stop="\n")) s += sgl.assistant("\nName:" + sgl.gen("name", stop="\n"))

View File

@@ -7,4 +7,4 @@ RUN git clone https://github.com/sgl-project/sglang.git
WORKDIR /opt/sglang WORKDIR /opt/sglang
RUN pip install --upgrade pip && \ RUN pip install --upgrade pip && \
pip install -e "python[all]" && \ pip install -e "python[all]" && \
pip install datasets pip install datasets

View File

@@ -32,4 +32,4 @@ curl -X POST http://localhost:8000/v2/models/character_generation/generate \
"INPUT_TEXT": ["harry"] "INPUT_TEXT": ["harry"]
}' }'
``` ```

View File

@@ -21,7 +21,7 @@ def main():
# Tokenize inputs # Tokenize inputs
tokenizer = get_tokenizer(MODEL_PATH) tokenizer = get_tokenizer(MODEL_PATH)
token_ids_list = [tokenizer.encode(prompt) for prompt in prompts] token_ids_list = [tokenizer.encode(prompt) for prompt in prompts]
# Create an LLM. # Create an LLM.
# You can also specify `skip_tokenizer_init=True`, but it requires explicit detokenization at the end # You can also specify `skip_tokenizer_init=True`, but it requires explicit detokenization at the end
llm = sgl.Engine(model_path=MODEL_PATH) llm = sgl.Engine(model_path=MODEL_PATH)
@@ -36,4 +36,4 @@ def main():
# The __main__ condition is necessary here because we use "spawn" to create subprocesses # The __main__ condition is necessary here because we use "spawn" to create subprocesses
# Spawn starts a fresh program every time, if there is no __main__, it will run into infinite loop to keep spawning processes from sgl.Engine # Spawn starts a fresh program every time, if there is no __main__, it will run into infinite loop to keep spawning processes from sgl.Engine
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@@ -37,4 +37,4 @@ curl -X POST http://localhost:8000/generate -H "Content-Type: application/json"
curl -X POST http://localhost:8000/generate_stream -H "Content-Type: application/json" -d '{"prompt": "The Transformer architecture is..."}' --no-buffer curl -X POST http://localhost:8000/generate_stream -H "Content-Type: application/json" -d '{"prompt": "The Transformer architecture is..."}' --no-buffer
``` ```
This will send both non-streaming and streaming requests to the server. This will send both non-streaming and streaming requests to the server.

View File

@@ -3,7 +3,7 @@ Usage:
# Installing latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git # Installing latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
# Installing latest sglang. # Installing latest sglang.
# Endpoint Service CLI: # Endpoint Service CLI:
python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000
python3 http_llama3_llava_test.py python3 http_llama3_llava_test.py

View File

@@ -3,7 +3,7 @@ Usage:
# Installing latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git # Installing latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
# Installing latest sglang. # Installing latest sglang.
# Endpoint Service CLI: # Endpoint Service CLI:
python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8
python3 http_qwen_llava_test.py python3 http_qwen_llava_test.py

View File

@@ -134,4 +134,4 @@ def method_has_implemented_embedding(
class_embedding = inspect.getattr_static(method_class, "embedding", None) class_embedding = inspect.getattr_static(method_class, "embedding", None)
return (class_embedding is not None return (class_embedding is not None
and class_embedding is not base_embedding) and class_embedding is not base_embedding)

View File

@@ -311,7 +311,7 @@ class VocabParallelEmbedding(torch.nn.Module):
def get_sharded_to_full_mapping(self) -> Optional[List[int]]: def get_sharded_to_full_mapping(self) -> Optional[List[int]]:
"""Get a mapping that can be used to reindex the gathered """Get a mapping that can be used to reindex the gathered
logits for sampling. logits for sampling.
During sampling, we gather logits from all ranks. The relationship During sampling, we gather logits from all ranks. The relationship
of index->token_id will follow the same format as outlined in the class of index->token_id will follow the same format as outlined in the class
docstring. However, after the gather, we want to reindex the final docstring. However, after the gather, we want to reindex the final
@@ -483,4 +483,4 @@ class ParallelLMHead(VocabParallelEmbedding):
def forward(self, input_): def forward(self, input_):
del input_ del input_
raise RuntimeError("LMHead's weights should be used in the sampler.") raise RuntimeError("LMHead's weights should be used in the sampler.")

View File

@@ -838,7 +838,7 @@ class Scheduler:
time_per_output_tokens_iter: List[float] = [] time_per_output_tokens_iter: List[float] = []
# Request stats # Request stats
# Decode # Decode
gen_throughput: float = 0.0 gen_throughput: float = 0.0
# Latency # Latency
time_e2e_requests: List[float] = [] time_e2e_requests: List[float] = []
@@ -866,11 +866,11 @@ class Scheduler:
time_waiting_requests.append(req.queued_time - req.created_time) time_waiting_requests.append(req.queued_time - req.created_time)
num_prompt_tokens_requests.append(len(req.origin_input_ids)) num_prompt_tokens_requests.append(len(req.origin_input_ids))
num_generation_tokens_requests.append(len(req.output_ids)) num_generation_tokens_requests.append(len(req.output_ids))
finished_reason_requests.append( finished_reason_requests.append(
req.finished_reason.to_json() req.finished_reason.to_json()
if req.finished_reason is not None if req.finished_reason is not None
else None) else None)
return Stats( return Stats(
new_seq=new_seq, new_seq=new_seq,
num_running_req=num_running_req, num_running_req=num_running_req,

View File

@@ -384,7 +384,7 @@ class TokenizerManager:
obj.load_format = self.server_args.load_format obj.load_format = self.server_args.load_format
if not self.model_update_lock.locked(): if not self.model_update_lock.locked():
async with self.model_update_lock: async with self.model_update_lock:
# wait for the previous generation requests to finish # wait for the previous generation requests to finish
while len(self.rid_to_state) > 0: while len(self.rid_to_state) > 0:

View File

@@ -151,7 +151,7 @@ class Metrics:
0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5 1.0, 2.5
]) ])
# Request Stats # Request Stats
# Metadata # Metadata
self.num_prompt_tokens_requests = Histogram( self.num_prompt_tokens_requests = Histogram(
@@ -253,7 +253,7 @@ class PrometheusMetricsCollector(MetricsCollector):
stats.time_to_first_tokens_iter) stats.time_to_first_tokens_iter)
self._log_histogram(self.metrics.histogram_time_per_output_token, self._log_histogram(self.metrics.histogram_time_per_output_token,
stats.time_per_output_tokens_iter) stats.time_per_output_tokens_iter)
# self._log_gauge(self.metrics.gpu_cache_usage_sys, stats.gpu_cache_usage_sys) # self._log_gauge(self.metrics.gpu_cache_usage_sys, stats.gpu_cache_usage_sys)
self._log_gauge(self.metrics.num_running_sys, stats.num_running_req) self._log_gauge(self.metrics.num_running_sys, stats.num_running_req)
self._log_gauge(self.metrics.num_waiting_sys, stats.num_waiting_req) self._log_gauge(self.metrics.num_waiting_sys, stats.num_waiting_req)
@@ -294,4 +294,4 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
buckets.append(value) buckets.append(value)
else: else:
return buckets return buckets
exponent += 1 exponent += 1

View File

@@ -54,4 +54,4 @@ class Stats:
num_prompt_tokens_iter: int = 0 num_prompt_tokens_iter: int = 0
num_generation_tokens_iter: int = 0 num_generation_tokens_iter: int = 0
time_to_first_tokens_iter: List[float] = field(default_factory=list) time_to_first_tokens_iter: List[float] = field(default_factory=list)
time_per_output_tokens_iter: List[float] = field(default_factory=list) time_per_output_tokens_iter: List[float] = field(default_factory=list)

View File

@@ -17,7 +17,7 @@ limitations under the License.
""" """
Utilities for multi-modal models. Utilities for multi-modal models.
This python file mainly contains utilities that were used in the This python file mainly contains utilities that were used in the
image processing logic of llava-next including operations such as image processing logic of llava-next including operations such as
anyres and anyres_max anyres and anyres_max

View File

@@ -136,7 +136,7 @@ class GPT2Block(nn.Module):
layer_id: int, layer_id: int,
config: GPT2Config, config: GPT2Config,
cache_config = None, cache_config = None,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
prefix: str = "", prefix: str = "",
): ):
@@ -284,4 +284,4 @@ class GPT2LMHeadModel(nn.Module):
default_weight_loader) default_weight_loader)
weight_loader(param, loaded_weight) weight_loader(param, loaded_weight)
EntryClass = GPT2LMHeadModel EntryClass = GPT2LMHeadModel

0
python/sglang/srt/models/olmo.py Executable file → Normal file
View File

View File

@@ -57,27 +57,27 @@ logger = init_logger(__name__)
class Qwen2VLImageInputs(TypedDict): class Qwen2VLImageInputs(TypedDict):
pixel_values: torch.Tensor pixel_values: torch.Tensor
"""Shape: """Shape:
`(num_patches, num_channels * patch_size * patch_size)` `(num_patches, num_channels * patch_size * patch_size)`
""" """
image_grid_thw: torch.Tensor image_grid_thw: torch.Tensor
"""Shape: `(num_images, 3)` """Shape: `(num_images, 3)`
This should be in `(grid_t, grid_h, grid_w)` format. This should be in `(grid_t, grid_h, grid_w)` format.
""" """
class Qwen2VLVideoInputs(TypedDict): class Qwen2VLVideoInputs(TypedDict):
pixel_values_videos: torch.Tensor pixel_values_videos: torch.Tensor
"""Shape: """Shape:
`(num_patches, `(num_patches,
num_channels * temporal_patch_size * patch_size * patch_size)` num_channels * temporal_patch_size * patch_size * patch_size)`
""" """
video_grid_thw: torch.Tensor video_grid_thw: torch.Tensor
"""Shape: `(num_videos, 3)` """Shape: `(num_videos, 3)`
This should be in `(grid_t, grid_h, grid_w)` format. This should be in `(grid_t, grid_h, grid_w)` format.
""" """

View File

@@ -759,7 +759,7 @@ class Engine:
# before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown() # before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
atexit.register(self.shutdown) atexit.register(self.shutdown)
# runtime server default log level is log # runtime server default log level is log
# offline engine works in scripts, so we set it to error # offline engine works in scripts, so we set it to error

File diff suppressed because one or more lines are too long

View File

@@ -320,7 +320,7 @@ jinja_env = jinja2.Environment(
_message_template = """ _message_template = """
<div class="message {{ role }}"> <div class="message {{ role }}">
<div class="role"> <div class="role">
{{ role }} {{ role }}
{% if variant %}<span class="variant">({{ variant }})</span>{% endif %} {% if variant %}<span class="variant">({{ variant }})</span>{% endif %}
</div> </div>
<div class="content"> <div class="content">

View File

@@ -2,8 +2,8 @@
""" """
HumanEval: Evaluating Large Language Models Trained on Code HumanEval: Evaluating Large Language Models Trained on Code
Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. 
Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba
https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/ https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
""" """
import random import random

View File

@@ -1,10 +1,10 @@
# Adapted from https://github.com/openai/simple-evals/ # Adapted from https://github.com/openai/simple-evals/
""" """
MGSM: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems. MGSM: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems.
Language Models are Multilingual Chain-of-Thought Reasoners Language Models are Multilingual Chain-of-Thought Reasoners
Freda Shi, Mirac Suzgun, Markus Freitag, Xuezhi Wang, Suraj Srivats, Soroush Vosoughi, Hyung Won Chung, Yi Tay, Sebastian Ruder, Denny Zhou, Dipanjan Das, Jason Wei Freda Shi, Mirac Suzgun, Markus Freitag, Xuezhi Wang, Suraj Srivats, Soroush Vosoughi, Hyung Won Chung, Yi Tay, Sebastian Ruder, Denny Zhou, Dipanjan Das, Jason Wei
https://arxiv.org/abs/2210.03057 reference: https://github.com/google-research/url-nlp https://arxiv.org/abs/2210.03057 reference: https://github.com/google-research/url-nlp
""" """
import re import re

View File

@@ -22,4 +22,4 @@ rand = "0.8.5"
reqwest = { version = "0.12.8", features = ["stream"] } reqwest = { version = "0.12.8", features = ["stream"] }
futures-util = "0.3" futures-util = "0.3"
serde_json = "=1.0.1" serde_json = "=1.0.1"
pyo3 = { version = "0.22.5", features = ["extension-module"] } pyo3 = { version = "0.22.5", features = ["extension-module"] }

View File

@@ -36,7 +36,7 @@ Usage: router [OPTIONS]
Options: Options:
--host <HOST> [default: 127.0.0.1] --host <HOST> [default: 127.0.0.1]
--port <PORT> [default: 3001] --port <PORT> [default: 3001]
--worker-urls <WORKER_URLS> --worker-urls <WORKER_URLS>
--policy <POLICY> [default: round_robin] [possible values: round_robin, random] --policy <POLICY> [default: round_robin] [possible values: round_robin, random]
-h, --help Print help -h, --help Print help
-V, --version Print version -V, --version Print version
@@ -82,11 +82,11 @@ $ pip install <path to wheel>
1. Run test 1. Run test
``` ```
$ cargo test $ cargo test
``` ```
2. Run lint 2. Run lint
``` ```
$ cargo fmt $ cargo fmt
``` ```

View File

@@ -7,9 +7,9 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
# Follow the installation prompts, then reload your shell # Follow the installation prompts, then reload your shell
. "$HOME/.cargo/env" . "$HOME/.cargo/env"
source $HOME/.cargo/env source $HOME/.cargo/env
# Verify installation # Verify installation
rustc --version rustc --version
cargo --version cargo --version

View File

@@ -10,7 +10,7 @@ HF_TOKEN = "..."
prompt = """ prompt = """
### Instruction: ### Instruction:
Write a poem about the transformers Python library. Write a poem about the transformers Python library.
Mention the word "large language models" in that poem. Mention the word "large language models" in that poem.
### Response: ### Response:
The Transformers are large language models, The Transformers are large language models,

View File

@@ -5,7 +5,7 @@ MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
ADAPTER = "/home/ying/test_lora" ADAPTER = "/home/ying/test_lora"
prompt = """ prompt = """
### Instruction: ### Instruction:
Write a poem about the transformers Python library. Write a poem about the transformers Python library.
Mention the word "large language models" in that poem. Mention the word "large language models" in that poem.
### Response: ### Response:
The Transformers are large language models, The Transformers are large language models,

5
scripts/version_branch_to_tag.sh Normal file → Executable file
View File

@@ -1,8 +1,8 @@
#!/bin/bash #!/bin/bash
# This script is used for release. # This script is used for release.
# It tags all remote branches starting with 'v' with the same name as the branch, # It tags all remote branches starting with 'v' with the same name as the branch,
# deletes the corresponding branches from the remote, and pushes the tags to the remote repository. # deletes the corresponding branches from the remote, and pushes the tags to the remote repository.
git fetch origin --prune git fetch origin --prune
@@ -27,4 +27,3 @@ done
git push --tags git push --tags
echo "All branches starting with 'v' have been tagged, deleted from remote, and pushed to the remote repository." echo "All branches starting with 'v' have been tagged, deleted from remote, and pushed to the remote repository."

View File

@@ -1,6 +1,6 @@
# Run Unit Tests # Run Unit Tests
SGLang uses the built-in library [unittest](https://docs.python.org/3/library/unittest.html) as the testing framework. SGLang uses the built-in library [unittest](https://docs.python.org/3/library/unittest.html) as the testing framework.
## Test Backend Runtime ## Test Backend Runtime
```bash ```bash

File diff suppressed because one or more lines are too long

0
test/srt/models/test_generation_models.py Executable file → Normal file
View File

View File

@@ -45,7 +45,7 @@ TORCH_DTYPES = [torch.float16]
PROMPTS = [ PROMPTS = [
""" """
### Instruction: ### Instruction:
Write a poem about the transformers Python library. Write a poem about the transformers Python library.
Mention the word "large language models" in that poem. Mention the word "large language models" in that poem.
### Response: ### Response:
The Transformers are large language models, The Transformers are large language models,

View File

@@ -53,7 +53,7 @@ class TestDataParallelism(unittest.TestCase):
# pause a few seconds then send again # pause a few seconds then send again
time.sleep(5) time.sleep(5)
response = requests.post( response = requests.post(
self.base_url + "/update_weights", self.base_url + "/update_weights",
json={"model_path": DEFAULT_MODEL_NAME_FOR_TEST}, json={"model_path": DEFAULT_MODEL_NAME_FOR_TEST},

View File

@@ -11,9 +11,9 @@ from sglang.test.test_utils import (
) )
MANY_NEW_TOKENS_PROMPT = """ MANY_NEW_TOKENS_PROMPT = """
Please write an extremely detailed and vivid fantasy story, set in a world full of intricate magic systems, political intrigue, and complex characters. Please write an extremely detailed and vivid fantasy story, set in a world full of intricate magic systems, political intrigue, and complex characters.
Ensure that you thoroughly describe every scene, character's motivations, and the environment. Include long, engaging dialogues and elaborate on the inner thoughts of the characters. Ensure that you thoroughly describe every scene, character's motivations, and the environment. Include long, engaging dialogues and elaborate on the inner thoughts of the characters.
Each section should be as comprehensive as possible to create a rich and immersive experience for the reader. Each section should be as comprehensive as possible to create a rich and immersive experience for the reader.
The story should span multiple events, challenges, and character developments over time. Aim to make the story at least 3,000 words long. The story should span multiple events, challenges, and character developments over time. Aim to make the story at least 3,000 words long.
""" """
@@ -109,7 +109,7 @@ class TestMatchedStop(unittest.TestCase):
llama_format_prompt = """ llama_format_prompt = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|> <|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|> You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
What is 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|> What is 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
""" """
eos_token_id = 128009 eos_token_id = 128009