minor: Add basic editorconfig and pre-commit hooks to enforce style for whitespaces (#1926)
This commit is contained in:
25
.editorconfig
Normal file
25
.editorconfig
Normal file
@@ -0,0 +1,25 @@
|
||||
# https://editorconfig.org/
|
||||
|
||||
root = true
|
||||
|
||||
[*]
|
||||
charset = utf-8
|
||||
end_of_line = lf
|
||||
indent_style = space
|
||||
indent_size = 4
|
||||
trim_trailing_whitespace = true
|
||||
insert_final_newline = true
|
||||
|
||||
[*.{json,yaml,yml}]
|
||||
indent_size = 2
|
||||
|
||||
[*.md]
|
||||
indent_size = 2
|
||||
x-soft-wrap-text = true
|
||||
|
||||
[*.rst]
|
||||
indent_size = 4
|
||||
x-soft-wrap-text = true
|
||||
|
||||
[Makefile]
|
||||
indent_style = tab
|
||||
2
.github/pull_request_template.md
vendored
2
.github/pull_request_template.md
vendored
@@ -12,4 +12,4 @@
|
||||
|
||||
- [ ] Format your code according to the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/contributor_guide.md).
|
||||
- [ ] Add unit tests as outlined in the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/contributor_guide.md).
|
||||
- [ ] Update documentation as needed, including docstrings or example tutorials.
|
||||
- [ ] Update documentation as needed, including docstrings or example tutorials.
|
||||
|
||||
14
.github/workflows/close-inactive-issues.yml
vendored
14
.github/workflows/close-inactive-issues.yml
vendored
@@ -20,10 +20,10 @@ jobs:
|
||||
github-token: ${{secrets.GITHUB_TOKEN}}
|
||||
script: |
|
||||
const sixtyDaysAgo = new Date(Date.now() - 60 * 24 * 60 * 60 * 1000);
|
||||
|
||||
|
||||
const [owner, repo] = process.env.GITHUB_REPOSITORY.split('/');
|
||||
console.log(`Owner: ${owner}, Repo: ${repo}`);
|
||||
|
||||
|
||||
async function fetchIssues(page = 1) {
|
||||
console.log(`Fetching issues for ${owner}/${repo}, page ${page}`);
|
||||
return await github.rest.issues.listForRepo({
|
||||
@@ -36,23 +36,23 @@ jobs:
|
||||
page: page
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
async function processIssues() {
|
||||
console.log('Starting to process issues');
|
||||
console.log(`Repository: ${owner}/${repo}`);
|
||||
|
||||
|
||||
let page = 1;
|
||||
let hasMoreIssues = true;
|
||||
while (hasMoreIssues) {
|
||||
try {
|
||||
const issues = await fetchIssues(page);
|
||||
console.log(`Fetched ${issues.data.length} issues on page ${page}`);
|
||||
|
||||
|
||||
if (issues.data.length === 0) {
|
||||
hasMoreIssues = false;
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
for (const issue of issues.data) {
|
||||
if (new Date(issue.updated_at) < sixtyDaysAgo) {
|
||||
try {
|
||||
@@ -87,5 +87,5 @@ jobs:
|
||||
}
|
||||
console.log('Finished processing issues');
|
||||
}
|
||||
|
||||
|
||||
await processIssues();
|
||||
|
||||
4
.github/workflows/execute-notebook.yml
vendored
4
.github/workflows/execute-notebook.yml
vendored
@@ -18,7 +18,7 @@ concurrency:
|
||||
group: execute-notebook-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
|
||||
|
||||
jobs:
|
||||
run-all-notebooks:
|
||||
runs-on: 1-gpu-runner
|
||||
@@ -45,4 +45,4 @@ jobs:
|
||||
run: |
|
||||
cd docs
|
||||
make clean
|
||||
make compile
|
||||
make compile
|
||||
|
||||
2
.github/workflows/pr-test-rust.yml
vendored
2
.github/workflows/pr-test-rust.yml
vendored
@@ -36,4 +36,4 @@ jobs:
|
||||
run: |
|
||||
source "$HOME/.cargo/env"
|
||||
cd rust/
|
||||
cargo test
|
||||
cargo test
|
||||
|
||||
2
.github/workflows/pr-test.yml
vendored
2
.github/workflows/pr-test.yml
vendored
@@ -237,7 +237,7 @@ jobs:
|
||||
run: |
|
||||
cd test/srt
|
||||
python3 test_moe_eval_accuracy_large.py
|
||||
|
||||
|
||||
- name: Evaluate MLA Accuracy (TP=2)
|
||||
timeout-minutes: 10
|
||||
run: |
|
||||
|
||||
2
.github/workflows/release-docs.yml
vendored
2
.github/workflows/release-docs.yml
vendored
@@ -47,7 +47,7 @@ jobs:
|
||||
|
||||
make html
|
||||
cd _build/html
|
||||
|
||||
|
||||
git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1
|
||||
rm -rf ../sgl-project.github.io/*
|
||||
cp -r * ../sgl-project.github.io
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -185,4 +185,4 @@ tmp*.txt
|
||||
work_dirs/
|
||||
*.csv
|
||||
|
||||
!logo.png
|
||||
!logo.png
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
[settings]
|
||||
profile=black
|
||||
known_first_party=sglang
|
||||
known_first_party=sglang
|
||||
|
||||
@@ -1,7 +1,27 @@
|
||||
default_language_version:
|
||||
python: python3.9
|
||||
|
||||
default_stages: [pre-commit, pre-push, manual]
|
||||
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v5.0.0
|
||||
hooks:
|
||||
- id: check-symlinks
|
||||
- id: destroyed-symlinks
|
||||
- id: trailing-whitespace
|
||||
- id: end-of-file-fixer
|
||||
- id: check-yaml
|
||||
args: [--allow-multiple-documents]
|
||||
- id: check-toml
|
||||
- id: check-ast
|
||||
- id: check-added-large-files
|
||||
- id: check-merge-conflict
|
||||
- id: check-executables-have-shebangs
|
||||
- id: check-shebang-scripts-are-executable
|
||||
- id: detect-private-key
|
||||
- id: debug-statements
|
||||
- id: no-commit-to-branch
|
||||
- repo: https://github.com/PyCQA/isort
|
||||
rev: 5.13.2
|
||||
hooks:
|
||||
@@ -13,8 +33,3 @@ repos:
|
||||
additional_dependencies: ['.[jupyter]']
|
||||
types: [python, jupyter]
|
||||
types_or: [python, jupyter]
|
||||
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v5.0.0
|
||||
hooks:
|
||||
- id: no-commit-to-branch
|
||||
2
3rdparty/amd/profiling/PROFILING.md
vendored
2
3rdparty/amd/profiling/PROFILING.md
vendored
@@ -6,5 +6,3 @@ Two primary methods are covered:
|
||||
|
||||
|
||||
- [Torch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
|
||||
|
||||
|
||||
|
||||
18
3rdparty/amd/tuning/TUNING.md
vendored
18
3rdparty/amd/tuning/TUNING.md
vendored
@@ -29,18 +29,18 @@ def _triton_kernel_funtion():
|
||||
...
|
||||
```
|
||||
## 2. Torch Tunable Operations
|
||||
**TunableOp** is a feature in PyTorch that allows for the definition and optimization of custom kernels with tunable parameters. This feature is particularly useful for enhancing the performance of kernels by experimenting with different configurations.
|
||||
**TunableOp** is a feature in PyTorch that allows for the definition and optimization of custom kernels with tunable parameters. This feature is particularly useful for enhancing the performance of kernels by experimenting with different configurations.
|
||||
|
||||
### Key Environment Variables:
|
||||
1. **PYTORCH_TUNABLEOP_ENABLED**:
|
||||
1. **PYTORCH_TUNABLEOP_ENABLED**:
|
||||
- Default: `0`
|
||||
- Set to `1` to enable TunableOp.
|
||||
|
||||
2. **PYTORCH_TUNABLEOP_TUNING**:
|
||||
2. **PYTORCH_TUNABLEOP_TUNING**:
|
||||
- Default: `1`
|
||||
- Set to `0` to disable tuning. If a tuned entry is not found, it will run the tuning step and record the entry when PYTORCH_TUNABLEOP_ENABLED is enabled.
|
||||
|
||||
3. **PYTORCH_TUNABLEOP_VERBOSE**:
|
||||
3. **PYTORCH_TUNABLEOP_VERBOSE**:
|
||||
- Default: `0`
|
||||
- Set to `1` to enable verbose output for TunableOp.
|
||||
|
||||
@@ -66,20 +66,20 @@ The following are suggestions for optimizing matrix multiplication (GEMM) and co
|
||||
To tune Triton kernels with GEMM and convolution ops (conv), use the `torch.compile` function with the max-autotune mode. This benchmarks a predefined list of Triton configurations and selects the fastest one for each shape.
|
||||
|
||||
### Key Configurations:
|
||||
1. **Max Autotune**:
|
||||
1. **Max Autotune**:
|
||||
- Set `torch._inductor.config.max_autotune = True` or `TORCHINDUCTOR_MAX_AUTOTUNE=1`.
|
||||
|
||||
2. **Fine-Grained Control**:
|
||||
- Enable GEMM tuning: `torch._inductor.config.max_autotune_gemm = True`.
|
||||
- Enable tuning for pointwise/reduction ops: `torch._inductor.config.max_autotune.pointwise = True`.
|
||||
|
||||
3. **Backend Selection**:
|
||||
3. **Backend Selection**:
|
||||
- Use `torch._inductor.max_autotune_gemm_backends` to limit backends to TRITON for better performance.
|
||||
|
||||
4. **Freezing for Inference**:
|
||||
4. **Freezing for Inference**:
|
||||
- Use `torch._inductor.config.freezing=True` to enable constant folding optimizations.
|
||||
|
||||
5. **Debugging**:
|
||||
5. **Debugging**:
|
||||
- Set `TORCH_COMPILE_DEBUG=1` to extract Triton kernels generated by Inductor.
|
||||
|
||||
### Example Code Block:
|
||||
@@ -98,4 +98,4 @@ TORCHINDUCTOR_FREEZING=1 your_script.sh
|
||||
|
||||
For more detailed information on tuning SGLang performance with AMD GPUs, please refer to the following link:
|
||||
|
||||
[ROCm Documentation: Triton Kernel Performance Optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#triton-kernel-performance-optimization)
|
||||
[ROCm Documentation: Triton Kernel Performance Optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#triton-kernel-performance-optimization)
|
||||
|
||||
@@ -21,4 +21,4 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
|
||||
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32
|
||||
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33
|
||||
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34
|
||||
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35
|
||||
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35
|
||||
|
||||
@@ -21,4 +21,4 @@ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name rando
|
||||
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > vllm_log32
|
||||
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > vllm_log33
|
||||
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > vllm_log34
|
||||
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > vllm_log35
|
||||
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > vllm_log35
|
||||
|
||||
@@ -30,22 +30,22 @@ def poignancy_event_prompt(persona_name, persona_iss, event):
|
||||
@sgl.function
|
||||
def generate_event_triple(s, persona_name, action):
|
||||
s += """Task: Turn the input into (subject, predicate, object).
|
||||
Input: Sam Johnson is eating breakfast.
|
||||
Output: (Dolores Murphy, eat, breakfast)
|
||||
---
|
||||
Input: Sam Johnson is eating breakfast.
|
||||
Output: (Dolores Murphy, eat, breakfast)
|
||||
---
|
||||
Input: Joon Park is brewing coffee.
|
||||
Output: (Joon Park, brew, coffee)
|
||||
---
|
||||
Input: Jane Cook is sleeping.
|
||||
Input: Jane Cook is sleeping.
|
||||
Output: (Jane Cook, is, sleep)
|
||||
---
|
||||
Input: Michael Bernstein is writing email on a computer.
|
||||
Input: Michael Bernstein is writing email on a computer.
|
||||
Output: (Michael Bernstein, write, email)
|
||||
---
|
||||
Input: Percy Liang is teaching students in a classroom.
|
||||
Input: Percy Liang is teaching students in a classroom.
|
||||
Output: (Percy Liang, teach, students)
|
||||
---
|
||||
Input: Merrie Morris is running on a treadmill.
|
||||
Input: Merrie Morris is running on a treadmill.
|
||||
Output: (Merrie Morris, run, treadmill)
|
||||
---"""
|
||||
s += persona_name + "is" + action + ".\n"
|
||||
@@ -56,22 +56,22 @@ Output: (Merrie Morris, run, treadmill)
|
||||
def generate_event_triple_prompt(persona_name, action):
|
||||
s = ""
|
||||
s += """Task: Turn the input into (subject, predicate, object).
|
||||
Input: Sam Johnson is eating breakfast.
|
||||
Output: (Dolores Murphy, eat, breakfast)
|
||||
---
|
||||
Input: Sam Johnson is eating breakfast.
|
||||
Output: (Dolores Murphy, eat, breakfast)
|
||||
---
|
||||
Input: Joon Park is brewing coffee.
|
||||
Output: (Joon Park, brew, coffee)
|
||||
---
|
||||
Input: Jane Cook is sleeping.
|
||||
Input: Jane Cook is sleeping.
|
||||
Output: (Jane Cook, is, sleep)
|
||||
---
|
||||
Input: Michael Bernstein is writing email on a computer.
|
||||
Input: Michael Bernstein is writing email on a computer.
|
||||
Output: (Michael Bernstein, write, email)
|
||||
---
|
||||
Input: Percy Liang is teaching students in a classroom.
|
||||
Input: Percy Liang is teaching students in a classroom.
|
||||
Output: (Percy Liang, teach, students)
|
||||
---
|
||||
Input: Merrie Morris is running on a treadmill.
|
||||
Input: Merrie Morris is running on a treadmill.
|
||||
Output: (Merrie Morris, run, treadmill)
|
||||
---"""
|
||||
s += persona_name + "is" + action + ".\n"
|
||||
@@ -107,9 +107,9 @@ def action_location_sector(
|
||||
current_action,
|
||||
next_action,
|
||||
):
|
||||
s += """Task -- choose an appropriate area from the area options for a task at hand.
|
||||
s += """Task -- choose an appropriate area from the area options for a task at hand.
|
||||
Sam Kim lives in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
|
||||
Sam Kim is currently in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
|
||||
Sam Kim is currently in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
|
||||
Area options: {Sam Kim's house, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
|
||||
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
|
||||
* Must be one of the "Area options," verbatim.
|
||||
@@ -117,7 +117,7 @@ For taking a walk, Sam Kim should go to the following area: {Johnson Park}
|
||||
---
|
||||
Jane Anderson lives in {Oak Hill College Student Dormatory} that has Jane Anderson's room.
|
||||
Jane Anderson is currently in {Oak Hill College} that has a classroom, library
|
||||
Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
|
||||
Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
|
||||
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
|
||||
* Must be one of the "Area options," verbatim.
|
||||
For eating dinner, Jane Anderson should go to the following area: {Hobbs Cafe}
|
||||
@@ -167,9 +167,9 @@ def action_location_sector_prompt(
|
||||
next_action,
|
||||
):
|
||||
s = ""
|
||||
s += """Task -- choose an appropriate area from the area options for a task at hand.
|
||||
s += """Task -- choose an appropriate area from the area options for a task at hand.
|
||||
Sam Kim lives in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
|
||||
Sam Kim is currently in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
|
||||
Sam Kim is currently in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
|
||||
Area options: {Sam Kim's house, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
|
||||
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
|
||||
* Must be one of the "Area options," verbatim.
|
||||
@@ -177,7 +177,7 @@ For taking a walk, Sam Kim should go to the following area: {Johnson Park}
|
||||
---
|
||||
Jane Anderson lives in {Oak Hill College Student Dormatory} that has Jane Anderson's room.
|
||||
Jane Anderson is currently in {Oak Hill College} that has a classroom, library
|
||||
Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
|
||||
Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
|
||||
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
|
||||
* Must be one of the "Area options," verbatim.
|
||||
For eating dinner, Jane Anderson should go to the following area: {Hobbs Cafe}
|
||||
@@ -226,7 +226,7 @@ Stay in the current area if the activity can be done there. Never go into other
|
||||
For cooking, Jane Anderson should go to the following area in Jane Anderson's house:
|
||||
Answer: {kitchen}
|
||||
---
|
||||
Tom Watson is in common room in Tom Watson's apartment.
|
||||
Tom Watson is in common room in Tom Watson's apartment.
|
||||
Tom Watson is going to Hobbs Cafe that has the following areas: {cafe}
|
||||
Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
|
||||
For getting coffee, Tom Watson should go to the following area in Hobbs Cafe:
|
||||
@@ -240,7 +240,7 @@ Answer: {cafe}
|
||||
+ target_sector_areas
|
||||
+ "}\n"
|
||||
)
|
||||
s += """* Stay in the current area if the activity can be done there.
|
||||
s += """* Stay in the current area if the activity can be done there.
|
||||
* NEVER go into other people's rooms unless necessary."""
|
||||
s += (
|
||||
persona_name
|
||||
@@ -268,7 +268,7 @@ Stay in the current area if the activity can be done there. Never go into other
|
||||
For cooking, Jane Anderson should go to the following area in Jane Anderson's house:
|
||||
Answer: {kitchen}
|
||||
---
|
||||
Tom Watson is in common room in Tom Watson's apartment.
|
||||
Tom Watson is in common room in Tom Watson's apartment.
|
||||
Tom Watson is going to Hobbs Cafe that has the following areas: {cafe}
|
||||
Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
|
||||
For getting coffee, Tom Watson should go to the following area in Hobbs Cafe:
|
||||
@@ -282,7 +282,7 @@ Answer: {cafe}
|
||||
+ target_sector_areas
|
||||
+ "}\n"
|
||||
)
|
||||
s += """* Stay in the current area if the activity can be done there.
|
||||
s += """* Stay in the current area if the activity can be done there.
|
||||
* NEVER go into other people's rooms unless necessary."""
|
||||
s += (
|
||||
persona_name
|
||||
|
||||
@@ -20,7 +20,7 @@ outlines 0.0.22
|
||||
Run Llama-7B
|
||||
|
||||
```
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
||||
```
|
||||
|
||||
Run Mixtral-8x7B
|
||||
|
||||
@@ -23,7 +23,7 @@ python3 build_dataset.py
|
||||
Run Llama-7B
|
||||
|
||||
```bash
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
||||
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
|
||||
```
|
||||
|
||||
Benchmark Character Generation
|
||||
|
||||
@@ -47,4 +47,4 @@ Quirinus Quirrell
|
||||
Nearly Headless Nick
|
||||
Aunt Marge
|
||||
Griphook
|
||||
Ludo Bagman
|
||||
Ludo Bagman
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
python3 download_images.py
|
||||
```
|
||||
|
||||
image benchmark source: https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild
|
||||
image benchmark source: https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild
|
||||
|
||||
### Other Dependency
|
||||
```
|
||||
|
||||
0
benchmark/llava_bench/bench_hf_llava_bench.sh
Normal file → Executable file
0
benchmark/llava_bench/bench_hf_llava_bench.sh
Normal file → Executable file
0
benchmark/llava_bench/bench_hf_mme.sh
Normal file → Executable file
0
benchmark/llava_bench/bench_hf_mme.sh
Normal file → Executable file
@@ -30,4 +30,4 @@ python3 bench_other.py --backend guidance --num-questions 25 --parallel 1 --n-ct
|
||||
|
||||
```
|
||||
python3 bench_other.py --backend lmql --num-questions 25 --parallel 1
|
||||
```
|
||||
```
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
|
||||
tar xf data.tar
|
||||
tar xf data.tar
|
||||
|
||||
@@ -43,7 +43,7 @@ python3 bench_other.py --num-questions 8 --backend guidance --parallel 1 --n-ctx
|
||||
```
|
||||
|
||||
### Benchmark lmql
|
||||
|
||||
|
||||
```
|
||||
python3 bench_other.py --num-questions 64 --backend lmql --parallel 1
|
||||
```
|
||||
|
||||
@@ -31,4 +31,4 @@ python3 bench_other.py --num-questions 100 --backend guidance --parallel 1 --n-c
|
||||
|
||||
```
|
||||
python3 bench_other.py --num-questions 100 --backend lmql --parallel 1
|
||||
```
|
||||
```
|
||||
|
||||
@@ -11,7 +11,7 @@ from sglang.utils import dump_state_text, read_jsonl
|
||||
|
||||
def get_prompt(question):
|
||||
prompt = (
|
||||
"""Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
|
||||
"""Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
|
||||
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
|
||||
(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
|
||||
(3) Finish[answer], which returns the answer and finishes the task.
|
||||
@@ -37,7 +37,7 @@ Action 1: Search[Milhouse]
|
||||
Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.
|
||||
Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up "named after".
|
||||
Action 2: Lookup[named after]
|
||||
Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous.
|
||||
Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous.
|
||||
Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.
|
||||
Action 3: Finish[Richard Nixon]
|
||||
Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?
|
||||
@@ -62,10 +62,10 @@ Action 3: Finish[director, screenwriter, actor]
|
||||
Question: Which magazine was started first Arthur's Magazine or First for Women?
|
||||
Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first.
|
||||
Action 1: Search[Arthur's Magazine]
|
||||
Observation 1: Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century.
|
||||
Observation 1: Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century.
|
||||
Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next.
|
||||
Action 2: Search[First for Women]
|
||||
Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989.
|
||||
Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989.
|
||||
Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.
|
||||
Action 3: Finish[Arthur's Magazine]
|
||||
Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?
|
||||
@@ -74,8 +74,8 @@ Action 1: Search[Pavel Urysohn]
|
||||
Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 â August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.
|
||||
Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.
|
||||
Action 2: Search[Leonid Levin]
|
||||
Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
|
||||
Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
|
||||
Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
|
||||
Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
|
||||
Action 3: Finish[yes]
|
||||
"""
|
||||
+ question
|
||||
|
||||
@@ -13,7 +13,7 @@ from sglang.utils import dump_state_text, read_jsonl
|
||||
@sgl.function
|
||||
def webthink(s, question, triplets):
|
||||
s += (
|
||||
"""Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
|
||||
"""Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
|
||||
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
|
||||
(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
|
||||
(3) Finish[answer], which returns the answer and finishes the task.
|
||||
@@ -39,7 +39,7 @@ Action 1: Search[Milhouse]
|
||||
Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.
|
||||
Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up "named after".
|
||||
Action 2: Lookup[named after]
|
||||
Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous.
|
||||
Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous.
|
||||
Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.
|
||||
Action 3: Finish[Richard Nixon]
|
||||
Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?
|
||||
@@ -64,10 +64,10 @@ Action 3: Finish[director, screenwriter, actor]
|
||||
Question: Which magazine was started first Arthur's Magazine or First for Women?
|
||||
Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first.
|
||||
Action 1: Search[Arthur's Magazine]
|
||||
Observation 1: Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century.
|
||||
Observation 1: Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century.
|
||||
Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next.
|
||||
Action 2: Search[First for Women]
|
||||
Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989.
|
||||
Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989.
|
||||
Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.
|
||||
Action 3: Finish[Arthur's Magazine]
|
||||
Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?
|
||||
@@ -76,8 +76,8 @@ Action 1: Search[Pavel Urysohn]
|
||||
Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 â August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.
|
||||
Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.
|
||||
Action 2: Search[Leonid Levin]
|
||||
Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
|
||||
Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
|
||||
Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
|
||||
Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
|
||||
Action 3: Finish[yes]
|
||||
"""
|
||||
+ question
|
||||
|
||||
2
benchmark/tip_suggestion/.gitignore
vendored
2
benchmark/tip_suggestion/.gitignore
vendored
@@ -1 +1 @@
|
||||
!topic.jsonl
|
||||
!topic.jsonl
|
||||
|
||||
@@ -30,4 +30,4 @@ python3 bench_other.py --backend guidance --num-questions 32 --parallel 1 --n-ct
|
||||
|
||||
```
|
||||
python3 bench_other.py --backend lmql --num-questions 32 --parallel 1
|
||||
```
|
||||
```
|
||||
|
||||
@@ -47,4 +47,4 @@
|
||||
{"topic": "self-publishing a book", "number": 7}
|
||||
{"topic": "starting an urban farm", "number": 6}
|
||||
{"topic": "improving your memory", "number": 8}
|
||||
{"topic": "creating a personal brand online", "number": 9}
|
||||
{"topic": "creating a personal brand online", "number": 9}
|
||||
|
||||
@@ -31,4 +31,4 @@ Clone [sgl-project.github.io](https://github.com/sgl-project/sgl-project.github.
|
||||
```bash
|
||||
export DOC_SITE_PATH=../../sgl-project.github.io # update this with your path
|
||||
python3 deploy.py
|
||||
```
|
||||
```
|
||||
|
||||
6
docs/_static/css/custom_log.css
vendored
6
docs/_static/css/custom_log.css
vendored
@@ -5,13 +5,13 @@
|
||||
table.autosummary td {
|
||||
width: 50%
|
||||
}
|
||||
|
||||
|
||||
img.align-center {
|
||||
display: block;
|
||||
margin-left: auto;
|
||||
margin-right: auto;
|
||||
}
|
||||
|
||||
|
||||
.output_area.stderr {
|
||||
color: #d3d3d3 !important;
|
||||
}
|
||||
@@ -26,4 +26,4 @@ div.output_area.stderr {
|
||||
|
||||
div.output_area.stdout {
|
||||
color: #d3d3d3 !important;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,7 +147,7 @@ docker run --gpus all \
|
||||
lmsysorg/sglang:latest \
|
||||
python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
|
||||
```
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
## Example: Run Llama 3.1 405B
|
||||
|
||||
@@ -198,4 +198,4 @@ nbsphinx_prolog = """
|
||||
color: #d3d3d3 !important; /* light gray */
|
||||
}
|
||||
</style>
|
||||
"""
|
||||
"""
|
||||
|
||||
@@ -1,22 +1,22 @@
|
||||
# Deploy the documents
|
||||
# Deploy the documents
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def run_cmd(cmd):
|
||||
print(cmd)
|
||||
os.system(cmd)
|
||||
def run_cmd(cmd):
|
||||
print(cmd)
|
||||
os.system(cmd)
|
||||
|
||||
|
||||
run_cmd("cd $DOC_SITE_PATH; git pull")
|
||||
run_cmd("cd $DOC_SITE_PATH; git pull")
|
||||
|
||||
# (Optional) Remove old files
|
||||
# run_cmd("rm -rf $ALPA_SITE_PATH/*")
|
||||
# (Optional) Remove old files
|
||||
# run_cmd("rm -rf $ALPA_SITE_PATH/*")
|
||||
|
||||
run_cmd("cp -r _build/html/* $DOC_SITE_PATH")
|
||||
run_cmd("cp -r _build/html/* $DOC_SITE_PATH")
|
||||
|
||||
cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
|
||||
run_cmd(
|
||||
f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main"
|
||||
)
|
||||
cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
|
||||
run_cmd(
|
||||
f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main"
|
||||
)
|
||||
|
||||
@@ -74,4 +74,4 @@ def example(s):
|
||||
choices_method=sgl.unconditional_likelihood_normalized,
|
||||
)
|
||||
)
|
||||
```
|
||||
```
|
||||
|
||||
@@ -37,4 +37,4 @@ You can also use the Jinja template format, defined by Hugging Face transformers
|
||||
|
||||
```
|
||||
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template ./my_model_template.jinja
|
||||
```
|
||||
```
|
||||
|
||||
@@ -25,9 +25,9 @@ If you see `decode out of memory happened` occasionally but not frequently, it i
|
||||
Data parallelism is better for throughput. When there is enough GPU memory, always favor data parallelism for throughput.
|
||||
|
||||
### Avoid out-of-memory by Tuning `--chunked-prefill-size`, `--mem-fraction-static`, `--max-running-requests`
|
||||
If you see out of memory (OOM) errors, you can try to tune the following parameters.
|
||||
If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`.
|
||||
If OOM happens during decoding, try to decrease `--max-running-requests`.
|
||||
If you see out of memory (OOM) errors, you can try to tune the following parameters.
|
||||
If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`.
|
||||
If OOM happens during decoding, try to decrease `--max-running-requests`.
|
||||
You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding.
|
||||
|
||||
### Try Advanced Options
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
# Learn more
|
||||
|
||||
You can find more blogs, slides, and videos about SGLang at [https://github.com/sgl-project/sgl-learning-materials](https://github.com/sgl-project/sgl-learning-materials).
|
||||
You can find more blogs, slides, and videos about SGLang at [https://github.com/sgl-project/sgl-learning-materials](https://github.com/sgl-project/sgl-learning-materials).
|
||||
|
||||
@@ -223,4 +223,4 @@ response = requests.post(
|
||||
},
|
||||
)
|
||||
print(response.json())
|
||||
```
|
||||
```
|
||||
|
||||
@@ -3,9 +3,9 @@
|
||||
This page lists some common errors and tips for fixing them.
|
||||
|
||||
## CUDA out of memory
|
||||
If you see out of memory (OOM) errors, you can try to tune the following parameters.
|
||||
If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`.
|
||||
If OOM happens during decoding, try to decrease `--max-running-requests`.
|
||||
If you see out of memory (OOM) errors, you can try to tune the following parameters.
|
||||
If OOM happens during prefill, try to decrease `--chunked-prefill-size` to `4096` or `2048`.
|
||||
If OOM happens during decoding, try to decrease `--max-running-requests`.
|
||||
You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding.
|
||||
|
||||
## CUDA error: an illegal memory access was encountered
|
||||
|
||||
@@ -14,4 +14,4 @@ sphinx-book-theme
|
||||
sphinx-copybutton
|
||||
sphinx-tabs
|
||||
sphinxcontrib-mermaid
|
||||
urllib3<2.0.0
|
||||
urllib3<2.0.0
|
||||
|
||||
18
examples/frontend_language/usage/llava_video/srt_example_llava_v.sh
Normal file → Executable file
18
examples/frontend_language/usage/llava_video/srt_example_llava_v.sh
Normal file → Executable file
@@ -33,7 +33,7 @@ CUR_NODES_IDX=$2
|
||||
|
||||
VIDEO_DIR=$3
|
||||
|
||||
MODEL_PATH=$4
|
||||
MODEL_PATH=$4
|
||||
|
||||
NUM_FRAMES=$5
|
||||
|
||||
@@ -73,16 +73,16 @@ for IDX in $(seq 1 $LOCAL_CHUNKS); do
|
||||
(
|
||||
START=$(((IDX-1) * GPUS_PER_CHUNK))
|
||||
LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index
|
||||
|
||||
|
||||
CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH})
|
||||
|
||||
|
||||
# Convert the chunk GPUs array to a comma-separated string
|
||||
CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}")
|
||||
|
||||
LOCAL_IDX=$((CUR_NODES_IDX * LOCAL_CHUNKS + IDX))
|
||||
|
||||
echo "Chunk $(($LOCAL_IDX - 1)) will run on GPUs $CHUNK_GPUS_STR"
|
||||
|
||||
|
||||
# Calculate the port for this chunk. Ensure it's incremented by 5 for each chunk.
|
||||
PORT=$((10000 + RANDOM % 55536))
|
||||
|
||||
@@ -92,7 +92,7 @@ for IDX in $(seq 1 $LOCAL_CHUNKS); do
|
||||
|
||||
while [ $RETRY_COUNT -lt $MAX_RETRIES ] && [ $COMMAND_STATUS -ne 0 ]; do
|
||||
echo "Running chunk $(($LOCAL_IDX - 1)) on GPUs $CHUNK_GPUS_STR with port $PORT. Attempt $(($RETRY_COUNT + 1))"
|
||||
|
||||
|
||||
#!/bin/bash
|
||||
CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 srt_example_llava_v.py \
|
||||
--port $PORT \
|
||||
@@ -102,10 +102,10 @@ for IDX in $(seq 1 $LOCAL_CHUNKS); do
|
||||
--video-dir $VIDEO_DIR \
|
||||
--model-path $MODEL_PATH \
|
||||
--num-frames $NUM_FRAMES #&
|
||||
|
||||
|
||||
wait $! # Wait for the process to finish and capture its exit status
|
||||
COMMAND_STATUS=$?
|
||||
|
||||
|
||||
if [ $COMMAND_STATUS -ne 0 ]; then
|
||||
echo "Execution failed for chunk $(($LOCAL_IDX - 1)), attempt $(($RETRY_COUNT + 1)). Retrying..."
|
||||
RETRY_COUNT=$(($RETRY_COUNT + 1))
|
||||
@@ -124,8 +124,8 @@ done
|
||||
|
||||
wait
|
||||
|
||||
cat work_dirs/llava_next_video_inference_results/final_results_chunk_*.csv > work_dirs/llava_next_video_inference_results/final_results_node_${CUR_NODES_IDX}.csv
|
||||
cat work_dirs/llava_next_video_inference_results/final_results_chunk_*.csv > work_dirs/llava_next_video_inference_results/final_results_node_${CUR_NODES_IDX}.csv
|
||||
|
||||
END_TIME=$(date +%s) # Capture end time
|
||||
ELAPSED_TIME=$(($END_TIME - $START_TIME))
|
||||
echo "Total execution time: $ELAPSED_TIME seconds."
|
||||
echo "Total execution time: $ELAPSED_TIME seconds."
|
||||
|
||||
@@ -4,8 +4,8 @@ Usage:
|
||||
Show in "assistant" the desired answer format. Each "gen" term should have a stop token.
|
||||
The stream mode is not supported in speculative execution.
|
||||
|
||||
E.g.
|
||||
correct:
|
||||
E.g.
|
||||
correct:
|
||||
sgl.assistant("\nName:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n") + "\nJob:" + sgl.gen("job", stop="\n"))
|
||||
incorrect:
|
||||
s += sgl.assistant("\nName:" + sgl.gen("name", stop="\n"))
|
||||
|
||||
@@ -7,4 +7,4 @@ RUN git clone https://github.com/sgl-project/sglang.git
|
||||
WORKDIR /opt/sglang
|
||||
RUN pip install --upgrade pip && \
|
||||
pip install -e "python[all]" && \
|
||||
pip install datasets
|
||||
pip install datasets
|
||||
|
||||
@@ -32,4 +32,4 @@ curl -X POST http://localhost:8000/v2/models/character_generation/generate \
|
||||
"INPUT_TEXT": ["harry"]
|
||||
}'
|
||||
|
||||
```
|
||||
```
|
||||
|
||||
@@ -21,7 +21,7 @@ def main():
|
||||
# Tokenize inputs
|
||||
tokenizer = get_tokenizer(MODEL_PATH)
|
||||
token_ids_list = [tokenizer.encode(prompt) for prompt in prompts]
|
||||
|
||||
|
||||
# Create an LLM.
|
||||
# You can also specify `skip_tokenizer_init=True`, but it requires explicit detokenization at the end
|
||||
llm = sgl.Engine(model_path=MODEL_PATH)
|
||||
@@ -36,4 +36,4 @@ def main():
|
||||
# The __main__ condition is necessary here because we use "spawn" to create subprocesses
|
||||
# Spawn starts a fresh program every time, if there is no __main__, it will run into infinite loop to keep spawning processes from sgl.Engine
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
@@ -37,4 +37,4 @@ curl -X POST http://localhost:8000/generate -H "Content-Type: application/json"
|
||||
curl -X POST http://localhost:8000/generate_stream -H "Content-Type: application/json" -d '{"prompt": "The Transformer architecture is..."}' --no-buffer
|
||||
```
|
||||
|
||||
This will send both non-streaming and streaming requests to the server.
|
||||
This will send both non-streaming and streaming requests to the server.
|
||||
|
||||
@@ -3,7 +3,7 @@ Usage:
|
||||
# Installing latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
|
||||
# Installing latest sglang.
|
||||
|
||||
# Endpoint Service CLI:
|
||||
# Endpoint Service CLI:
|
||||
python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000
|
||||
|
||||
python3 http_llama3_llava_test.py
|
||||
|
||||
@@ -3,7 +3,7 @@ Usage:
|
||||
# Installing latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
|
||||
# Installing latest sglang.
|
||||
|
||||
# Endpoint Service CLI:
|
||||
# Endpoint Service CLI:
|
||||
python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8
|
||||
|
||||
python3 http_qwen_llava_test.py
|
||||
|
||||
@@ -134,4 +134,4 @@ def method_has_implemented_embedding(
|
||||
class_embedding = inspect.getattr_static(method_class, "embedding", None)
|
||||
|
||||
return (class_embedding is not None
|
||||
and class_embedding is not base_embedding)
|
||||
and class_embedding is not base_embedding)
|
||||
|
||||
@@ -311,7 +311,7 @@ class VocabParallelEmbedding(torch.nn.Module):
|
||||
def get_sharded_to_full_mapping(self) -> Optional[List[int]]:
|
||||
"""Get a mapping that can be used to reindex the gathered
|
||||
logits for sampling.
|
||||
|
||||
|
||||
During sampling, we gather logits from all ranks. The relationship
|
||||
of index->token_id will follow the same format as outlined in the class
|
||||
docstring. However, after the gather, we want to reindex the final
|
||||
@@ -483,4 +483,4 @@ class ParallelLMHead(VocabParallelEmbedding):
|
||||
|
||||
def forward(self, input_):
|
||||
del input_
|
||||
raise RuntimeError("LMHead's weights should be used in the sampler.")
|
||||
raise RuntimeError("LMHead's weights should be used in the sampler.")
|
||||
|
||||
@@ -838,7 +838,7 @@ class Scheduler:
|
||||
time_per_output_tokens_iter: List[float] = []
|
||||
|
||||
# Request stats
|
||||
# Decode
|
||||
# Decode
|
||||
gen_throughput: float = 0.0
|
||||
# Latency
|
||||
time_e2e_requests: List[float] = []
|
||||
@@ -866,11 +866,11 @@ class Scheduler:
|
||||
time_waiting_requests.append(req.queued_time - req.created_time)
|
||||
num_prompt_tokens_requests.append(len(req.origin_input_ids))
|
||||
num_generation_tokens_requests.append(len(req.output_ids))
|
||||
finished_reason_requests.append(
|
||||
finished_reason_requests.append(
|
||||
req.finished_reason.to_json()
|
||||
if req.finished_reason is not None
|
||||
else None)
|
||||
|
||||
|
||||
return Stats(
|
||||
new_seq=new_seq,
|
||||
num_running_req=num_running_req,
|
||||
|
||||
@@ -384,7 +384,7 @@ class TokenizerManager:
|
||||
obj.load_format = self.server_args.load_format
|
||||
|
||||
if not self.model_update_lock.locked():
|
||||
|
||||
|
||||
async with self.model_update_lock:
|
||||
# wait for the previous generation requests to finish
|
||||
while len(self.rid_to_state) > 0:
|
||||
|
||||
@@ -151,7 +151,7 @@ class Metrics:
|
||||
0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.04, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
|
||||
1.0, 2.5
|
||||
])
|
||||
|
||||
|
||||
# Request Stats
|
||||
# Metadata
|
||||
self.num_prompt_tokens_requests = Histogram(
|
||||
@@ -253,7 +253,7 @@ class PrometheusMetricsCollector(MetricsCollector):
|
||||
stats.time_to_first_tokens_iter)
|
||||
self._log_histogram(self.metrics.histogram_time_per_output_token,
|
||||
stats.time_per_output_tokens_iter)
|
||||
|
||||
|
||||
# self._log_gauge(self.metrics.gpu_cache_usage_sys, stats.gpu_cache_usage_sys)
|
||||
self._log_gauge(self.metrics.num_running_sys, stats.num_running_req)
|
||||
self._log_gauge(self.metrics.num_waiting_sys, stats.num_waiting_req)
|
||||
@@ -294,4 +294,4 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
|
||||
buckets.append(value)
|
||||
else:
|
||||
return buckets
|
||||
exponent += 1
|
||||
exponent += 1
|
||||
|
||||
@@ -54,4 +54,4 @@ class Stats:
|
||||
num_prompt_tokens_iter: int = 0
|
||||
num_generation_tokens_iter: int = 0
|
||||
time_to_first_tokens_iter: List[float] = field(default_factory=list)
|
||||
time_per_output_tokens_iter: List[float] = field(default_factory=list)
|
||||
time_per_output_tokens_iter: List[float] = field(default_factory=list)
|
||||
|
||||
@@ -17,7 +17,7 @@ limitations under the License.
|
||||
"""
|
||||
Utilities for multi-modal models.
|
||||
|
||||
This python file mainly contains utilities that were used in the
|
||||
This python file mainly contains utilities that were used in the
|
||||
image processing logic of llava-next including operations such as
|
||||
anyres and anyres_max
|
||||
|
||||
|
||||
@@ -136,7 +136,7 @@ class GPT2Block(nn.Module):
|
||||
layer_id: int,
|
||||
config: GPT2Config,
|
||||
cache_config = None,
|
||||
|
||||
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "",
|
||||
):
|
||||
@@ -284,4 +284,4 @@ class GPT2LMHeadModel(nn.Module):
|
||||
default_weight_loader)
|
||||
weight_loader(param, loaded_weight)
|
||||
|
||||
EntryClass = GPT2LMHeadModel
|
||||
EntryClass = GPT2LMHeadModel
|
||||
|
||||
0
python/sglang/srt/models/olmo.py
Executable file → Normal file
0
python/sglang/srt/models/olmo.py
Executable file → Normal file
@@ -57,27 +57,27 @@ logger = init_logger(__name__)
|
||||
|
||||
class Qwen2VLImageInputs(TypedDict):
|
||||
pixel_values: torch.Tensor
|
||||
"""Shape:
|
||||
"""Shape:
|
||||
`(num_patches, num_channels * patch_size * patch_size)`
|
||||
"""
|
||||
|
||||
image_grid_thw: torch.Tensor
|
||||
"""Shape: `(num_images, 3)`
|
||||
|
||||
|
||||
This should be in `(grid_t, grid_h, grid_w)` format.
|
||||
"""
|
||||
|
||||
|
||||
class Qwen2VLVideoInputs(TypedDict):
|
||||
pixel_values_videos: torch.Tensor
|
||||
"""Shape:
|
||||
`(num_patches,
|
||||
"""Shape:
|
||||
`(num_patches,
|
||||
num_channels * temporal_patch_size * patch_size * patch_size)`
|
||||
"""
|
||||
|
||||
video_grid_thw: torch.Tensor
|
||||
"""Shape: `(num_videos, 3)`
|
||||
|
||||
|
||||
This should be in `(grid_t, grid_h, grid_w)` format.
|
||||
"""
|
||||
|
||||
|
||||
@@ -759,7 +759,7 @@ class Engine:
|
||||
|
||||
# before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
|
||||
atexit.register(self.shutdown)
|
||||
|
||||
|
||||
# runtime server default log level is log
|
||||
# offline engine works in scripts, so we set it to error
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -320,7 +320,7 @@ jinja_env = jinja2.Environment(
|
||||
_message_template = """
|
||||
<div class="message {{ role }}">
|
||||
<div class="role">
|
||||
{{ role }}
|
||||
{{ role }}
|
||||
{% if variant %}<span class="variant">({{ variant }})</span>{% endif %}
|
||||
</div>
|
||||
<div class="content">
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
|
||||
"""
|
||||
HumanEval: Evaluating Large Language Models Trained on Code
|
||||
Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba
|
||||
https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
|
||||
Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba
|
||||
https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
|
||||
"""
|
||||
|
||||
import random
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
# Adapted from https://github.com/openai/simple-evals/
|
||||
|
||||
"""
|
||||
MGSM: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems.
|
||||
MGSM: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems.
|
||||
Language Models are Multilingual Chain-of-Thought Reasoners
|
||||
Freda Shi, Mirac Suzgun, Markus Freitag, Xuezhi Wang, Suraj Srivats, Soroush Vosoughi, Hyung Won Chung, Yi Tay, Sebastian Ruder, Denny Zhou, Dipanjan Das, Jason Wei
|
||||
https://arxiv.org/abs/2210.03057 reference: https://github.com/google-research/url-nlp
|
||||
https://arxiv.org/abs/2210.03057 reference: https://github.com/google-research/url-nlp
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
@@ -22,4 +22,4 @@ rand = "0.8.5"
|
||||
reqwest = { version = "0.12.8", features = ["stream"] }
|
||||
futures-util = "0.3"
|
||||
serde_json = "=1.0.1"
|
||||
pyo3 = { version = "0.22.5", features = ["extension-module"] }
|
||||
pyo3 = { version = "0.22.5", features = ["extension-module"] }
|
||||
|
||||
@@ -36,7 +36,7 @@ Usage: router [OPTIONS]
|
||||
Options:
|
||||
--host <HOST> [default: 127.0.0.1]
|
||||
--port <PORT> [default: 3001]
|
||||
--worker-urls <WORKER_URLS>
|
||||
--worker-urls <WORKER_URLS>
|
||||
--policy <POLICY> [default: round_robin] [possible values: round_robin, random]
|
||||
-h, --help Print help
|
||||
-V, --version Print version
|
||||
@@ -82,11 +82,11 @@ $ pip install <path to wheel>
|
||||
1. Run test
|
||||
|
||||
```
|
||||
$ cargo test
|
||||
$ cargo test
|
||||
```
|
||||
|
||||
2. Run lint
|
||||
|
||||
```
|
||||
$ cargo fmt
|
||||
```
|
||||
```
|
||||
|
||||
@@ -7,9 +7,9 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
|
||||
|
||||
|
||||
# Follow the installation prompts, then reload your shell
|
||||
. "$HOME/.cargo/env"
|
||||
. "$HOME/.cargo/env"
|
||||
source $HOME/.cargo/env
|
||||
|
||||
# Verify installation
|
||||
rustc --version
|
||||
cargo --version
|
||||
cargo --version
|
||||
|
||||
@@ -10,7 +10,7 @@ HF_TOKEN = "..."
|
||||
|
||||
prompt = """
|
||||
### Instruction:
|
||||
Write a poem about the transformers Python library.
|
||||
Write a poem about the transformers Python library.
|
||||
Mention the word "large language models" in that poem.
|
||||
### Response:
|
||||
The Transformers are large language models,
|
||||
|
||||
@@ -5,7 +5,7 @@ MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
|
||||
ADAPTER = "/home/ying/test_lora"
|
||||
prompt = """
|
||||
### Instruction:
|
||||
Write a poem about the transformers Python library.
|
||||
Write a poem about the transformers Python library.
|
||||
Mention the word "large language models" in that poem.
|
||||
### Response:
|
||||
The Transformers are large language models,
|
||||
|
||||
5
scripts/version_branch_to_tag.sh
Normal file → Executable file
5
scripts/version_branch_to_tag.sh
Normal file → Executable file
@@ -1,8 +1,8 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script is used for release.
|
||||
# It tags all remote branches starting with 'v' with the same name as the branch,
|
||||
# deletes the corresponding branches from the remote, and pushes the tags to the remote repository.
|
||||
# It tags all remote branches starting with 'v' with the same name as the branch,
|
||||
# deletes the corresponding branches from the remote, and pushes the tags to the remote repository.
|
||||
|
||||
git fetch origin --prune
|
||||
|
||||
@@ -27,4 +27,3 @@ done
|
||||
git push --tags
|
||||
|
||||
echo "All branches starting with 'v' have been tagged, deleted from remote, and pushed to the remote repository."
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Run Unit Tests
|
||||
|
||||
SGLang uses the built-in library [unittest](https://docs.python.org/3/library/unittest.html) as the testing framework.
|
||||
SGLang uses the built-in library [unittest](https://docs.python.org/3/library/unittest.html) as the testing framework.
|
||||
|
||||
## Test Backend Runtime
|
||||
```bash
|
||||
|
||||
File diff suppressed because one or more lines are too long
0
test/srt/models/test_generation_models.py
Executable file → Normal file
0
test/srt/models/test_generation_models.py
Executable file → Normal file
@@ -45,7 +45,7 @@ TORCH_DTYPES = [torch.float16]
|
||||
PROMPTS = [
|
||||
"""
|
||||
### Instruction:
|
||||
Write a poem about the transformers Python library.
|
||||
Write a poem about the transformers Python library.
|
||||
Mention the word "large language models" in that poem.
|
||||
### Response:
|
||||
The Transformers are large language models,
|
||||
|
||||
@@ -53,7 +53,7 @@ class TestDataParallelism(unittest.TestCase):
|
||||
|
||||
# pause a few seconds then send again
|
||||
time.sleep(5)
|
||||
|
||||
|
||||
response = requests.post(
|
||||
self.base_url + "/update_weights",
|
||||
json={"model_path": DEFAULT_MODEL_NAME_FOR_TEST},
|
||||
|
||||
@@ -11,9 +11,9 @@ from sglang.test.test_utils import (
|
||||
)
|
||||
|
||||
MANY_NEW_TOKENS_PROMPT = """
|
||||
Please write an extremely detailed and vivid fantasy story, set in a world full of intricate magic systems, political intrigue, and complex characters.
|
||||
Ensure that you thoroughly describe every scene, character's motivations, and the environment. Include long, engaging dialogues and elaborate on the inner thoughts of the characters.
|
||||
Each section should be as comprehensive as possible to create a rich and immersive experience for the reader.
|
||||
Please write an extremely detailed and vivid fantasy story, set in a world full of intricate magic systems, political intrigue, and complex characters.
|
||||
Ensure that you thoroughly describe every scene, character's motivations, and the environment. Include long, engaging dialogues and elaborate on the inner thoughts of the characters.
|
||||
Each section should be as comprehensive as possible to create a rich and immersive experience for the reader.
|
||||
The story should span multiple events, challenges, and character developments over time. Aim to make the story at least 3,000 words long.
|
||||
"""
|
||||
|
||||
@@ -109,7 +109,7 @@ class TestMatchedStop(unittest.TestCase):
|
||||
llama_format_prompt = """
|
||||
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
|
||||
You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
|
||||
|
||||
|
||||
What is 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
|
||||
"""
|
||||
eos_token_id = 128009
|
||||
|
||||
Reference in New Issue
Block a user