sglang v0.5.2 & support Qwen3-Next-80B-A3B-Instruct

Commit 118f1fc726 by maxiao1
2025-09-13 17:00:20 +08:00
2037 changed files with 515371 additions and 0 deletions

.clang-format-ignore Normal file

@@ -0,0 +1 @@
sgl-kernel/3rdparty/tensorrt_llm/*

.devcontainer/Dockerfile Normal file

@@ -0,0 +1,35 @@
FROM lmsysorg/sglang:dev
# Create non-root user with specified UID and GID
# NOTE: Replace with your own UID and GID. This is a workaround from https://github.com/microsoft/vscode-remote-release/issues/49#issuecomment-489060908.
ARG HOST_UID=1003
ARG HOST_GID=1003
RUN groupadd -g $HOST_GID devuser && \
useradd -m -u $HOST_UID -g $HOST_GID -s /bin/zsh devuser
# Give devuser sudo access
RUN apt-get update && apt-get install -y sudo && \
echo "devuser ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/devuser && \
rm -rf /var/lib/apt/lists/* && \
apt-get clean
# Set up oh-my-zsh for devuser
RUN cp -r /root/.oh-my-zsh /home/devuser/.oh-my-zsh && \
cp /root/.zshrc /home/devuser/.zshrc && \
cp /root/.vimrc /home/devuser/.vimrc && \
cp /root/.tmux.conf /home/devuser/.tmux.conf && \
sed -i 's|/root/.oh-my-zsh|/home/devuser/.oh-my-zsh|g' /home/devuser/.zshrc && \
chown -R devuser:devuser /home/devuser/
# Set workspace directory and ownership
WORKDIR /sgl-workspace/sglang
RUN chown -R devuser:devuser /sgl-workspace
# Switch to devuser
USER devuser
# Install uv
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
# Install rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
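
As the NOTE above says, the hard-coded HOST_UID/HOST_GID of 1003 should match your local user. A minimal local-build sketch (not part of this commit; the sglang-dev tag is an arbitrary example):

# Build the dev image with your own UID/GID so files created in the container stay owned by you.
docker build \
  --build-arg HOST_UID="$(id -u)" \
  --build-arg HOST_GID="$(id -g)" \
  -t sglang-dev \
  -f .devcontainer/Dockerfile .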


@@ -0,0 +1,30 @@
{
"name": "sglang",
"build": {
"dockerfile": "Dockerfile"
},
"remoteUser": "devuser",
"customizations": {
"vscode": {
"extensions": [
// Python development
"ms-python.python",
"charliermarsh.ruff",
// Rust development
"rust-lang.rust-analyzer",
"tamasfe.even-better-toml"
]
}
},
"forwardPorts": [],
"runArgs": [
"--gpus",
"all"
],
// The two lines below ensure that your local changes in the sglang
// repo are automatically synced to the sglang pip package installed
// in the dev docker container. You can remove / comment out these
// two lines if you prefer to sync code changes manually.
"workspaceMount": "source=${localWorkspaceFolder},target=/sgl-workspace/sglang,type=bind",
"workspaceFolder": "/sgl-workspace/sglang"
}

.editorconfig Normal file

@@ -0,0 +1,25 @@
# https://editorconfig.org/
root = true
[*]
charset = utf-8
end_of_line = lf
indent_style = space
indent_size = 4
trim_trailing_whitespace = true
insert_final_newline = true
[*.{json,yaml,yml}]
indent_size = 2
[*.md]
indent_size = 2
x-soft-wrap-text = true
[*.rst]
indent_size = 4
x-soft-wrap-text = true
[Makefile]
indent_style = tab

.github/CODEOWNERS vendored Normal file

@@ -0,0 +1,21 @@
.github @merrymercy @zhyncs
/docker @zhyncs @HaiShaw @ByronHsu
/python/pyproject.toml @merrymercy @zhyncs
/python/sglang/* @merrymercy @Ying1123 @zhyncs @hnyls2002
/python/sglang/srt/constrained @hnyls2002
/python/sglang/srt/disaggregation @ByronHsu @hnyls2002
/python/sglang/srt/disaggregation/mooncake @ShangmingCai
/python/sglang/srt/distributed @yizhang2077 @merrymercy
/python/sglang/srt/entrypoints @ispobock @CatherineSue @slin1237 @merrymercy
/python/sglang/srt/eplb @fzyzcjy
/python/sglang/srt/function_call @CatherineSue
/python/sglang/srt/layers @merrymercy @Ying1123 @zhyncs @ispobock @HaiShaw @ch-wan @BBuf @kushanam @Edwardf0t1
/python/sglang/srt/lora @Ying1123 @Fridge003 @lifuhuang
/python/sglang/srt/managers @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann
/python/sglang/srt/mem_cache @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann
/python/sglang/srt/model_executor @merrymercy @Ying1123 @hnyls2002 @zhyncs @ispobock
/python/sglang/srt/multimodal @mickqian @JustinTong0323
/python/sglang/srt/speculative @Ying1123 @merrymercy @rkooo567 @kssteven418
/sgl-kernel @zhyncs @ispobock @HandH1998 @BBuf @yizhang2077 @merrymercy @FlamingoPg @HaiShaw
/sgl-router @slin1237 @ByronHsu
/test/srt/test_modelopt* @Edwardf0t1

.github/ISSUE_TEMPLATE/1-bug-report.yml vendored Normal file

@@ -0,0 +1,38 @@
name: 🐞 Bug report
description: Create a report to help us reproduce and fix the bug
title: "[Bug] "
labels: ['Bug']
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: 1. I have searched related issues but cannot get the expected help.
- label: 2. The bug has not been fixed in the latest version.
- label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
- label: 4. If the issue you raised is not a bug but a question, please raise a discussion at https://github.com/sgl-project/sglang/discussions/new/choose. Otherwise, it will be closed.
- label: 5. Please use English, otherwise it will be closed.
- type: textarea
attributes:
label: Describe the bug
description: A clear and concise description of what the bug is.
validations:
required: true
- type: textarea
attributes:
label: Reproduction
description: |
What command or script did you run? Which **model** are you using?
placeholder: |
A placeholder for the command.
validations:
required: true
- type: textarea
attributes:
label: Environment
description: |
Please provide necessary environment information here with `python3 -m sglang.check_env`. Otherwise the issue will be closed.
placeholder: Environment here.
validations:
required: true


@@ -0,0 +1,23 @@
name: 🚀 Feature request
description: Suggest an idea for this project
title: "[Feature] "
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: 1. If the issue you raised is not a feature but a question, please raise a discussion at https://github.com/sgl-project/sglang/discussions/new/choose. Otherwise, it will be closed.
- label: 2. Please use English, otherwise it will be closed.
- type: textarea
attributes:
label: Motivation
description: |
A clear and concise description of the motivation for the feature.
validations:
required: true
- type: textarea
attributes:
label: Related resources
description: |
If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful.

.github/REVIEWERS.md vendored Normal file

@@ -0,0 +1,53 @@
# Area Reviewers
Here are some reviewers for common areas. You can ping them to review your code if you touch related parts.
## Hardware platforms
- general @Alcanderian
- AMD GPU @HaiShaw
- Blackwell GPU @kushanam @trevor-m @zhyncs
- CPU @mingfeima
## Kernel
- general @zhyncs @ispobock @HandH1998 @BBuf @yizhang2077 @HaiShaw
- triton attention backend @ispobock
- aiter attention backend @HaiShaw @kkHuang-amd @valarLip
- flash attention backend @hebiao064
- flashinfer attention backend @Fridge003
- moe kernel @BBuf @fzyzcjy @ch-wan @Alcanderian
## Scheduler and memory pool
- general @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann
- constrained decoding @hnyls2002
- hierarchical cache @xiezhq-hermann @DarkSharpness
- lora @Fridge003 @Ying1123 @lifuhuang
- speculative decoding @merrymercy @Ying1123 @kssteven418 @Qiaolin-Yu
- sliding window attention @hanming-lu
## Parallelism
- expert parallelism @fzyzcjy @ch-wan
- data parallelism attention @ch-wan
- pipeline parallelism @Ying1123
- tensor parallelism @merrymercy
## PD disaggregation
- general @ByronHsu @ShangmingCai @hnyls2002
- Mooncake backend @ShangmingCai
## Build and release
- general @zhyncs @merrymercy
## API Server
- general @CatherineSue @slin1237 @ispobock
- function calling and reasoning parsing @CatherineSue
- OpenAI API @CatherineSue @slin1237
## SGL-Router
- general @slin1237 @ByronHsu
## Model
- multimodal models @mickqian @JustinTong0323
- other new models @zhaochenyang20
## Reinforcement learning
- general @zhaochenyang20 @hebiao064 @fzyzcjy @zhuzilin

.github/pull_request_template.md vendored Normal file

@@ -0,0 +1,24 @@
<!-- Thank you for your contribution! Please follow these guidelines to enhance your pull request. If anything is unclear, submit your PR and reach out to maintainers for assistance. Join our Slack community at https://slack.sglang.ai to discuss further. -->
## Motivation
<!-- Describe the purpose and goals of this pull request. -->
## Modifications
<!-- Detail the changes made in this pull request. -->
## Accuracy Tests
<!-- If this pull request affects model outputs (e.g., changes to the kernel or model forward code), provide accuracy test results. -->
## Benchmarking and Profiling
<!-- If this pull request impacts inference speed, provide benchmarking and profiling results. -->
## Checklist
- [ ] Format your code according to the [Format code with pre-commit](https://docs.sglang.ai/developer_guide/contribution_guide.html#format-code-with-pre-commit).
- [ ] Add unit tests according to the [Run and add unit tests](https://docs.sglang.ai/developer_guide/contribution_guide.html#run-and-add-unit-tests).
- [ ] Update documentation according to [Write documentations](https://docs.sglang.ai/developer_guide/contribution_guide.html#write-documentations).
- [ ] Provide accuracy and speed benchmark results according to [Test the accuracy](https://docs.sglang.ai/developer_guide/contribution_guide.html#test-the-accuracy) and [Benchmark the speed](https://docs.sglang.ai/developer_guide/contribution_guide.html#benchmark-the-speed).


@@ -0,0 +1,45 @@
name: Cancel All Pending PR Test Runs
on:
workflow_dispatch:
inputs:
workflows:
description: 'Space-separated list of workflow filenames to cancel'
required: true
type: string
default: 'pr-test.yml pr-test-xeon.yml'
permissions:
actions: write # Needed to cancel runs
contents: read # Needed to read repo info
jobs:
cancel-pending:
runs-on: ubuntu-latest
steps:
- name: Install GitHub CLI
run: sudo apt-get install -y gh jq
- name: Cancel all pending/waiting runs for specified workflows
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO: ${{ github.repository }}
run: |
# Read the space-separated string from the input into a bash array
WORKFLOW_FILES=(${{ github.event.inputs.workflows }})
echo "Targeting ${#WORKFLOW_FILES[@]} workflow(s): ${{ github.event.inputs.workflows }}"
for workflow_file in "${WORKFLOW_FILES[@]}"; do
echo "--- Checking workflow: $workflow_file ---"
gh run list \
--repo "$REPO" \
--workflow "$workflow_file" \
--json databaseId,status \
--limit 1000 \
| jq -r '.[] | select(.status=="queued" or .status=="in_progress") | .databaseId' \
| while read run_id; do
echo "Cancelling run ID: $run_id for workflow: $workflow_file"
gh run cancel "$run_id" --repo "$REPO"
done
done
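
Because this workflow only has a workflow_dispatch trigger, it must be started manually. One way to do that from the command line is with the GitHub CLI, assuming the file lives under the conventional .github/workflows/ path and your token has actions write permission:

# Dispatch the cancellation workflow with a custom space-separated list of target workflows.
gh workflow run "Cancel All Pending PR Test Runs" \
  --repo sgl-project/sglang \
  -f workflows='pr-test.yml pr-test-xeon.yml'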


@@ -0,0 +1,22 @@
name: Cancel PR Workflows on Merge
on:
pull_request_target:
types:
- closed
permissions:
actions: write
jobs:
cancel:
if: github.event.pull_request.merged == true
runs-on: ubuntu-latest
steps:
- name: Cancel Previous Runs
uses: styfle/cancel-workflow-action@0.12.1
with:
workflow_id: all
access_token: ${{ secrets.GITHUB_TOKEN }}
ignore_sha: true
pr_number: ${{ github.event.pull_request.number }}


@@ -0,0 +1,96 @@
name: Close Inactive Issues
on:
schedule:
- cron: '0 0 * * *'
workflow_dispatch:
permissions:
issues: write
contents: read
jobs:
close-inactive-issues:
if: github.repository == 'sgl-project/sglang'
runs-on: ubuntu-latest
steps:
- name: Check and close inactive issues
uses: actions/github-script@v6
with:
github-token: ${{secrets.GITHUB_TOKEN}}
script: |
const sixtyDaysAgo = new Date(Date.now() - 60 * 24 * 60 * 60 * 1000);
const [owner, repo] = process.env.GITHUB_REPOSITORY.split('/');
console.log(`Owner: ${owner}, Repo: ${repo}`);
async function fetchIssues(page = 1) {
console.log(`Fetching issues for ${owner}/${repo}, page ${page}`);
return await github.rest.issues.listForRepo({
owner,
repo,
state: 'open',
sort: 'updated',
direction: 'asc',
per_page: 100,
page: page
});
}
async function processIssues() {
console.log('Starting to process issues');
console.log(`Repository: ${owner}/${repo}`);
let page = 1;
let hasMoreIssues = true;
while (hasMoreIssues) {
try {
const issues = await fetchIssues(page);
console.log(`Fetched ${issues.data.length} issues on page ${page}`);
if (issues.data.length === 0) {
hasMoreIssues = false;
break;
}
for (const issue of issues.data) {
// Skip if the issue has 'good first issue' label
if (issue.labels.some(label => label.name === 'good first issue')) {
console.log(`Skipping issue #${issue.number} as it's marked as 'good first issue'`);
continue;
}
if (new Date(issue.updated_at) < sixtyDaysAgo) {
try {
await github.rest.issues.update({
owner,
repo,
issue_number: issue.number,
state: 'closed',
labels: [...issue.labels.map(l => l.name), 'inactive']
});
await github.rest.issues.createComment({
owner,
repo,
issue_number: issue.number,
body: 'This issue has been automatically closed due to inactivity. Please feel free to reopen it if needed.'
});
console.log(`Closed issue #${issue.number} due to inactivity.`);
} catch (error) {
console.error(`Failed to close issue #${issue.number}: ${error.message}`);
}
} else {
console.log(`Issue #${issue.number} is still active. Stopping processing.`);
hasMoreIssues = false;
break;
}
}
page += 1;
} catch (error) {
console.error(`Error fetching issues on page ${page}: ${error.message}`);
hasMoreIssues = false;
}
}
console.log('Finished processing issues');
}
await processIssues();

.github/workflows/execute-notebook.yml vendored Normal file

@@ -0,0 +1,60 @@
name: Execute Notebooks
on:
pull_request:
branches: [ main ]
paths:
- "python/sglang/**"
- "docs/**"
workflow_dispatch:
concurrency:
group: execute-notebook-${{ github.ref }}
cancel-in-progress: true
jobs:
run-all-notebooks:
runs-on: 1-gpu-runner
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
pip install -r docs/requirements.txt
apt-get update && apt-get install -y pandoc parallel retry
ln -sf "$(which python3)" /usr/bin/python
- name: Setup Jupyter Kernel
run: |
python -m ipykernel install --user --name python3 --display-name "Python 3"
- name: Execute notebooks
timeout-minutes: 40
run: |
cd docs
make clean
make compile
notebook-finish:
needs: [
run-all-notebooks
]
runs-on: ubuntu-latest
steps:
- name: Check all dependent job statuses
run: |
results=(${{ join(needs.*.result, ' ') }})
for result in "${results[@]}"; do
if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
echo "Job failed with result: $result"
exit 1
fi
done
echo "All jobs completed successfully"
exit 0
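
The notebook job boils down to a handful of commands that can also be run locally. A rough sketch, assuming scripts/ci/ci_install_dependency.sh has already been run and a GPU is available:

# Reproduce the CI notebook execution locally (pandoc, parallel, and retry must be installed separately).
pip install -r docs/requirements.txt
python -m ipykernel install --user --name python3 --display-name "Python 3"
cd docs
make clean
make compile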

.github/workflows/experiment-runner.yml vendored Normal file

@@ -0,0 +1,30 @@
name: Experiment Runner
on:
workflow_dispatch:
inputs:
script:
description: "Experiment Runner Script"
default: "configs/sharegpt_config.yaml"
concurrency:
group: experiment-runner-${{ github.ref }}
cancel-in-progress: true
jobs:
experiment-runner-1-gpu:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: 1-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
- name: Test experiment runner
timeout-minutes: 120
run: |
cd test/srt
python3 experiment_runner.py --config ${{ inputs.script }}
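
The job simply forwards the script input to experiment_runner.py, so a local run with the workflow's default config is roughly:

# Local equivalent of the CI step, using the default config path from the workflow input.
cd test/srt
python3 experiment_runner.py --config configs/sharegpt_config.yaml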

.github/workflows/lint.yml vendored Normal file

@@ -0,0 +1,22 @@
name: Lint
on: [pull_request]
jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Install pre-commit hook
run: |
python -m pip install pre-commit
pre-commit install
- name: Linting
run: pre-commit run --all-files --show-diff-on-failure
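
The lint job is a thin wrapper around pre-commit, so the same checks can be run locally before opening a PR:

# Run the same pre-commit checks locally that the Lint workflow runs in CI.
python -m pip install pre-commit
pre-commit install
pre-commit run --all-files --show-diff-on-failure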

.github/workflows/nightly-test-amd.yml vendored Normal file

@@ -0,0 +1,41 @@
name: Nightly Test (AMD)
on:
schedule:
- cron: '0 0 * * *'
push:
branches:
- main
paths:
- "python/sglang/version.py"
workflow_dispatch:
concurrency:
group: nightly-test-amd-${{ github.ref }}
cancel-in-progress: true
jobs:
nightly-test:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
strategy:
matrix:
runner: [linux-mi300-gpu-2, linux-mi325-gpu-2-nightly]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup docker
run: |
touch github_summary.md
bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Nightly Test
run: |
bash scripts/ci/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY

.github/workflows/nightly-test.yml vendored Normal file

@@ -0,0 +1,33 @@
name: Nightly Test
on:
schedule:
- cron: '0 0 * * *'
push:
branches:
- main
paths:
- "python/sglang/version.py"
workflow_dispatch:
concurrency:
group: nightly-test-${{ github.ref }}
cancel-in-progress: true
jobs:
nightly-test:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: 2-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 120
run: |
cd test/srt
python3 run_suite.py --suite nightly --timeout-per-file 3600


@@ -0,0 +1,28 @@
name: Open A PR to Copy Code From OSS
on:
workflow_dispatch:
# schedule:
# - cron: '0 10 * * *'
permissions:
contents: write
jobs:
copy:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: 'main'
- name: Install GitHub CLI (if not present)
run: |
bash scripts/code_sync/install_github_cli.sh
- name: Copy from OSS code
env:
GH_TOKEN: ${{ secrets.PAT_FOR_CODE_SYNC_FROM_LIANMIN }}
run: |
python3 scripts/code_sync/copy_from_oss.py


@@ -0,0 +1,31 @@
name: Open A PR to Copy Diff To OSS
on:
workflow_dispatch:
inputs:
commit_sha:
description: 'The commit SHA to copy. Defaults to LAST to copy the latest commit.'
required: false
default: 'LAST'
permissions:
contents: write
jobs:
copy:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install GitHub CLI (if not present)
run: |
bash scripts/code_sync/install_github_cli.sh
- name: Copy to OSS code
env:
GH_TOKEN: ${{ secrets.PAT_FOR_CODE_SYNC_FROM_LIANMIN }}
run: |
python3 scripts/code_sync/copy_to_oss.py --commit ${{ github.event.inputs.commit_sha }}

.github/workflows/pr-benchmark-rust.yml vendored Normal file

@@ -0,0 +1,306 @@
name: PR Benchmark (Rust Router)
on:
push:
branches: [ main ]
paths:
- "sgl-router/**"
pull_request:
branches: [ main ]
paths:
- "sgl-router/**"
types: [opened, synchronize, reopened, labeled]
workflow_dispatch:
concurrency:
group: pr-benchmark-rust-${{ github.ref }}
cancel-in-progress: true
permissions:
contents: read
pull-requests: write
issues: write
jobs:
# Quick check job that always runs on PRs
benchmark-compile-check:
name: Benchmark Compilation Check
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_rust.sh
- name: Setup sccache
uses: mozilla-actions/sccache-action@v0.0.3
continue-on-error: true
- name: Rust cache
uses: Swatinem/rust-cache@v2
with:
workspaces: sgl-router
# Share cache across all benchmark jobs
shared-key: "rust-cache"
# Save cache even on failure
save-if: true
- name: Check benchmarks compile
run: |
source "$HOME/.cargo/env"
cd sgl-router/
# Try to use sccache, but disable if it fails
if command -v sccache &> /dev/null; then
echo "Testing sccache availability..."
# Try to start sccache and check if it works
export RUSTC_WRAPPER=sccache
export SCCACHE_GHA_ENABLED="true"
if sccache --start-server 2>/dev/null && sccache --show-stats 2>/dev/null; then
echo "sccache is working, using it for compilation"
else
echo "sccache failed to start, falling back to regular cargo"
unset RUSTC_WRAPPER
unset SCCACHE_GHA_ENABLED
fi
else
echo "sccache not available, using regular cargo"
fi
cargo check --benches
# Full benchmark jobs that only run with label or on main branch
benchmark-request-processing:
name: Request Processing Benchmark
if: |
github.repository == 'sgl-project/sglang' &&
(github.event_name == 'push' ||
github.event_name == 'workflow_dispatch' ||
contains(github.event.pull_request.labels.*.name, 'benchmark'))
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
# Fetch enough history for baseline comparison
fetch-depth: 100
- name: Install dependencies
run: |
bash scripts/ci/ci_install_rust.sh
- name: Setup sccache
uses: mozilla-actions/sccache-action@v0.0.3
continue-on-error: true
- name: Rust cache
uses: Swatinem/rust-cache@v2
with:
workspaces: sgl-router
# Share cache across all benchmark jobs
shared-key: "rust-cache"
# Save cache even on failure
save-if: true
- name: Run request processing benchmark
timeout-minutes: 30
run: |
source "$HOME/.cargo/env"
cd sgl-router/
# Try to use sccache, but disable if it fails
if command -v sccache &> /dev/null; then
echo "Testing sccache availability..."
# Try to start sccache and check if it works
export RUSTC_WRAPPER=sccache
export SCCACHE_GHA_ENABLED="true"
if sccache --start-server 2>/dev/null && sccache --show-stats 2>/dev/null; then
echo "sccache is working, using it for compilation"
else
echo "sccache failed to start, falling back to regular cargo"
unset RUSTC_WRAPPER
unset SCCACHE_GHA_ENABLED
fi
else
echo "sccache not available, using regular cargo"
fi
# Run only the summary benchmark for quick validation in PRs
cargo bench --bench request_processing -- benchmark_summary --exact
- name: Upload benchmark results
if: always()
uses: actions/upload-artifact@v4
with:
name: request-processing-results-${{ github.sha }}
path: |
sgl-router/target/criterion/benchmark_summary/
retention-days: 30
benchmark-tokenizer:
name: Tokenizer Benchmark
if: |
github.repository == 'sgl-project/sglang' &&
(github.event_name == 'push' ||
github.event_name == 'workflow_dispatch' ||
contains(github.event.pull_request.labels.*.name, 'benchmark'))
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 100
- name: Install dependencies
run: |
bash scripts/ci/ci_install_rust.sh
- name: Setup sccache
uses: mozilla-actions/sccache-action@v0.0.3
continue-on-error: true
- name: Rust cache
uses: Swatinem/rust-cache@v2
with:
workspaces: sgl-router
# Share cache across all benchmark jobs
shared-key: "rust-cache"
# Save cache even on failure
save-if: true
- name: Run tokenizer benchmark
timeout-minutes: 30
run: |
source "$HOME/.cargo/env"
cd sgl-router/
# Try to use sccache, but disable if it fails
if command -v sccache &> /dev/null; then
echo "Testing sccache availability..."
# Try to start sccache and check if it works
export RUSTC_WRAPPER=sccache
export SCCACHE_GHA_ENABLED="true"
if sccache --start-server 2>/dev/null && sccache --show-stats 2>/dev/null; then
echo "sccache is working, using it for compilation"
else
echo "sccache failed to start, falling back to regular cargo"
unset RUSTC_WRAPPER
unset SCCACHE_GHA_ENABLED
fi
else
echo "sccache not available, using regular cargo"
fi
cargo bench --bench tokenizer_benchmark
- name: Upload benchmark results
if: always()
uses: actions/upload-artifact@v4
with:
name: tokenizer-results-${{ github.sha }}
path: |
sgl-router/target/criterion/tokenizer*/
retention-days: 30
benchmark-tool-parser:
name: Tool Parser Benchmark
if: |
github.repository == 'sgl-project/sglang' &&
(github.event_name == 'push' ||
github.event_name == 'workflow_dispatch' ||
contains(github.event.pull_request.labels.*.name, 'benchmark'))
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 100
- name: Install dependencies
run: |
bash scripts/ci/ci_install_rust.sh
- name: Setup sccache
uses: mozilla-actions/sccache-action@v0.0.3
continue-on-error: true
- name: Rust cache
uses: Swatinem/rust-cache@v2
with:
workspaces: sgl-router
# Share cache across all benchmark jobs
shared-key: "rust-cache"
# Save cache even on failure
save-if: true
- name: Run tool parser benchmark
timeout-minutes: 30
run: |
source "$HOME/.cargo/env"
cd sgl-router/
# Try to use sccache, but disable if it fails
if command -v sccache &> /dev/null; then
echo "Testing sccache availability..."
# Try to start sccache and check if it works
export RUSTC_WRAPPER=sccache
export SCCACHE_GHA_ENABLED="true"
if sccache --start-server 2>/dev/null && sccache --show-stats 2>/dev/null; then
echo "sccache is working, using it for compilation"
else
echo "sccache failed to start, falling back to regular cargo"
unset RUSTC_WRAPPER
unset SCCACHE_GHA_ENABLED
fi
else
echo "sccache not available, using regular cargo"
fi
cargo bench --bench tool_parser_benchmark
- name: Upload benchmark results
if: always()
uses: actions/upload-artifact@v4
with:
name: tool-parser-results-${{ github.sha }}
path: |
sgl-router/target/criterion/tool_parser*/
retention-days: 30
benchmark-summary:
name: Benchmark Summary
needs: [benchmark-request-processing, benchmark-tokenizer, benchmark-tool-parser]
if: always() && (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
runs-on: ubuntu-latest
steps:
- name: Download all benchmark results
uses: actions/download-artifact@v4
with:
pattern: '*-results-${{ github.sha }}'
path: benchmark-results
- name: Generate summary
run: |
echo "## Benchmark Results Summary" > summary.md
echo "" >> summary.md
echo "### Request Processing" >> summary.md
if [ -d "benchmark-results/request-processing-results-${{ github.sha }}" ]; then
echo "✅ Completed" >> summary.md
else
echo "❌ Failed or skipped" >> summary.md
fi
echo "" >> summary.md
echo "### Tokenizer" >> summary.md
if [ -d "benchmark-results/tokenizer-results-${{ github.sha }}" ]; then
echo "✅ Completed" >> summary.md
else
echo "❌ Failed or skipped" >> summary.md
fi
echo "" >> summary.md
echo "### Tool Parser" >> summary.md
if [ -d "benchmark-results/tool-parser-results-${{ github.sha }}" ]; then
echo "✅ Completed" >> summary.md
else
echo "❌ Failed or skipped" >> summary.md
fi
cat summary.md
- name: Upload summary
uses: actions/upload-artifact@v4
with:
name: benchmark-summary-${{ github.sha }}
path: summary.md
retention-days: 30
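
Each benchmark job ultimately reduces to a cargo invocation inside sgl-router, so the PR-level checks can be reproduced locally without sccache (cargo simply compiles without the wrapper). A sketch:

# Reproduce the quick compile check and the summary benchmark that the PR jobs run.
cd sgl-router
cargo check --benches
cargo bench --bench request_processing -- benchmark_summary --exact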

.github/workflows/pr-test-amd.yml vendored Normal file

@@ -0,0 +1,377 @@
name: PR Test (AMD)
on:
push:
branches: [ main ]
paths:
- "python/**"
- "scripts/ci/**"
- "test/**"
- "sgl-kernel/**"
- ".github/workflows/pr-test-amd.yml"
pull_request:
branches: [ main ]
paths:
- "python/**"
- "scripts/ci/**"
- "test/**"
- "sgl-kernel/**"
- ".github/workflows/pr-test-amd.yml"
workflow_dispatch:
concurrency:
group: pr-test-amd-${{ github.ref }}
cancel-in-progress: true
jobs:
accuracy-test-1-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
fail-fast: false
matrix:
runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Evaluate Accuracy
timeout-minutes: 30
run: |
bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py
bash scripts/ci/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py
bash scripts/ci/amd_ci_exec.sh python3 models/test_qwen_models.py
accuracy-test-2-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
fail-fast: false
matrix:
runner: [linux-mi300-gpu-2, linux-mi325-gpu-2, linux-mi35x-gpu-2]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Evaluate accuracy (TP=2)
timeout-minutes: 60
run: |
bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_moe_eval_accuracy_large.py
mla-test-1-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
fail-fast: false
matrix:
runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: MLA TEST
timeout-minutes: 30
run: |
bash scripts/ci/amd_ci_exec.sh python3 test_mla.py
performance-test-1-gpu-part-1-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
fail-fast: false
matrix:
runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Benchmark single latency
timeout-minutes: 20
run: |
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
- name: Benchmark online latency
timeout-minutes: 15
run: |
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
- name: Benchmark offline throughput
timeout-minutes: 15
run: |
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
- name: Benchmark offline throughput (Non-streaming, small batch size)
timeout-minutes: 15
run: |
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
performance-test-1-gpu-part-2-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
fail-fast: false
matrix:
runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Benchmark offline throughput (w/o RadixAttention)
timeout-minutes: 15
run: |
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
- name: Benchmark offline throughput (w/ Triton)
timeout-minutes: 15
run: |
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
- name: Benchmark offline throughput (w/ FP8)
timeout-minutes: 15
run: |
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
bench-test-2-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
fail-fast: false
matrix:
runner: [linux-mi300-gpu-2, linux-mi325-gpu-2]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Benchmark dummy grok (TP=2)
timeout-minutes: 30
run: |
bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
- name: Benchmark single latency (TP=2)
timeout-minutes: 25
run: |
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
- name: Benchmark single latency + torch.compile (TP=2)
timeout-minutes: 25
run: |
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
- name: Benchmark offline throughput (TP=2)
timeout-minutes: 25
run: |
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
- name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
timeout-minutes: 25
run: |
bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
unit-test-backend-1-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
fail-fast: false
matrix:
runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
part: [0, 1, 2, 3, 4, 5, 6, 7]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 50
run: |
bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 8
unit-test-backend-1-gpu-amd-mi35x:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
fail-fast: false
matrix:
runner: [linux-mi35x-gpu-1]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 50
run: |
bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd-mi35x
unit-test-backend-2-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
fail-fast: false
matrix:
runner: [linux-mi300-gpu-2, linux-mi325-gpu-2]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 40
run: |
bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd
unit-test-backend-8-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
fail-fast: false
matrix:
runner: [linux-mi300-gpu-8]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 60
run: |
bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
unit-test-sgl-kernel-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
fail-fast: false
matrix:
runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Start CI container
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: |
bash scripts/ci/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 14
run: |
docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_apply_token_bitmask_inplace.py
docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_activation.py
docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_kvcacheio.py
pr-test-amd-finish:
if: always()
needs: [
accuracy-test-1-gpu-amd, mla-test-1-gpu-amd, bench-test-2-gpu-amd,
accuracy-test-2-gpu-amd, performance-test-1-gpu-part-1-amd, performance-test-1-gpu-part-2-amd,
unit-test-backend-1-gpu-amd, unit-test-backend-1-gpu-amd-mi35x, unit-test-backend-2-gpu-amd,
unit-test-backend-8-gpu-amd, unit-test-sgl-kernel-amd
]
runs-on: ubuntu-latest
steps:
- name: Check all dependent job statuses
run: |
results=(${{ join(needs.*.result, ' ') }})
for result in "${results[@]}"; do
if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
echo "Job failed with result: $result"
exit 1
fi
done
echo "All jobs completed successfully"
exit 0

.github/workflows/pr-test-h20.yml vendored Normal file

@@ -0,0 +1,81 @@
name: PR Test (H20)
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
workflow_dispatch:
inputs:
version:
required: true
type: choice
default: 'release'
options:
- 'release'
- 'nightly'
concurrency:
group: pr-test-h20-${{ github.ref }}
cancel-in-progress: true
jobs:
check-changes:
runs-on: ubuntu-latest
outputs:
src: ${{ steps.filter.outputs.src }}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Detect file changes
id: filter
uses: dorny/paths-filter@v3
with:
filters: |
src:
- "python/sglang/srt/models/deepseek*"
- "python/sglang/srt/layers/moe/**"
- ".github/workflows/pr-test-h20.yml"
- "python/pyproject.toml"
per-commit-8-gpu-h20:
needs: [check-changes]
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false &&
needs.check-changes.outputs.src == 'true'
runs-on: 8-gpu-h20
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 20
run: |
cd test/srt
python3 run_suite.py --suite per-commit-8-gpu-h20
pr-test-h20-finish:
needs: [
check-changes,
per-commit-8-gpu-h20,
]
if: needs.check-changes.outputs.src == 'true'
runs-on: ubuntu-latest
steps:
- name: Check all dependent job statuses
run: |
results=(${{ join(needs.*.result, ' ') }})
for result in "${results[@]}"; do
if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
echo "Job failed with result: $result"
exit 1
fi
done
echo "All jobs completed successfully"
exit 0

.github/workflows/pr-test-npu.yml vendored Normal file

@@ -0,0 +1,184 @@
name: PR Test (Ascend NPU)
on:
push:
branches: [ main ]
paths:
- "python/**"
- "scripts/ci/**"
- "test/**"
- ".github/workflows/pr-test-npu.yml"
pull_request:
branches: [ main ]
paths:
- "python/**"
- "scripts/ci/**"
- "test/**"
- ".github/workflows/pr-test-npu.yml"
workflow_dispatch:
concurrency:
group: pr-test-npu-${{ github.ref }}
cancel-in-progress: true
jobs:
per-commit-1-ascend-npu:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
runs-on: linux-arm64-npu-1
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
# speed up by using infra cache services
CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
pip config set global.index-url http://${CACHING_URL}/pypi/simple
pip config set global.trusted-host ${CACHING_URL}
bash scripts/ci/npu_ci_install_dependency.sh
# copy required file from our daily cache
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
# download the test set through a proxy
curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
- name: Run test
timeout-minutes: 60
env:
SGLANG_USE_MODELSCOPE: true
SGLANG_IS_IN_CI: true
HF_ENDPOINT: https://hf-mirror.com
TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
run: |
cd test/srt
python3 run_suite.py --suite per-commit-1-ascend-npu
per-commit-2-ascend-npu:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
runs-on: linux-arm64-npu-2
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
# speed up by using infra cache services
CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
pip config set global.index-url http://${CACHING_URL}/pypi/simple
pip config set global.trusted-host ${CACHING_URL}
bash scripts/ci/npu_ci_install_dependency.sh
# copy required file from our daily cache
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
# download the test set through a proxy
curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
- name: Run test
timeout-minutes: 90
env:
SGLANG_USE_MODELSCOPE: true
SGLANG_IS_IN_CI: true
HF_ENDPOINT: https://hf-mirror.com
TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
run: |
cd test/srt
python3 run_suite.py --suite per-commit-2-ascend-npu
per-commit-4-ascend-npu:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
runs-on: linux-arm64-npu-4
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
# speed up by using infra cache services
CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
pip config set global.index-url http://${CACHING_URL}/pypi/simple
pip config set global.trusted-host ${CACHING_URL}
bash scripts/ci/npu_ci_install_dependency.sh
# copy required file from our daily cache
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
# download the test set through a proxy
curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
- name: Run test
timeout-minutes: 120
env:
SGLANG_USE_MODELSCOPE: true
SGLANG_IS_IN_CI: true
HF_ENDPOINT: https://hf-mirror.com
TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
run: |
cd test/srt
python3 run_suite.py --suite per-commit-4-ascend-npu --timeout-per-file 3600
per-commit-16-ascend-a3:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
runs-on: linux-aarch64-a3-16
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
# speed up by using infra cache services
CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
pip config set global.index-url http://${CACHING_URL}/pypi/simple
pip config set global.trusted-host ${CACHING_URL}
bash scripts/ci/npu_ci_install_dependency.sh
# copy required file from our daily cache
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
# download the test set through a proxy
curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
- name: Run test
timeout-minutes: 90
env:
SGLANG_USE_MODELSCOPE: true
SGLANG_IS_IN_CI: true
HF_ENDPOINT: https://hf-mirror.com
TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
run: |
cd test/srt
python3 run_suite.py --suite per-commit-16-ascend-a3 --timeout-per-file 5400
pr-test-npu-finish:
if: always()
needs:
- per-commit-1-ascend-npu
- per-commit-2-ascend-npu
- per-commit-4-ascend-npu
- per-commit-16-ascend-a3
runs-on: ubuntu-latest
steps:
- name: Check all dependent job statuses
run: |
results=(${{ join(needs.*.result, ' ') }})
for result in "${results[@]}"; do
if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
echo "Job failed with result: $result"
exit 1
fi
done
echo "All jobs completed successfully"
exit 0

.github/workflows/pr-test-pd-router.yml vendored Normal file

@@ -0,0 +1,599 @@
name: PR Test (PD Router)
on:
push:
branches: [ main ]
paths:
- 'python/sglang/srt/disaggregation/**'
- 'scripts/ci/ci_start_disaggregation_servers.sh'
- 'sgl-router/**'
pull_request:
branches: [ main ]
paths:
- 'python/sglang/srt/disaggregation/**'
- 'scripts/ci/ci_start_disaggregation_servers.sh'
- 'sgl-router/**'
workflow_dispatch:
concurrency:
group: test-disaggregation-${{ github.ref }}
cancel-in-progress: true
permissions:
contents: read
pull-requests: write
issues: write
jobs:
test-disaggregation:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
runs-on: [h200]
timeout-minutes: 45
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 10
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: '3.12'
- name: Setup Rust
run: |
bash scripts/ci/ci_install_rust.sh
- name: Cache Rust dependencies
uses: actions/cache@v4
with:
path: |
~/.cargo/bin/
~/.cargo/registry/index/
~/.cargo/registry/cache/
~/.cargo/git/db/
sgl-router/target/
key: ${{ runner.os }}-cargo-${{ hashFiles('sgl-router/Cargo.lock') }}
restore-keys: |
${{ runner.os }}-cargo-
- name: Cache pip dependencies
uses: actions/cache@v4
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('python/pyproject.toml') }}
restore-keys: |
${{ runner.os }}-pip-
- name: Validate environment
run: |
echo "=== System Validation ==="
nvidia-smi
echo "GPU count: $(nvidia-smi -L | wc -l)"
if [ $(nvidia-smi -L | wc -l) -lt 8 ]; then
echo "Error: This test requires at least 8 GPUs"
exit 1
fi
echo "=== GPU Process Check ==="
# Fail fast if any GPU compute processes are active
if command -v nvidia-smi >/dev/null 2>&1; then
# Try to query compute apps first (preferred and concise)
gpu_procs=$(nvidia-smi --query-compute-apps=pid,process_name,gpu_uuid --format=csv,noheader 2>/dev/null | sed '/^$/d' || true)
# Fallback to detailed PIDS report if the query returns nothing but there might still be processes
if [ -z "$gpu_procs" ]; then
gpu_procs=$(nvidia-smi -q -d PIDS 2>/dev/null | awk '/Processes/{flag=1;next}/^$/{flag=0}flag' | sed '/^\s*Processes:/d' | sed '/^\s*$/d' || true)
fi
if [ -n "$gpu_procs" ]; then
echo "Error: Found active GPU processes using the device(s):"
echo "$gpu_procs"
exit 1
else
echo "No active GPU compute processes detected."
fi
else
echo "Error: nvidia-smi not found; skipping GPU process check."
exit 1
fi
echo "=== RDMA Validation ==="
if ! command -v ibv_devices >/dev/null 2>&1; then
echo "Error: InfiniBand tools not found"
exit 1
fi
# Check for active IB devices
found_active_device=false
for device in mlx5_{0..11}; do
if ibv_devinfo $device >/dev/null 2>&1; then
state=$(ibv_devinfo $device | grep "state:" | head -1 | awk '{print $2}')
if [[ "$state" == "PORT_ACTIVE" ]]; then
echo "✓ Found active device: $device"
found_active_device=true
break
fi
fi
done
if [ "$found_active_device" = false ]; then
echo "Error: No active IB devices found"
echo "Available devices:"
ibv_devices || true
exit 1
fi
echo "=== Model Validation ==="
if [ ! -d "/raid/models/meta-llama/Llama-3.1-8B-Instruct" ]; then
echo "Error: Model not found"
ls -la /raid/models/ || echo "No models directory"
exit 1
fi
echo "✓ Model found"
- name: Install SGLang dependencies
run: |
echo "Installing SGLang with all extras..."
python3 -m pip --no-cache-dir install --upgrade pip
python3 -m pip --no-cache-dir install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
python3 -m pip --no-cache-dir install -e "python[all]" --break-system-packages
python3 -m pip --no-cache-dir install mooncake-transfer-engine==0.3.5
python3 -m pip --no-cache-dir install --user --force-reinstall genai-bench==0.0.2
python3 -m pip --no-cache-dir install sgl-kernel==0.3.9.post2
- name: Build and install sgl-router
run: |
source "$HOME/.cargo/env"
echo "Building sgl-router..."
cd sgl-router
cargo build && python3 -m build && pip install --force-reinstall dist/*.whl
- name: Start disaggregation servers
id: start_servers
run: |
echo "Starting disaggregation servers..."
bash scripts/ci/ci_start_disaggregation_servers.sh &
SERVER_PID=$!
echo "server_pid=$SERVER_PID" >> $GITHUB_OUTPUT
# Wait for all 8 servers to be healthy (script already does this)
wait_count=0
while [ $wait_count -lt 30 ]; do
if ps -p $SERVER_PID > /dev/null; then
# Check if the startup script printed success message
sleep 2
wait_count=$((wait_count + 1))
else
# Script exited - check if it was successful
wait $SERVER_PID
exit_code=$?
if [ $exit_code -eq 0 ]; then
echo "✓ All disaggregation servers are healthy"
break
else
echo "Error: Server startup failed with code $exit_code"
exit 1
fi
fi
done
echo "✓ Servers started (PID: $SERVER_PID)"
- name: Test all policies sequentially
timeout-minutes: 30
run: |
POLICIES=("random" "round_robin" "cache_aware" "power_of_two")
BASE_URL="http://127.0.0.9:8000"
# Free commonly used ports for router and metrics
echo "Freeing ports 29000 (metrics) and 8000 (API), if in use..."
fuser -k -n tcp 29000 2>/dev/null || true
fuser -k -n tcp 8000 2>/dev/null || true
sleep 1
for policy in "${POLICIES[@]}"; do
echo ""
echo "=================================================="
echo "Testing policy: $policy"
echo "=================================================="
# Free ports before starting router
fuser -k -n tcp 29000 2>/dev/null || true
fuser -k -n tcp 8000 2>/dev/null || true
# Start router with the current policy
echo "Starting router with policy: $policy..."
RUST_BACKTRACE=1 python3 -m sglang_router.launch_router \
--pd-disaggregation \
--policy "$policy" \
--prefill http://127.0.0.1:30001 9001 \
--prefill http://127.0.0.2:30002 9002 \
--prefill http://127.0.0.3:30003 9003 \
--prefill http://127.0.0.4:30004 9004 \
--decode http://127.0.0.5:30005 \
--decode http://127.0.0.6:30006 \
--decode http://127.0.0.7:30007 \
--decode http://127.0.0.8:30008 \
--host 127.0.0.9 \
--port 8000 &
ROUTER_PID=$!
# Wait for router to become healthy
echo "Waiting for router to become healthy..."
TIMEOUT=60
ELAPSED=0
while [ $ELAPSED -lt $TIMEOUT ]; do
if curl --connect-timeout 5 --silent http://127.0.0.9:8000 > /dev/null 2>&1; then
echo "✓ Router is reachable"
break
fi
if ! ps -p $ROUTER_PID > /dev/null; then
echo "Error: Router process died"
exit 1
fi
sleep 5
ELAPSED=$((ELAPSED + 5))
done
if [ $ELAPSED -ge $TIMEOUT ]; then
echo "Error: Router health check timeout"
kill $ROUTER_PID 2>/dev/null || true
exit 1
fi
# Test API functionality
echo "Testing API completions for $policy..."
response=$(curl -s -X POST "$BASE_URL/v1/chat/completions" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer test-token" \
-d '{
"model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
"messages": [
{"role": "user", "content": "Write a Python function to calculate fibonacci numbers recursively"}
],
"stream": false,
"max_tokens": 100
}')
if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
echo "✓ API test passed for $policy"
else
echo "✗ API test failed for $policy: $response"
kill $ROUTER_PID 2>/dev/null || true
exit 1
fi
# Test streaming
echo "Testing streaming API for $policy..."
stream_response=$(timeout 30 curl -s -X POST "$BASE_URL/v1/chat/completions" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer test-token" \
-d '{
"model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
"messages": [
{"role": "user", "content": "Count from 1 to 5"}
],
"stream": true,
"max_tokens": 50
}')
if echo "$stream_response" | grep -q "data:"; then
echo "✓ Streaming API test passed for $policy"
else
echo "✗ Streaming API test failed for $policy"
kill $ROUTER_PID 2>/dev/null || true
exit 1
fi
# Run genai-bench benchmark
echo "Running genai-bench for $policy..."
genai-bench benchmark \
--api-backend openai \
--api-base "http://127.0.0.9:8000" \
--api-key "dummy-token" \
--api-model-name "/raid/models/meta-llama/Llama-3.1-8B-Instruct" \
--model-tokenizer /raid/models/meta-llama/Llama-3.1-8B-Instruct \
--task text-to-text \
--num-concurrency 64 \
--traffic-scenario "D(8000,2000)" \
--max-requests-per-run 640 \
--max-time-per-run 2 \
--experiment-folder-name "benchmark_${policy}" \
--experiment-base-dir "."
# Find the actual experiment folder
actual_folder=$(find . -maxdepth 1 -name "benchmark_${policy}" -type d | head -1)
if [ -n "$actual_folder" ]; then
# Extract metrics from the Excel summary or JSON files
summary_file="$actual_folder"/*_summary.xlsx
json_files=$(find "$actual_folder" -name "*.json" | grep -v experiment_metadata)
echo "Genai-bench results saved in: $actual_folder"
# Extract mean values and validate performance thresholds
echo "📊 Extracting performance metrics for $policy..."
# Find JSON files excluding experiment metadata
json_files=$(find "$actual_folder" -name "*.json" | grep -v experiment_metadata)
if [ -n "$json_files" ]; then
# Extract metrics using jq and validate against loose thresholds
for json_file in $json_files; do
echo "Processing: $(basename "$json_file")"
# Extract mean values for performance validation
ttft_mean=$(jq -r '.aggregated_metrics.stats.ttft.mean' "$json_file")
e2e_latency_mean=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean' "$json_file")
input_throughput_mean=$(jq -r '.aggregated_metrics.stats.input_throughput.mean' "$json_file")
output_throughput_mean=$(jq -r '.aggregated_metrics.stats.output_throughput.mean' "$json_file")
echo " TTFT mean: ${ttft_mean}s"
echo " E2E Latency mean: ${e2e_latency_mean}s"
echo " Input Throughput mean: ${input_throughput_mean} tokens/s"
echo " Output Throughput mean: ${output_throughput_mean} tokens/s"
# Set mean thresholds (allowing for reasonable variance)
# These can be adjusted based on your performance requirements
ttft_threshold=4.7 # Max 4.7 seconds for mean TTFT
e2e_latency_threshold=35.0 # Max 35.0 seconds for mean E2E latency
input_throughput_threshold=12000 # Min 12000 tokens/s for mean input throughput
output_throughput_threshold=68 # Min 68 tokens/s for mean output throughput
# Validate mean thresholds
validation_passed=true
if (( $(echo "$ttft_mean > $ttft_threshold" | bc -l) )); then
echo "❌ TTFT validation failed: $ttft_mean > $ttft_threshold"
validation_passed=false
fi
if (( $(echo "$e2e_latency_mean > $e2e_latency_threshold" | bc -l) )); then
echo "❌ E2E Latency validation failed: $e2e_latency_mean > $e2e_latency_threshold"
validation_passed=false
fi
if (( $(echo "$input_throughput_mean < $input_throughput_threshold" | bc -l) )); then
echo "❌ Input Throughput validation failed: $input_throughput_mean < $input_throughput_threshold"
validation_passed=false
fi
if (( $(echo "$output_throughput_mean < $output_throughput_threshold" | bc -l) )); then
echo "❌ Output Throughput validation failed: $output_throughput_mean < $output_throughput_threshold"
validation_passed=false
fi
if [ "$validation_passed" = true ]; then
echo "✅ Performance validation passed for $policy"
else
echo "❌ Performance validation failed for $policy"
kill $ROUTER_PID 2>/dev/null || true
exit 1
fi
done
echo "✓ Genai-bench completed successfully for $policy"
echo "📊 Detailed metrics and plots available in: $actual_folder"
else
echo "✗ Benchmark failed for $policy: No JSON results found"
kill $ROUTER_PID 2>/dev/null || true
exit 1
fi
else
echo "✗ Benchmark failed for $policy: Experiment folder not found"
kill $ROUTER_PID 2>/dev/null || true
exit 1
fi
# Stop router before testing next policy
echo "Stopping router for $policy..."
# First try graceful shutdown
kill $ROUTER_PID 2>/dev/null || true
# Wait up to 5 seconds for graceful shutdown
for i in {1..5}; do
if ! ps -p $ROUTER_PID > /dev/null 2>&1; then
echo "Router stopped gracefully"
break
fi
sleep 1
done
# Force kill if still running
if ps -p $ROUTER_PID > /dev/null 2>&1; then
echo "Force killing router..."
kill -9 $ROUTER_PID 2>/dev/null || true
fi
# Short delay to ensure port is released
sleep 2
echo "✓ Completed testing for $policy"
done
echo ""
echo "✅ All policies tested successfully!"
- name: Upload benchmark results
if: success()
uses: actions/upload-artifact@v4
with:
name: genai-bench-results-all-policies
path: benchmark_**/
- name: Cleanup servers
if: always()
run: |
if [ -n "${{ steps.start_servers.outputs.server_pid }}" ]; then
pkill -P ${{ steps.start_servers.outputs.server_pid }} || true
kill ${{ steps.start_servers.outputs.server_pid }} || true
fi
pkill -f "sglang.launch_server" || true
sleep 5
remaining=$(ps aux | grep -c "sglang.launch_server" || echo "0")
echo "Cleanup completed. Remaining processes: $remaining"
summarize-benchmarks:
needs: test-disaggregation
runs-on: ubuntu-latest
if: success()
steps:
- name: Install jq
run: sudo apt-get update && sudo apt-get install -y jq bc
- name: Download benchmark results
uses: actions/download-artifact@v4
with:
name: genai-bench-results-all-policies
- name: List downloaded contents
run: |
echo "Contents after download:"
ls -la
find . -name "benchmark_*" -type d
echo "JSON files found:"
find . -name "*.json" | head -10
- name: Create benchmark summary
run: |
echo "=== DEBUG: Creating benchmark summary ==="
echo "Available benchmark directories:"
find . -name "benchmark_*" -type d
echo "=========================================="
echo "## PD Router Genai-Bench Results Summary" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "🚀 **Benchmarked with genai-bench for comprehensive LLM serving performance evaluation**" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "| Policy | Status | TTFT (s) | E2E Latency (s) | Input Throughput (tok/s) | Output Throughput (tok/s) |" >> $GITHUB_STEP_SUMMARY
echo "|--------|--------|----------|-----------------|--------------------------|---------------------------|" >> $GITHUB_STEP_SUMMARY
# First, complete the table with all policies
for policy in random round_robin cache_aware power_of_two; do
# Find genai-bench result folders for this policy (handle zip extraction structure)
result_folder=$(find . -maxdepth 2 -name "benchmark_${policy}" -type d | head -1)
if [ -z "$result_folder" ]; then
# Try alternative patterns in case of different extraction structure
result_folder=$(find . -maxdepth 3 -path "*benchmark_${policy}*" -type d | head -1)
fi
echo "DEBUG: Policy ${policy} -> Found folder: ${result_folder:-'NOT FOUND'}"
if [ -n "$result_folder" ] && [ -d "$result_folder" ]; then
# Find JSON file with metrics
json_file=$(find "$result_folder" -name "*.json" -not -name "experiment_metadata.json" | head -1)
if [ -n "$json_file" ] && [ -f "$json_file" ]; then
# Extract performance metrics
ttft_mean=$(jq -r '.aggregated_metrics.stats.ttft.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
e2e_latency_mean=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
input_throughput_mean=$(jq -r '.aggregated_metrics.stats.input_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
output_throughput_mean=$(jq -r '.aggregated_metrics.stats.output_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
# Format numbers for display (2 decimal places)
if [ "$ttft_mean" != "N/A" ] && [ "$ttft_mean" != "null" ]; then
ttft_display=$(printf "%.2f" "$ttft_mean" 2>/dev/null || echo "$ttft_mean")
else
ttft_display="N/A"
fi
if [ "$e2e_latency_mean" != "N/A" ] && [ "$e2e_latency_mean" != "null" ]; then
e2e_display=$(printf "%.2f" "$e2e_latency_mean" 2>/dev/null || echo "$e2e_latency_mean")
else
e2e_display="N/A"
fi
if [ "$input_throughput_mean" != "N/A" ] && [ "$input_throughput_mean" != "null" ]; then
input_display=$(printf "%.0f" "$input_throughput_mean" 2>/dev/null || echo "$input_throughput_mean")
else
input_display="N/A"
fi
if [ "$output_throughput_mean" != "N/A" ] && [ "$output_throughput_mean" != "null" ]; then
output_display=$(printf "%.0f" "$output_throughput_mean" 2>/dev/null || echo "$output_throughput_mean")
else
output_display="N/A"
fi
echo "| ${policy} | ✅ Success | $ttft_display | $e2e_display | $input_display | $output_display |" >> $GITHUB_STEP_SUMMARY
else
echo "| ${policy} | ❌ No Data | N/A | N/A | N/A | N/A |" >> $GITHUB_STEP_SUMMARY
fi
else
echo "| ${policy} | ❌ Failed | N/A | N/A | N/A | N/A |" >> $GITHUB_STEP_SUMMARY
fi
done
# Add performance validation summary
echo "" >> $GITHUB_STEP_SUMMARY
echo "## 📊 Performance Validation" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "**Thresholds:** TTFT ≤ 2.0s | E2E Latency ≤ 8.0s | Input Throughput ≥ 10,000 tok/s | Output Throughput ≥ 100 tok/s" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
validation_summary=""
for policy in random round_robin cache_aware power_of_two; do
# Use same robust path finding as above
result_folder=$(find . -maxdepth 2 -name "benchmark_${policy}" -type d | head -1)
if [ -z "$result_folder" ]; then
result_folder=$(find . -maxdepth 3 -path "*benchmark_${policy}*" -type d | head -1)
fi
if [ -n "$result_folder" ] && [ -d "$result_folder" ]; then
json_file=$(find "$result_folder" -name "*.json" -not -name "experiment_metadata.json" | head -1)
if [ -n "$json_file" ] && [ -f "$json_file" ]; then
# Extract metrics for validation
ttft=$(jq -r '.aggregated_metrics.stats.ttft.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
e2e_latency=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
input_throughput=$(jq -r '.aggregated_metrics.stats.input_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
output_throughput=$(jq -r '.aggregated_metrics.stats.output_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
# Check against the thresholds listed in the summary line above
validation_status="✅"
if [ "$ttft" != "N/A" ] && [ "$ttft" != "null" ]; then
if (( $(echo "$ttft > 2.0" | bc -l 2>/dev/null || echo "0") )); then
validation_status="❌"
fi
fi
if [ "$e2e_latency" != "N/A" ] && [ "$e2e_latency" != "null" ]; then
if (( $(echo "$e2e_latency > 24.0" | bc -l 2>/dev/null || echo "0") )); then
validation_status="❌"
fi
fi
if [ "$input_throughput" != "N/A" ] && [ "$input_throughput" != "null" ]; then
if (( $(echo "$input_throughput < 10000" | bc -l 2>/dev/null || echo "0") )); then
validation_status="❌"
fi
fi
if [ "$output_throughput" != "N/A" ] && [ "$output_throughput" != "null" ]; then
if (( $(echo "$output_throughput < 90" | bc -l 2>/dev/null || echo "0") )); then
validation_status="❌"
fi
fi
validation_summary="${validation_summary}- **${policy}**: $validation_status\n"
else
validation_summary="${validation_summary}- **${policy}**: ❌ No data\n"
fi
else
validation_summary="${validation_summary}- **${policy}**: ❌ Failed\n"
fi
done
echo -e "$validation_summary" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "## 📊 Genai-Bench Features Used" >> $GITHUB_STEP_SUMMARY
echo "- **Token-level Performance**: TTFT, TPOT, End-to-End latency" >> $GITHUB_STEP_SUMMARY
echo "- **Throughput Analysis**: Input/Output/Total token throughput" >> $GITHUB_STEP_SUMMARY
echo "- **Statistical Analysis**: Percentiles, mean, std dev for all metrics" >> $GITHUB_STEP_SUMMARY
echo "- **Visual Reports**: Automated plots and Excel summaries" >> $GITHUB_STEP_SUMMARY
echo "- **SGLang Backend**: Native integration with SGLang serving" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "✅ All policies tested successfully with genai-bench!" >> $GITHUB_STEP_SUMMARY

190
.github/workflows/pr-test-rust.yml vendored Normal file
View File

@@ -0,0 +1,190 @@
name: PR Test (Rust)
on:
push:
branches: [ main ]
paths:
- "sgl-router/**"
pull_request:
branches: [ main ]
paths:
- "sgl-router/**"
workflow_dispatch:
concurrency:
group: pr-test-rust-${{ github.ref }}
cancel-in-progress: true
jobs:
unit-test-rust:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_rust.sh
- name: Rust cache
uses: Swatinem/rust-cache@v2
with:
workspaces: sgl-router
- name: Run lint
run: |
source "$HOME/.cargo/env"
cd sgl-router/
cargo clippy --all-targets --all-features -- -D warnings
- name: Run fmt
run: |
source "$HOME/.cargo/env"
cd sgl-router/
cargo fmt -- --check
- name: Run Rust tests
timeout-minutes: 20
run: |
source "$HOME/.cargo/env"
cd sgl-router/
cargo test
- name: Check benchmark compilation
run: |
source "$HOME/.cargo/env"
cd sgl-router/
cargo check --benches
- name: Quick benchmark sanity check
timeout-minutes: 15
run: |
source "$HOME/.cargo/env"
cd sgl-router/
# Run quick benchmarks to ensure they work using Python script
python3 scripts/run_benchmarks.py --quick
pytest-rust:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: BM.A10.4
timeout-minutes: 25
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install rust dependencies
run: |
bash scripts/ci/ci_install_rust.sh
- name: Install SGLang dependencies
run: |
sudo bash scripts/ci/ci_install_dependency.sh
- name: Build python binding
run: |
source "$HOME/.cargo/env"
cd sgl-router
pip install setuptools-rust wheel build
python3 -m build
pip install --force-reinstall dist/*.whl
- name: Run Python unit tests
run: |
cd sgl-router
source "$HOME/.cargo/env"
pip install pytest pytest-cov pytest-xdist
pytest -q py_test/unit --cov=sglang_router --cov-report=term-missing --cov-fail-under=80
- name: Run Python integration tests
run: |
cd sgl-router
source "$HOME/.cargo/env"
# Integration tests use FastAPI/uvicorn for mock workers
pip install fastapi uvicorn orjson
pytest -q -m integration
- name: Run Python E2E tests
run: |
bash scripts/killall_sglang.sh "nuk_gpus"
cd sgl-router
python3 -m pip --no-cache-dir install --upgrade --ignore-installed blinker
python3 -m pip --no-cache-dir install --upgrade --break-system-packages genai-bench==0.0.2
pytest -m e2e -s -vv -o log_cli=true --log-cli-level=INFO
- name: Upload benchmark results
if: success()
uses: actions/upload-artifact@v4
with:
name: genai-bench-results-all-policies
path: sgl-router/benchmark_**/
finish:
needs: [unit-test-rust, pytest-rust]
runs-on: ubuntu-latest
steps:
- name: Finish
run: echo "This is an empty step to ensure that all jobs are completed."
summarize-benchmarks:
needs: pytest-rust
runs-on: ubuntu-latest
if: success()
steps:
- name: Install jq
run: sudo apt-get update && sudo apt-get install -y jq bc
- name: Download benchmark results
uses: actions/download-artifact@v4
with:
name: genai-bench-results-all-policies
- name: List downloaded contents
run: |
echo "Contents after download:"
ls -la
find . -name "benchmark_*" -type d
echo "JSON files found:"
find . -name "*.json" | head -10
- name: Create benchmark summary
run: |
echo "=== DEBUG: Creating benchmark summary ==="
echo "Available benchmark directories:"
find . -name "benchmark_*" -type d || true
echo "=========================================="
echo "## Router E2E Genai-Bench Results Summary" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "Results captured from E2E tests for two scenarios: regular router (2 workers, dp=2) and PD router (2 prefill + 2 decode)." >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "| Scenario | Status | TTFT (s) | E2E Latency (s) | Input Throughput (tok/s) | Output Throughput (tok/s) |" >> $GITHUB_STEP_SUMMARY
echo "|----------|--------|----------|-----------------|--------------------------|---------------------------|" >> $GITHUB_STEP_SUMMARY
scenarios=$'Regular (dp=2, round_robin)|benchmark_round_robin_regular\nPD (2 prefill + 2 decode, round_robin)|benchmark_round_robin_pd'
echo "$scenarios" | sed 's/^\s*//' | while IFS='|' read -r label pattern; do
[ -z "$label" ] && continue
# Find the result folder (handle different extraction layouts)
result_folder=$(find . -maxdepth 3 \( -name "$pattern" -o -path "*${pattern}*" \) -type d | head -1)
if [ -n "$result_folder" ] && [ -d "$result_folder" ]; then
json_file=$(find "$result_folder" -name "*.json" -not -name "experiment_metadata.json" | head -1)
if [ -n "$json_file" ] && [ -f "$json_file" ]; then
ttft_mean=$(jq -r '.aggregated_metrics.stats.ttft.mean' "$json_file")
e2e_latency_mean=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean' "$json_file")
input_throughput_mean=$(jq -r '.aggregated_metrics.stats.input_throughput.mean' "$json_file")
output_throughput_mean=$(jq -r '.aggregated_metrics.stats.output_throughput.mean' "$json_file")
ttft_display=$(printf "%.2f" "$ttft_mean" 2>/dev/null || echo "$ttft_mean")
e2e_display=$(printf "%.2f" "$e2e_latency_mean" 2>/dev/null || echo "$e2e_latency_mean")
input_display=$(printf "%.0f" "$input_throughput_mean" 2>/dev/null || echo "$input_throughput_mean")
output_display=$(printf "%.0f" "$output_throughput_mean" 2>/dev/null || echo "$output_throughput_mean")
echo "| ${label} | ✅ Success | $ttft_display | $e2e_display | $input_display | $output_display |" >> $GITHUB_STEP_SUMMARY
fi
fi
done
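The lint, format, test, and benchmark-compilation steps above map directly onto commands a contributor can run before pushing. A local sketch, assuming a repository checkout with the sgl-router crate and a Rust toolchain on PATH:
#!/usr/bin/env bash
# Local pre-push checks mirroring the CI steps in this workflow.
set -euo pipefail
cd sgl-router/
cargo clippy --all-targets --all-features -- -D warnings   # "Run lint"
cargo fmt -- --check                                        # "Run fmt"
cargo test                                                  # "Run Rust tests"
cargo check --benches                                       # "Check benchmark compilation"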

151
.github/workflows/pr-test-sgl-kernel.yml vendored Normal file
View File

@@ -0,0 +1,151 @@
name: PR Test (sgl-kernel)
on:
push:
branches: [main]
paths:
- "sgl-kernel/**"
pull_request:
branches: [main]
paths:
- "sgl-kernel/**"
workflow_dispatch:
concurrency:
group: pr-test-sgl-kernel-${{ github.ref }}
cancel-in-progress: true
jobs:
lint:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Check clang-format
uses: DoozyX/clang-format-lint-action@v0.18.1
with:
source: sgl-kernel
extensions: h,c,cpp,hpp,cu,cuh,cc
clangFormatVersion: 18
style: file
build-wheels:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: sgl-kernel-build-node
strategy:
matrix:
include:
- python-version: "3.10"
cuda-version: "12.4"
- python-version: "3.10"
cuda-version: "12.8"
- python-version: "3.10"
cuda-version: "12.9"
name: Build Wheel (CUDA ${{ matrix.cuda-version }})
steps:
- name: Cleanup
run: |
sudo rm -rf $GITHUB_WORKSPACE/* || true
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
if: github.event_name != 'push' || (matrix.cuda-version != '12.4' && matrix.cuda-version != '12.8')
run: |
cd sgl-kernel
chmod +x ./build.sh
./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
path: sgl-kernel/dist/*
unit-test:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
needs: build-wheels
runs-on: 1-gpu-runner
steps:
- uses: actions/checkout@v4
- name: Download artifacts
uses: actions/download-artifact@v4
with:
path: sgl-kernel/dist/
merge-multiple: true
pattern: wheel-python3.10-cuda12.9
- name: Install
run: |
bash scripts/ci/ci_install_dependency.sh
pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126 && pip3 install pytest
pip3 uninstall sgl-kernel -y || true
pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
pip3 list | grep sgl-kernel
- name: Run test
timeout-minutes: 30
run: |
cd sgl-kernel
pytest tests/
- name: Uninstall dependencies
run: |
pip3 uninstall sgl-kernel -y
mla-test:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
needs: build-wheels
runs-on: 1-gpu-runner
steps:
- uses: actions/checkout@v4
- name: Download artifacts
uses: actions/download-artifact@v4
with:
path: sgl-kernel/dist/
merge-multiple: true
pattern: wheel-python3.10-cuda12.9
- name: Install
run: |
bash scripts/ci/ci_install_dependency.sh
pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
pip3 uninstall sgl-kernel -y || true
pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
pip3 list | grep sgl-kernel
- name: Run test
timeout-minutes: 30
run: |
cd test/srt
python3 test_mla_deepseek_v3.py
- name: Uninstall dependencies
run: |
pip3 uninstall sgl-kernel -y
finish:
needs: [unit-test, mla-test, lint, build-wheels]
runs-on: ubuntu-latest
steps:
- name: Check all dependent job statuses
run: |
results=(${{ join(needs.*.result, ' ') }})
for result in "${results[@]}"; do
if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
echo "Job failed with result: $result"
exit 1
fi
done
echo "All jobs completed successfully"
exit 0
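The final status-check step relies on GitHub's join(needs.*.result, ' ') expression being rendered into a space-separated list before the shell runs. A sketch with hypothetical results substituted in shows the behavior: only failure or cancelled aborts the gate, so skipped upstream jobs still let it pass.
# Hypothetical expansion of the status-check step after template rendering.
results=(success success skipped)
for result in "${results[@]}"; do
  if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
    echo "Job failed with result: $result"
    exit 1
  fi
done
echo "All jobs completed successfully"   # reached because no result is failure/cancelled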

106
.github/workflows/pr-test-xeon.yml vendored Normal file
View File

@@ -0,0 +1,106 @@
name: PR Test (Xeon)
on:
push:
branches: [ main ]
paths:
- "python/**"
- "scripts/ci/**"
- "test/**"
- "sgl-kernel/**"
- ".github/workflows/pr-test-xeon.yml"
pull_request:
branches: [ main ]
paths:
- "python/**"
- "scripts/ci/**"
- "test/**"
- "sgl-kernel/**"
- ".github/workflows/pr-test-xeon.yml"
workflow_dispatch:
concurrency:
group: pr-test-xeon-${{ github.ref }}
cancel-in-progress: false
jobs:
build-test:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
runs-on: xeon-gnr
env:
HF_HOME: /home/sdp/.cache/huggingface
strategy:
matrix:
build_type: ['all']
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Build and Push
run: |
version=$(cat python/sglang/version.py | cut -d'"' -f2)
tag=v${version}-xeon
docker build . -f docker/Dockerfile.xeon -t sglang_xeon --no-cache
- name: Run container
run: |
docker run -dt \
-v ${{ github.workspace }}:/sglang-checkout/ --ipc=host \
-v ${HF_HOME}:/root/.cache/huggingface \
--name ci_sglang_xeon \
sglang_xeon
- name: Install dependencies
timeout-minutes: 20
run: |
docker exec ci_sglang_xeon bash -c "python3 -m pip install --upgrade pip"
docker exec ci_sglang_xeon pip uninstall sgl-kernel -y || true
docker exec -w /sglang-checkout/sgl-kernel ci_sglang_xeon bash -c "cp pyproject_cpu.toml pyproject.toml && pip install -v ."
docker exec -w /sglang-checkout/ ci_sglang_xeon bash -c "pip install -e "python[dev_cpu]""
- name: Check AMX support
id: check_amx
timeout-minutes: 5
run: |
docker exec -w /sglang-checkout/ ci_sglang_xeon \
bash -c "python3 -c 'import torch; import sgl_kernel; assert torch._C._cpu._is_amx_tile_supported(); assert hasattr(torch.ops.sgl_kernel, \"convert_weight_packed\"); '"
continue-on-error: true
- name: Run unit tests
if: steps.check_amx.outcome == 'success'
timeout-minutes: 36
run: |
docker exec -w /sglang-checkout/ ci_sglang_xeon \
bash -c "cd ./test/srt && python3 run_suite.py --suite per-commit-cpu"
- name: Change permission
timeout-minutes: 2
run: |
docker exec -u root ci_sglang_xeon bash -c "
rm -rf /tmp/ci-home &&
chown -R $(id -u):$(id -g) /sglang-checkout/ 2>/dev/null || true
"
- name: Cleanup container
if: always()
run: |
docker rm -f ci_sglang_xeon || true
pr-test-xeon-finish:
if: always()
needs: [build-test]
runs-on: ubuntu-latest
steps:
- name: Check all dependent job statuses
run: |
results=(${{ join(needs.*.result, ' ') }})
for result in "${results[@]}"; do
if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
echo "Job failed with result: $result"
exit 1
fi
done
echo "All jobs completed successfully"
exit 0
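The AMX probe above can also be run directly on a host to check CPU support outside the container; the same imports and assertions, assuming torch and sgl_kernel are installed in the local python3 environment:
# Quick host-side AMX capability check (exits non-zero if AMX or the packed-weight op is missing).
python3 -c 'import torch, sgl_kernel; assert torch._C._cpu._is_amx_tile_supported(); assert hasattr(torch.ops.sgl_kernel, "convert_weight_packed")'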

437
.github/workflows/pr-test.yml vendored Normal file
View File

@@ -0,0 +1,437 @@
name: PR Test
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
workflow_dispatch:
inputs:
version:
description: "FlashInfer version"
required: true
type: choice
default: 'release'
options:
- 'release'
- 'nightly'
concurrency:
group: pr-test-${{ github.ref }}
cancel-in-progress: true
jobs:
check-changes:
runs-on: ubuntu-latest
outputs:
src: ${{ steps.filter.outputs.src }}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Detect file changes
id: filter
uses: dorny/paths-filter@v3
with:
filters: |
src:
- "python/**"
- "scripts/ci/**"
- "test/**"
- ".github/workflows/pr-test.yml"
unit-test-frontend:
needs: check-changes
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false &&
needs.check-changes.outputs.src == 'true'
runs-on: 1-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 10
run: |
cd test/lang
python3 run_suite.py --suite per-commit
unit-test-backend-1-gpu:
needs: [check-changes, unit-test-frontend]
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false &&
needs.check-changes.outputs.src == 'true'
runs-on: 1-gpu-runner
strategy:
fail-fast: false
matrix:
part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 30
run: |
cd test/srt
python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 10
unit-test-backend-2-gpu:
needs: [check-changes]
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false &&
needs.check-changes.outputs.src == 'true'
runs-on: 2-gpu-runner
strategy:
fail-fast: false
matrix:
part: [0, 1]
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 30
run: |
cd test/srt
python3 run_suite.py --suite per-commit-2-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
unit-test-backend-4-gpu:
needs: [check-changes, unit-test-backend-2-gpu]
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false &&
needs.check-changes.outputs.src == 'true'
runs-on: 4-gpu-runner
strategy:
fail-fast: false
matrix:
part: [0, 1]
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 20
run: |
cd test/srt
python3 run_suite.py --suite per-commit-4-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
unit-test-backend-8-gpu:
needs: [check-changes, unit-test-backend-2-gpu]
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false &&
needs.check-changes.outputs.src == 'true'
runs-on: 8-gpu-runner
strategy:
fail-fast: false
matrix:
part: [0, 1]
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 20
run: |
cd test/srt
python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
performance-test-1-gpu-part-1:
needs: check-changes
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false &&
needs.check-changes.outputs.src == 'true'
runs-on: 1-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
- name: Benchmark single latency
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
- name: Benchmark online latency
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
- name: Benchmark offline throughput
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
- name: Benchmark offline throughput (Non-streaming, small batch size)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
- name: Benchmark online latency (EAGLE)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
- name: Benchmark online latency (LoRA)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency
python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates
performance-test-1-gpu-part-2:
needs: check-changes
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false &&
needs.check-changes.outputs.src == 'true'
runs-on: 1-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
- name: Benchmark offline throughput (w/o RadixAttention)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
- name: Benchmark offline throughput (w/ Triton)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
- name: Benchmark offline throughput (w/ FP8)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
- name: Benchmark VLM offline throughput
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_offline_throughput
- name: Benchmark VLM online latency
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency
performance-test-2-gpu:
needs: [check-changes, unit-test-backend-2-gpu]
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false &&
needs.check-changes.outputs.src == 'true'
runs-on: 2-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
- name: Benchmark single latency (TP=2)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
- name: Benchmark single latency + torch.compile (TP=2)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
- name: Benchmark offline throughput (TP=2)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
- name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
- name: Benchmark offline PP decode throughput (PP=2)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode
- name: Benchmark offline PP prefill throughput (PP=2)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill
accuracy-test-1-gpu:
needs: check-changes
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false &&
needs.check-changes.outputs.src == 'true'
runs-on: 1-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
git clone https://github.com/merrymercy/human-eval.git
cd human-eval
pip install -e .
- name: Evaluate accuracy
timeout-minutes: 20
run: |
cd test/srt
python3 test_eval_accuracy_large.py
accuracy-test-2-gpu:
needs: [check-changes, accuracy-test-1-gpu]
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false &&
needs.check-changes.outputs.src == 'true'
runs-on: 2-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
git clone https://github.com/merrymercy/human-eval.git
cd human-eval
pip install -e .
- name: Evaluate accuracy (TP=2)
timeout-minutes: 20
run: |
cd test/srt
python3 test_moe_eval_accuracy_large.py
unit-test-deepep-4-gpu:
needs: [check-changes, unit-test-backend-2-gpu]
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false &&
needs.check-changes.outputs.src == 'true'
runs-on: 4-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_deepep.sh
- name: Run test
timeout-minutes: 20
run: |
cd test/srt
python3 run_suite.py --suite per-commit-4-gpu-deepep
unit-test-deepep-8-gpu:
needs: [check-changes, unit-test-backend-2-gpu]
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false &&
needs.check-changes.outputs.src == 'true'
runs-on: 8-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_deepep.sh
- name: Run test
timeout-minutes: 20
run: |
cd test/srt
python3 run_suite.py --suite per-commit-8-gpu-deepep
unit-test-backend-8-gpu-b200:
needs: [check-changes, unit-test-backend-2-gpu]
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false &&
needs.check-changes.outputs.src == 'true'
runs-on: b200-runner
strategy:
fail-fast: false
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 20
run: |
cd test/srt
python3 run_suite.py --suite per-commit-8-gpu-b200 --auto-partition-id 0 --auto-partition-size 1
pr-test-finish:
needs: [
check-changes,
unit-test-frontend, unit-test-backend-1-gpu,
unit-test-backend-2-gpu, unit-test-backend-4-gpu, unit-test-backend-8-gpu,
performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
accuracy-test-1-gpu, accuracy-test-2-gpu,
unit-test-deepep-4-gpu, unit-test-deepep-8-gpu,
unit-test-backend-8-gpu-b200,
]
if: always()
runs-on: ubuntu-latest
steps:
- name: Check all dependent job statuses
run: |
results=(${{ join(needs.*.result, ' ') }})
for result in "${results[@]}"; do
if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
echo "Job failed with result: $result"
exit 1
fi
done
echo "All jobs completed successfully"
exit 0
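Several jobs above shard the per-commit suite across runners with --auto-partition-id and --auto-partition-size. run_suite.py's exact partitioning scheme is not part of this commit, but an index-modulo split is one common approach; the sketch below only illustrates that assumption:
# Hypothetical index-modulo sharding: partition i runs every Nth test, offset by i.
tests=(test_a test_b test_c test_d test_e test_f)
partition_id=1
partition_size=3
for i in "${!tests[@]}"; do
  if [ $(( i % partition_size )) -eq "$partition_id" ]; then
    echo "partition $partition_id runs: ${tests[$i]}"
  fi
done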

View File

@@ -0,0 +1,65 @@
name: Release Docker Images Nightly (AMD)
on:
workflow_dispatch:
schedule:
- cron: '0 13 * * *'
concurrency:
# A PR number if a pull request and otherwise the commit hash. This cancels
# queued and in-progress runs for the same PR (presubmit) or commit
# (postsubmit). The workflow name is prepended to avoid conflicts between
# different workflows.
group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
cancel-in-progress: true
jobs:
publish:
if: github.repository == 'sgl-project/sglang'
runs-on: amd-docker-scale
environment: 'prod'
strategy:
matrix:
gpu_arch: ['gfx942', 'gfx942-rocm700', 'gfx950']
build_type: ['all', 'srt']
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: "Set Date"
run: |
echo "DATE=$(date +%Y%m%d)" >> $GITHUB_ENV
- name: Login to Docker Hub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_AMD_USERNAME }}
password: ${{ secrets.DOCKERHUB_AMD_TOKEN }}
- name: Build and Push
run: |
version=$(cat python/sglang/version.py | cut -d'"' -f2)
if [ "${{ matrix.gpu_arch }}" = "gfx942" ]; then
rocm_tag="rocm630-mi30x"
elif [ "${{ matrix.gpu_arch }}" = "gfx942-rocm700" ]; then
rocm_tag="rocm700-mi30x"
elif [ "${{ matrix.gpu_arch }}" = "gfx950" ]; then
rocm_tag="rocm700-mi35x"
else
echo "Unsupported gfx arch"
exit 1
fi
tag=v${version}-${rocm_tag}
if [ "${{ matrix.build_type }}" = "all" ]; then
tag_suffix=""
elif [ "${{ matrix.build_type }}" = "srt" ]; then
tag_suffix="-srt"
else
echo "Unsupported build type"
exit 1
fi
docker build . -f docker/Dockerfile.rocm --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }} -t rocm/sgl-dev:${tag}-${{ env.DATE }}${tag_suffix} --no-cache
docker push rocm/sgl-dev:${tag}-${{ env.DATE }}${tag_suffix}

View File

@@ -0,0 +1,56 @@
name: Release Docker Images (AMD)
on:
push:
branches:
- main
paths:
- "python/sglang/version.py"
workflow_dispatch:
jobs:
publish:
if: github.repository == 'sgl-project/sglang'
runs-on: amd-docker-scale
environment: 'prod'
strategy:
matrix:
gpu_arch: ['gfx942', 'gfx942-rocm700', 'gfx950']
build_type: ['all', 'srt']
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Login to Docker Hub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Build and Push
run: |
version=$(cat python/sglang/version.py | cut -d'"' -f2)
if [ "${{ matrix.gpu_arch }}" = "gfx942" ]; then
rocm_tag="rocm630-mi30x"
elif [ "${{ matrix.gpu_arch }}" = "gfx942-rocm700" ]; then
rocm_tag="rocm700-mi30x"
elif [ "${{ matrix.gpu_arch }}" = "gfx950" ]; then
rocm_tag="rocm700-mi35x"
else
echo "Unsupported gfx arch"
exit 1
fi
tag=v${version}-${rocm_tag}
if [ "${{ matrix.build_type }}" = "all" ]; then
tag_suffix=""
elif [ "${{ matrix.build_type }}" = "srt" ]; then
tag_suffix="-srt"
else
echo "Unsupported build type"
exit 1
fi
docker build . -f docker/Dockerfile.rocm --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache
docker push lmsysorg/sglang:${tag}${tag_suffix}

View File

@@ -0,0 +1,49 @@
name: Build Development Docker Image
on:
workflow_dispatch:
schedule:
- cron: '0 0 * * *'
jobs:
build-dev:
if: ${{ github.repository == 'sgl-project/sglang' }}
runs-on: ubuntu-22.04
strategy:
matrix:
variant:
- version: 12.6.1
type: all
tag: dev
- version: 12.8.1
type: blackwell
tag: blackwell
- version: 12.9.1
type: blackwell
tag: b200-cu129
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Free disk space
uses: jlumbroso/free-disk-space@main
with:
tool-cache: false
docker-images: false
android: true
dotnet: true
haskell: true
large-packages: true
swap-storage: false
- name: Login to Docker Hub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Build and Push Dev Image
run: |
docker buildx build --output type=image,compression=zstd . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache
docker push lmsysorg/sglang:${{ matrix.variant.tag }}

View File

@@ -0,0 +1,36 @@
name: Release Docker Images (GB200)
on:
push:
branches:
- main
paths:
- "python/sglang/version.py"
workflow_dispatch:
jobs:
publish:
if: github.repository == 'sgl-project/sglang'
runs-on: ubuntu-22.04-arm
environment: "prod"
steps:
- name: Delete huge unnecessary tools folder
run: rm -rf /opt/hostedtoolcache
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Build and Push
run: |
version=$(cat python/sglang/version.py | cut -d'"' -f2)
tag=v${version}-cu129-gb200
docker buildx build --platform linux/arm64 --push --output type=image -t lmsysorg/sglang:${tag} -f docker/Dockerfile.gb200 --build-arg CUDA_VERSION=12.9.1 --build-arg BUILD_TYPE=blackwell --no-cache .

View File

@@ -0,0 +1,78 @@
name: Release Docker Images Nightly (Ascend NPU)
on:
pull_request:
branches:
- main
paths:
- ".github/workflows/release-docker-npu-nightly.yml"
- "docker/Dockerfile.npu"
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
concurrency:
group: ${{ github.workflow }}-${{ github.sha }}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-22.04-arm
strategy:
matrix:
cann_version: ["8.2.rc1"]
device_type: ["910b", "a3"]
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Free up disk space
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
with:
tool-cache: true
docker-images: false
- name: Setup Docker buildx
uses: docker/setup-buildx-action@v3
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: |
lmsysorg/sglang
# push with schedule event
# push with workflow_dispatch event
tags: |
type=ref,event=pr
type=ref,event=branch
type=schedule,pattern=main
flavor: |
latest=false
suffix=-cann${{ matrix.cann_version }}-${{ matrix.device_type }},onlatest=true
# Login against a Docker registry except on PR
# https://github.com/docker/login-action
- name: Log into docker hub
uses: docker/login-action@v3
if: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
# Build and push Docker image with Buildx (don't push on PR)
# https://github.com/docker/build-push-action
- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v6
with:
context: docker
file: docker/Dockerfile.npu
# TODO: need to add x86 platform support when memfabric is ready
platforms: linux/arm64
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta.outputs.tags }}
push: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
provenance: false
build-args: |
SGLANG_KERNEL_NPU_TAG=20250901
CANN_VERSION=${{ matrix.cann_version }}
DEVICE_TYPE=${{ matrix.device_type }}

View File

@@ -0,0 +1,74 @@
name: Release Docker Images (Ascend NPU)
on:
push:
tags:
- "*" # Trigger on all tags and filterred by pep440 later
workflow_dispatch:
pull_request:
branches:
- main
paths:
- ".github/workflows/release-docker-npu.yml"
- "docker/Dockerfile.npu"
jobs:
build:
runs-on: ubuntu-22.04-arm
strategy:
matrix:
cann_version: ["8.2.rc1"]
device_type: ["910b", "a3"]
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Free up disk space
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
with:
tool-cache: true
docker-images: false
# push with tag
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: |
lmsysorg/sglang
tags: |
type=ref,event=pr
type=ref,event=tag,suffix=-cann${{ matrix.cann_version }}-${{ matrix.device_type }}
flavor: |
latest=false
# Login against a Docker registry except on PR
# https://github.com/docker/login-action
- name: Login to Docker Hub
uses: docker/login-action@v2
if: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Get version
id: get_version
run: |
version=$(cat python/sglang/version.py | cut -d'"' -f2)
echo "TAG=lmsysorg/sglang:v$version-cann${{ matrix.cann_version }}-${{ matrix.device_type }}" >> $GITHUB_OUTPUT
- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v6
with:
context: docker
file: docker/Dockerfile.npu
# TODO: need to add x86 platform support when memfabric is ready
platforms: linux/arm64
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta.outputs.tags || steps.get_version.outputs.TAG }}
push: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
provenance: false
build-args: |
SGLANG_KERNEL_NPU_TAG=20250901
CANN_VERSION=${{ matrix.cann_version }}
DEVICE_TYPE=${{ matrix.device_type }}

View File

@@ -0,0 +1,30 @@
name: Release SGLang Router Docker Image
on:
push:
branches:
- main
paths:
- "sgl-router/py_src/sglang_router/version.py"
workflow_dispatch:
jobs:
publish:
if: github.repository == 'sgl-project/sglang'
runs-on: ubuntu-24.04
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Login to Docker Hub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Build and Push
run: |
version=$(cat sgl-router/py_src/sglang_router/version.py | cut -d'"' -f2)
tag=v${version}
docker build . -f docker/Dockerfile.router -t lmsysorg/sglang-router:${tag} --no-cache
docker push lmsysorg/sglang-router:${tag}

View File

@@ -0,0 +1,35 @@
name: Release Docker Xeon Images
on:
push:
branches:
- main
paths:
- "python/sglang/version.py"
workflow_dispatch:
jobs:
publish:
if: github.repository == 'sgl-project/sglang'
runs-on: ubuntu-24.04
environment: 'prod'
strategy:
matrix:
build_type: ['all']
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Login to Docker Hub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Build and Push
run: |
version=$(cat python/sglang/version.py | cut -d'"' -f2)
tag=v${version}-xeon
docker build . -f docker/Dockerfile.xeon -t lmsysorg/sglang:${tag} --no-cache
docker push lmsysorg/sglang:${tag}

97
.github/workflows/release-docker.yml vendored Normal file
View File

@@ -0,0 +1,97 @@
name: Release Docker Images
on:
push:
branches:
- main
paths:
- "python/sglang/version.py"
workflow_dispatch:
jobs:
publish:
if: github.repository == 'sgl-project/sglang'
runs-on: ubuntu-latest
environment: 'prod'
strategy:
matrix:
cuda_version: ['12.6.1', '12.8.1', '12.9.1']
build_type: ['all', 'blackwell']
exclude:
- cuda_version: '12.6.1'
build_type: 'blackwell'
- cuda_version: '12.8.1'
build_type: 'all'
- cuda_version: '12.9.1'
build_type: 'all'
steps:
- name: Delete huge unnecessary tools folder
run: rm -rf /opt/hostedtoolcache
- name: Checkout repository
uses: actions/checkout@v4
- name: Free disk space
uses: jlumbroso/free-disk-space@main
with:
tool-cache: false
docker-images: false
android: true
dotnet: true
haskell: true
large-packages: true
swap-storage: false
- name: Login to Docker Hub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Build and Push
run: |
version=$(cat python/sglang/version.py | cut -d'"' -f2)
if [ "${{ matrix.cuda_version }}" = "11.8.0" ]; then
cuda_tag="cu118"
elif [ "${{ matrix.cuda_version }}" = "12.1.1" ]; then
cuda_tag="cu121"
elif [ "${{ matrix.cuda_version }}" = "12.4.1" ]; then
cuda_tag="cu124"
elif [ "${{ matrix.cuda_version }}" = "12.5.1" ]; then
cuda_tag="cu125"
elif [ "${{ matrix.cuda_version }}" = "12.6.1" ]; then
cuda_tag="cu126"
elif [ "${{ matrix.cuda_version }}" = "12.8.1" ]; then
cuda_tag="cu128"
elif [ "${{ matrix.cuda_version }}" = "12.9.1" ]; then
cuda_tag="cu129"
else
echo "Unsupported CUDA version"
exit 1
fi
tag=v${version}-${cuda_tag}
if [ "${{ matrix.build_type }}" = "all" ]; then
tag_suffix=""
elif [ "${{ matrix.build_type }}" = "srt" ]; then
tag_suffix="-srt"
elif [ "${{ matrix.build_type }}" = "blackwell" ]; then
tag_suffix="-b200"
else
echo "Unsupported build type"
exit 1
fi
docker buildx build --output type=image,compression=zstd . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache
docker push lmsysorg/sglang:${tag}${tag_suffix}
if [ "${{ matrix.cuda_version }}" = "12.6.1" ]; then
docker tag lmsysorg/sglang:${tag}${tag_suffix} lmsysorg/sglang:latest${tag_suffix}
docker push lmsysorg/sglang:latest${tag_suffix}
fi
if [ "${{ matrix.cuda_version }}" = "12.9.1" ]; then
docker tag lmsysorg/sglang:${tag}${tag_suffix} lmsysorg/sglang:v${version}
docker push lmsysorg/sglang:v${version}
fi
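The image tag published by this workflow can be reproduced locally with the same version and CUDA-tag mapping; a sketch assuming it is run from the repository root:
# Derive the tag the workflow would push for one matrix entry.
cuda_version="12.8.1"
build_type="blackwell"
version=$(cat python/sglang/version.py | cut -d'"' -f2)
case "$cuda_version" in
  12.6.1) cuda_tag="cu126" ;;
  12.8.1) cuda_tag="cu128" ;;
  12.9.1) cuda_tag="cu129" ;;
  *) echo "Unsupported CUDA version"; exit 1 ;;
esac
if [ "$build_type" = "blackwell" ]; then tag_suffix="-b200"; else tag_suffix=""; fi
echo "lmsysorg/sglang:v${version}-${cuda_tag}${tag_suffix}"   # e.g. lmsysorg/sglang:v0.5.2-cu128-b200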

65
.github/workflows/release-docs.yml vendored Normal file
View File

@@ -0,0 +1,65 @@
name: Release Documentation
on:
push:
branches:
- main
paths:
- "docs/**"
- "python/sglang/version.py"
- "python/sglang/**"
workflow_dispatch:
concurrency:
group: release-docs-${{ github.ref }}
cancel-in-progress: true
jobs:
execute-and-deploy:
runs-on: 1-gpu-runner
if: github.repository == 'sgl-project/sglang'
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
pip install -r docs/requirements.txt
apt-get update && apt-get install -y pandoc parallel retry
ln -sf "$(which python3)" /usr/bin/python
- name: Setup Jupyter Kernel
run: |
python -m ipykernel install --user --name python3 --display-name "Python 3"
- name: Execute notebooks
timeout-minutes: 40
run: |
cd docs
make clean
make compile
- name: Push HTML to sgl-project.github.io
timeout-minutes: 60
env:
GITHUB_TOKEN: ${{ secrets.DOCUMENTATION_PAT_TOKEN }}
run: |
cd docs
make html
python3 wrap_run_llm.py
cd _build/html
git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1
find ../sgl-project.github.io/ -mindepth 1 -not -path "../sgl-project.github.io/.git*" -not -name CNAME -not -name ".jekyll" -not -name ".nojekyll" -delete
cp -r * ../sgl-project.github.io
cp ../../README.md ../sgl-project.github.io/README.md
cd ../sgl-project.github.io
git config user.name "zhaochenyang20"
git config user.email "zhaochenyang20@gmail.com"
git add .
git commit -m "Update $(date +'%Y-%m-%d %H:%M:%S')"
git push https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git main
cd ..
rm -rf sgl-project.github.io
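The notebook execution and HTML build can be reproduced locally with the same make targets; a sketch assuming docs/requirements.txt and pandoc are already installed:
# Local docs build mirroring the "Execute notebooks" and HTML steps above.
cd docs
make clean
make compile        # executes the notebooks
make html           # builds the static site into docs/_build/html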

35
.github/workflows/release-fake-tag.yml vendored Normal file
View File

@@ -0,0 +1,35 @@
name: Release Fake Tag
on:
push:
branches:
- main
paths:
- "python/sglang/version.py"
workflow_dispatch:
permissions:
contents: write
jobs:
publish:
if: github.repository == 'sgl-project/sglang'
runs-on: ubuntu-latest
environment: 'prod'
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Get version
id: get_version
run: |
version=$(cat python/sglang/version.py | cut -d'"' -f2)
echo "TAG=v$version" >> $GITHUB_OUTPUT
- name: Create and push fake tag
env:
GITHUB_TOKEN: ${{ secrets.REPO_TOKEN }}
run: |
git config user.name zhyncs
git config user.email me@zhyncs.com
git checkout -b ${{ steps.get_version.outputs.TAG }}
git push --set-upstream origin ${{ steps.get_version.outputs.TAG }}

View File

@@ -0,0 +1,119 @@
# Reference: https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/.github/workflows/build_wheels.yml#L1
name: Release SGLang Router to PyPI
on:
push:
branches:
- main
paths:
- sgl-router/pyproject.toml
workflow_dispatch:
jobs:
build:
name: Build on ${{ matrix.os }} (${{ matrix.target }})
runs-on: ${{ matrix.os }}-latest
strategy:
fail-fast: false
matrix:
include:
- os: ubuntu
target: x86_64
steps:
- uses: actions/checkout@v4
with:
path: sglang-repo
- name: Move sgl-router folder to root and delete sglang-repo
run: |
mv sglang-repo/sgl-router/* .
rm -rf sglang-repo
ls -alt
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install build dependencies
run: |
python -m pip install -U pip
python -m pip install build twine auditwheel
- name: Build package
uses: pypa/cibuildwheel@v2.21.3
env:
CIBW_BUILD: "cp38-manylinux_x86_64 cp39-manylinux_x86_64 cp310-manylinux_x86_64 cp311-manylinux_x86_64 cp312-manylinux_x86_64"
CIBW_BEFORE_ALL: |
yum update -y && yum install -y openssl-devel wget unzip && \
# Install latest protoc (v32.0) that supports proto3
cd /tmp && \
wget https://github.com/protocolbuffers/protobuf/releases/download/v32.0/protoc-32.0-linux-x86_64.zip && \
unzip protoc-32.0-linux-x86_64.zip -d /usr/local && \
rm protoc-32.0-linux-x86_64.zip && \
# Install Rust
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
CIBW_ENVIRONMENT: "PATH=$HOME/.cargo/bin:$PATH"
- name: List built packages
run: ls -lh wheelhouse/
- name: Check packages
run: twine check --strict wheelhouse/*
- uses: actions/upload-artifact@v4
with:
name: packages-${{ matrix.os }}-${{ matrix.target }}
path: wheelhouse/
build-sdist:
name: Build SDist
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
path: sglang-repo
- name: Move sgl-router folder to root, copy the license file, and delete sglang-repo
run: |
mv sglang-repo/sgl-router/* .
mv sglang-repo/LICENSE .
rm -rf sglang-repo
ls -alt
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Build SDist
run: |
pip install build
python -m pip install -U packaging
python -m build --sdist
- uses: actions/upload-artifact@v4
with:
name: sdist
path: dist/*.tar.gz
upload:
name: Upload to PyPI
if: github.repository == 'sgl-project/sglang' # Ensure this job only runs for the sgl-project/sglang repository
needs: [build, build-sdist]
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v4
with:
path: dist
merge-multiple: true
- name: Upload to PyPI
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN_ROUTER }}
run: |
pip install twine
twine upload dist/* --verbose

31
.github/workflows/release-pypi.yml vendored Normal file
View File

@@ -0,0 +1,31 @@
name: Release PyPI
on:
push:
branches:
- main
paths:
- "python/sglang/version.py"
workflow_dispatch:
jobs:
publish:
if: github.repository == 'sgl-project/sglang'
runs-on: ubuntu-latest
environment: "prod"
steps:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Checkout repository
uses: actions/checkout@v4
- name: Upload to pypi
run: |
cd python
cp ../README.md ../LICENSE .
pip install build
python3 -m build
pip install twine
python3 -m twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}

View File

@@ -0,0 +1,92 @@
name: Release SGLang Kernel Wheel (cu118)
on:
workflow_dispatch:
inputs:
tag_name:
type: string
push:
branches:
- main
paths:
- sgl-kernel/python/sgl_kernel/version.py
jobs:
build-wheels:
if: github.repository == 'sgl-project/sglang'
runs-on: sgl-kernel-release-node
strategy:
matrix:
python-version: ["3.9"]
cuda-version: ["11.8"]
steps:
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Build wheels for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
run: |
cd sgl-kernel
chmod +x ./build.sh
./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
path: sgl-kernel/dist/*
release:
needs: build-wheels
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Download artifacts
uses: actions/download-artifact@v4
with:
path: sgl-kernel/dist/
merge-multiple: true
pattern: wheel-*
- name: Set tag name
id: set_tag_name
run: |
if [ -z "${{ inputs.tag_name }}" ]; then
TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)"
echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT
else
echo "tag_name=${{ inputs.tag_name }}" >> $GITHUB_OUTPUT
fi
- name: Release
uses: softprops/action-gh-release@v2
with:
tag_name: ${{ steps.set_tag_name.outputs.tag_name }}
repository: sgl-project/whl
token: ${{ secrets.WHL_TOKEN }}
files: |
sgl-kernel/dist/*
- name: Clone wheel index
run: git clone https://oauth2:${WHL_TOKEN}@github.com/sgl-project/whl.git sgl-whl
env:
WHL_TOKEN: ${{ secrets.WHL_TOKEN }}
- name: Update wheel index
run: python3 scripts/update_kernel_whl_index.py
- name: Push wheel index
run: |
cd sgl-whl
git config --local user.name "github-actions[bot]"
git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
git add -A
git commit -m "update whl index"
git push

283
.github/workflows/release-whl-kernel.yml vendored Normal file
View File

@@ -0,0 +1,283 @@
name: Release SGLang Kernels
on:
push:
branches:
- main
paths:
- sgl-kernel/python/sgl_kernel/version.py
workflow_dispatch:
inputs:
tag_name:
type: string
required: false
concurrency:
group: release-sglang-kernels-${{ github.ref }}
cancel-in-progress: true
jobs:
build-cu129:
if: github.repository == 'sgl-project/sglang'
runs-on: sgl-kernel-release-node
strategy:
matrix:
python-version: ["3.10"]
cuda-version: ["12.9"]
steps:
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Build wheels
run: |
cd sgl-kernel
chmod +x ./build.sh
./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
- name: Upload to PyPI
working-directory: sgl-kernel
run: |
pip install twine
python3 -m twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
build-cu124:
if: github.repository == 'sgl-project/sglang'
needs: build-cu129
runs-on: sgl-kernel-release-node
strategy:
matrix:
python-version: ["3.10"]
cuda-version: ["12.4"]
steps:
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Build wheels
run: |
cd sgl-kernel
chmod +x ./build.sh
./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
path: sgl-kernel/dist/*
release-cu124:
needs: build-cu124
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Download artifacts
uses: actions/download-artifact@v4
with:
path: sgl-kernel/dist/
merge-multiple: true
pattern: wheel-*
- name: Set tag name
id: set_tag_name
run: |
if [ -z "${{ inputs.tag_name }}" ]; then
TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)"
echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT
else
echo "tag_name=${{ inputs.tag_name }}" >> $GITHUB_OUTPUT
fi
- name: Release
uses: softprops/action-gh-release@v2
with:
tag_name: ${{ steps.set_tag_name.outputs.tag_name }}
repository: sgl-project/whl
token: ${{ secrets.WHL_TOKEN }}
files: |
sgl-kernel/dist/*
- name: Clone wheel index
run: git clone https://oauth2:${WHL_TOKEN}@github.com/sgl-project/whl.git sgl-whl
env:
WHL_TOKEN: ${{ secrets.WHL_TOKEN }}
- name: Update wheel index
run: python3 scripts/update_kernel_whl_index.py --cuda 124
- name: Push wheel index
run: |
cd sgl-whl
git config --local user.name "github-actions[bot]"
git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
git add -A
git commit -m "update whl index"
git push
build-cu128:
if: github.repository == 'sgl-project/sglang'
needs: build-cu129
runs-on: sgl-kernel-release-node
strategy:
matrix:
python-version: ["3.10"]
cuda-version: ["12.8"]
steps:
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Build wheels
run: |
cd sgl-kernel
chmod +x ./build.sh
./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
path: sgl-kernel/dist/*
release-cu128:
needs: build-cu128
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Download artifacts
uses: actions/download-artifact@v4
with:
path: sgl-kernel/dist/
merge-multiple: true
pattern: wheel-*
- name: Set tag name
id: set_tag_name
run: |
if [ -z "${{ inputs.tag_name }}" ]; then
TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)"
echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT
else
echo "tag_name=${{ inputs.tag_name }}" >> $GITHUB_OUTPUT
fi
- name: Release
uses: softprops/action-gh-release@v2
with:
tag_name: ${{ steps.set_tag_name.outputs.tag_name }}
repository: sgl-project/whl
token: ${{ secrets.WHL_TOKEN }}
files: |
sgl-kernel/dist/*
- name: Clone wheel index
run: git clone https://oauth2:${WHL_TOKEN}@github.com/sgl-project/whl.git sgl-whl
env:
WHL_TOKEN: ${{ secrets.WHL_TOKEN }}
- name: Update wheel index
run: python3 scripts/update_kernel_whl_index.py --cuda 128
- name: Push wheel index
run: |
cd sgl-whl
git config --local user.name "github-actions[bot]"
git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
git add -A
git commit -m "update whl index"
git push
build-cu129-aarch64:
if: github.repository == 'sgl-project/sglang'
runs-on: sgl-kernel-release-node-arm
strategy:
matrix:
python-version: ["3.10"]
cuda-version: ["12.9"]
steps:
- uses: actions/checkout@v4
with:
submodules: "recursive"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Build wheels
run: |
cd sgl-kernel
chmod +x ./build.sh
./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}" aarch64
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}-aarch64
path: sgl-kernel/dist/*
release-cu129-aarch64:
needs: build-cu129-aarch64
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Download artifacts
uses: actions/download-artifact@v4
with:
path: sgl-kernel/dist/
merge-multiple: true
pattern: wheel-*
- name: Set tag name
id: set_tag_name
run: |
if [ -z "${{ inputs.tag_name }}" ]; then
TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)"
echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT
else
echo "tag_name=${{ inputs.tag_name }}" >> $GITHUB_OUTPUT
fi
- name: Release
uses: softprops/action-gh-release@v2
with:
tag_name: ${{ steps.set_tag_name.outputs.tag_name }}
repository: sgl-project/whl
token: ${{ secrets.WHL_TOKEN }}
files: |
sgl-kernel/dist/*
- name: Clone wheel index
run: git clone https://oauth2:${WHL_TOKEN}@github.com/sgl-project/whl.git sgl-whl
env:
WHL_TOKEN: ${{ secrets.WHL_TOKEN }}
- name: Update wheel index
run: python3 scripts/update_kernel_whl_index.py --cuda 129
- name: Push wheel index
run: |
cd sgl-whl
git config --local user.name "github-actions[bot]"
git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
git add -A
git commit -m "update whl index"
git push

View File

@@ -0,0 +1,43 @@
name: VLLM Dependency Test
on:
push:
branches: [ main ]
paths:
- "python/**"
- "scripts/ci/**"
- "test/**"
pull_request:
branches: [ main ]
paths:
- "python/**"
- "scripts/ci/**"
- "test/**"
concurrency:
group: vllm-dependency-test-${{ github.ref }}
cancel-in-progress: true
jobs:
vllm-dependency-test:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
runs-on: 1-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/ci_install_dependency.sh
pip install "bitsandbytes>=0.44.0"
pip install "sgl-kernel==0.3.7"
- name: Run vLLM dependency tests
timeout-minutes: 60
run: |
export SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK=1
cd test/srt
python3 run_suite.py --suite vllm_dependency_test --timeout-per-file 3600

240
.gitignore vendored Normal file
View File

@@ -0,0 +1,240 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
# Tokenizer cache for tests
.tokenizer_cache/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# MacOS
.DS_Store
# Vim
*.swp
# Documentation
docs/_build
# SGL
benchmark/mmlu/data
benchmark/mmlu/data.tar
benchmark/llava_bench/images
benchmark/llava_bench/mme_pack
*.jsonl
tmp*.txt
# Plots
*.png
*.pdf
# personnal
work_dirs/
*.csv
!logo.png
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
compile_commands.json
*.iml
# VSCode
.vscode
1
# Autoenv
.env.leave
# Rust lib
Cargo.lock
lmms-eval

3
.isort.cfg Normal file
View File

@@ -0,0 +1,3 @@
[settings]
profile=black
known_first_party=sglang

59
.pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,59 @@
default_stages: [pre-commit, pre-push, manual]
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: check-symlinks
- id: destroyed-symlinks
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
args: [--allow-multiple-documents]
- id: check-toml
- id: check-ast
- id: check-added-large-files
- id: check-merge-conflict
- id: check-shebang-scripts-are-executable
- id: detect-private-key
- id: debug-statements
- id: no-commit-to-branch
- repo: https://github.com/PyCQA/isort
rev: 5.13.2
hooks:
- id: isort
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
hooks:
- id: ruff
args: [--select=F401, --fixable=F401]
files: ^(benchmark/|docs/|examples/)
exclude: \.ipynb$
- repo: https://github.com/psf/black
rev: 24.10.0
hooks:
- id: black-jupyter
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
- id: codespell
additional_dependencies: ['tomli']
args: ['--toml', 'python/pyproject.toml', '-L', 'cann,thi,makro,wil,rouge']
exclude: |
(?x)^(
test/srt/test_reasoning_parser\.py|
docs/advanced_features/vlm_query\.ipynb
)$
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.8
hooks:
- id: clang-format
types_or: [c++, cuda]
args: [--style=file, --verbose]
- repo: https://github.com/kynan/nbstripout
rev: 0.8.1
hooks:
- id: nbstripout
args:
- '--keep-output'
- '--extra-keys=metadata.kernelspec metadata.language_info.version'

425
3rdparty/amd/profiling/PROFILING.md vendored Normal file
View File

@@ -0,0 +1,425 @@
## Profiling SGLang Infer System with AMD GPUs
This AppNote describes SGLang profiling techniques, code augmentation, and running steps for systems with AMD Instinct GPUs; the same procedure may also work with Nvidia GPUs.
Detailed examples and steps are provided to make the results easy to reproduce and to help localize performance problems for optimization.
Two primary methods are covered:
- [RPD](https://github.com/ROCm/rocmProfileData.git)
- [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
### Profiling SGLang Infer System with RPD Profiler
The RPD profiler is a low-overhead, cross-platform profiler, so the same RPD code augmentation works for profiling on ROCm/AMD GPUs as well as on CUDA/Nvidia GPUs. To profile the SGLang repository with RPD, use the scripts and patch files included in this directory and follow the steps below:
1. Install RPD using install_rpd.sh, which applies rpd.patch during installation; both files are in this directory.
install_rpd.sh
```bash
# download and install RPD
apt update && apt install -y sqlite3 libsqlite3-dev libfmt-dev
# install rpd module
git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData
cd rocmProfileData
git checkout 976899e9c6dbc6dd2bccf770818e4e44125590ac
git apply rpd.patch
make && make install
cd rocpd_python && python setup.py install && cd ..
cd rpd_tracer && make clean;make install && python setup.py install && cd ..
```
rpd.patch
```bash
diff --git a/rpd_tracer/Makefile b/rpd_tracer/Makefile
index e9d9feb..b2e9e1a 100644
--- a/rpd_tracer/Makefile
+++ b/rpd_tracer/Makefile
@@ -16,7 +16,7 @@ ifneq (,$(HIP_PATH))
$(info Building with roctracer)
RPD_LIBS += -L/opt/rocm/lib -lroctracer64 -lroctx64 -lamdhip64 -lrocm_smi64
RPD_INCLUDES += -I/opt/rocm/include -I/opt/rocm/include/roctracer -I/opt/rocm/include/hsa
- RPD_SRCS += RoctracerDataSource.cpp RocmSmiDataSource.cpp
+ RPD_SRCS += RoctracerDataSource.cpp
RPD_INCLUDES += -D__HIP_PLATFORM_AMD__
endif
```
2. Copy the loadTracer.sh file included in this directory to /sglang/python/sglang.
loadTracer.sh
```bash
#!/bin/bash
################################################################################
# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
################################################################################
OUTPUT_FILE="trace.rpd"
if [ "$1" = "-o" ] ; then
OUTPUT_FILE=$2
shift
shift
fi
if [ -e ${OUTPUT_FILE} ] ; then
rm ${OUTPUT_FILE}
fi
python3 -m rocpd.schema --create ${OUTPUT_FILE}
if [ $? != 0 ] ; then
echo "Error: Could not create rpd file. Please run 'python setup.py install' from the rocpd_python dir"
exit
fi
export RPDT_FILENAME=${OUTPUT_FILE}
export RPDT_AUTOSTART=0
LD_PRELOAD=librocm-smi_64:librpd_tracer.so "$@"
```
3. Apply the patch provided in this directory with "git apply rpd_profile_server_enable.patch" if the main profiling goal is to capture GPU kernel information along with limited CPU activity.
#### Common Notes 1
Note that although the example uses TP=8, the patch file deliberately logs RPD profiling on only 2 ranks (i.e. tp_rank=0/1) for profiling/visualization convenience, since even Perfetto streaming mode can load at most an 8GB JSON file. With 2 ranks logged we can still check for cross-rank issues (e.g. load imbalance or NCCL problems), while logging a relatively longer time window before the JSON file generated from the RPD file reaches the 8GB limit.
rpd_profile_server_enable.patch
```bash
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 62d1ff9..9021c01 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -71,6 +71,8 @@ from sglang.srt.utils import (
suppress_other_loggers,
)
from sglang.utils import get_exception_traceback
+from rpdTracerControl import rpdTracerControl
+rpdTracerControl.skipCreate()
logger = logging.getLogger(__name__)
@@ -245,6 +247,7 @@ class Scheduler:
],
with_stack=True,
)
+ self.rpd = rpdTracerControl()
@torch.inference_mode()
def event_loop(self):
@@ -1027,15 +1030,24 @@ class Scheduler:
def start_profile(self) -> None:
if self.profiler is None:
raise RuntimeError("Profiler is not enabled.")
- self.profiler.start()
+ #self.profiler.start() #block pytorch profiler for rpd profiler enabling
+ if self.tp_rank == 0 or self.tp_rank == 1:
+ self.rpd.start()
+ self.rpd.rangePush("", "rpd profile range", "")
+ logger.info("rpd is enabled")
def stop_profile(self) -> None:
if self.profiler is None:
raise RuntimeError("Profiler is not enabled.")
- self.profiler.stop()
- self.profiler.export_chrome_trace(
- self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
- )
+ #self.profiler.stop()
+ #self.profiler.export_chrome_trace(
+ # self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+ #)
+ if self.tp_rank ==0 or self.tp_rank ==1:
+ self.rpd.rangePop()
+ self.rpd.stop()
+ self.rpd.flush()
+ logger.info("rpd is done")
logger.info("Profiler is done")
```
#### Advanced Debugging with RPD Profiler
Sometimes we want the RPD profiler to capture more CPU and Python activity in order to debug challenging issues (e.g. the root cause of load imbalance across GPU processes, or the root cause of bubbles). Only in such cases, apply the patch with "git apply rpd_profile_server_enable_wCPU_activities.patch", which modifies 3 files.
rpd_profile_server_enable_wCPU_activities.patch
```bash
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 62d1ff9..2edb427 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -71,6 +71,8 @@ from sglang.srt.utils import (
suppress_other_loggers,
)
from sglang.utils import get_exception_traceback
+from rpdTracerControl import rpdTracerControl
+rpdTracerControl.skipCreate()
logger = logging.getLogger(__name__)
@@ -245,6 +247,7 @@ class Scheduler:
],
with_stack=True,
)
+ self.rpd = rpdTracerControl()
@torch.inference_mode()
def event_loop(self):
@@ -1027,15 +1030,26 @@ class Scheduler:
def start_profile(self) -> None:
if self.profiler is None:
raise RuntimeError("Profiler is not enabled.")
- self.profiler.start()
+ #self.profiler.start()
+ logger.info("torch profiler is disabled")
+ if self.tp_rank == 0 or self.tp_rank == 1:
+ self.rpd.setPythonTrace(True)
+ self.rpd.start()
+ self.rpd.rangePush("", "scheduler", "")
+ logger.info("rpd is enabled inside scheduler profiling")
def stop_profile(self) -> None:
if self.profiler is None:
raise RuntimeError("Profiler is not enabled.")
- self.profiler.stop()
- self.profiler.export_chrome_trace(
- self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
- )
+ #self.profiler.stop()
+ #self.profiler.export_chrome_trace(
+ # self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+ #)
+ if self.tp_rank ==0 or self.tp_rank ==1:
+ self.rpd.rangePop()
+ self.rpd.stop()
+ self.rpd.flush()
+ logger.info("rpd is done inside scheduler")
logger.info("Profiler is done")
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index 2621ccd..181df85 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -58,6 +58,10 @@ from sglang.srt.sampling.sampling_params import SamplingParams
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import is_generation_model, is_multimodal_model
+from rpdTracerControl import rpdTracerControl
+rpdTracerControl.skipCreate()
+
+
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
logger = logging.getLogger(__name__)
@@ -514,10 +518,20 @@ class TokenizerManager:
self.send_to_scheduler.send_pyobj(req)
def start_profile(self):
+ rpd = rpdTracerControl()
+ rpd.setPythonTrace(True)
+ rpd.start()
+ rpd.rangePush("", "tokenizer_manager", "")
+ logger.info("tokenizer_manager rpd profiling started!")
req = ProfileReq.START_PROFILE
self.send_to_scheduler.send_pyobj(req)
def stop_profile(self):
+ rpd = rpdTracerControl()
+ rpd.rangePop()
+ rpd.stop()
+ rpd.flush()
+ logger.info("rpd profiling is done inside tokenizer_manager!")
req = ProfileReq.STOP_PROFILE
self.send_to_scheduler.send_pyobj(req)
diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
index 7111c93..2bd722c 100644
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -30,6 +30,8 @@ import threading
import time
from http import HTTPStatus
from typing import Dict, List, Optional, Union
+from rpdTracerControl import rpdTracerControl
+rpdTracerControl.skipCreate()
# Fix a bug of Python threading
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -152,6 +154,11 @@ async def flush_cache():
@app.post("/start_profile")
async def start_profile():
"""Start profiling."""
+ rpd = rpdTracerControl()
+ rpd.setPythonTrace(True)
+ rpd.start()
+ rpd.rangePush("", "server rpd profile range", "")
+ logger.info("rpd profiling started in server.py!")
tokenizer_manager.start_profile()
return Response(
content="Start profiling.\n",
@@ -164,6 +171,11 @@ async def start_profile():
async def stop_profile():
"""Stop profiling."""
tokenizer_manager.stop_profile()
+ rpd = rpdTracerControl()
+ rpd.rangePop()
+ rpd.stop()
+ rpd.flush()
+ logger.info("rpd profiling is done in server.py!")
return Response(
content="Stop profiling. This will take some time.\n",
status_code=200,
```
4. As an example for grok1 profiling, create a dummy_grok1 directory containing the config.json shown below, and copy it to the path referenced by "--model-path" if you want to use the example server.sh file provided.
```bash
cat ../dummy_grok1/config.json
{
"architectures": [
"Grok1ModelForCausalLM"
],
"embedding_multiplier_scale": 78.38367176906169,
"output_multiplier_scale": 0.5773502691896257,
"vocab_size": 131072,
"hidden_size": 6144,
"intermediate_size": 32768,
"max_position_embeddings": 8192,
"num_experts_per_tok": 2,
"num_local_experts": 8,
"num_attention_heads": 48,
"num_hidden_layers": 64,
"num_key_value_heads": 8,
"head_dim": 128,
"rms_norm_eps": 1e-05,
"rope_theta": 10000.0,
"model_type": "mixtral",
"torch_dtype": "bfloat16"
}
```
5. Launch the server with the RPD-enabled script ./server.sh in one terminal inside the Docker container.
#### Common Notes 2
- Remember to change --model-path to the correct path.
- loadTracer.sh is required for RPD profiling.
- SGLANG_TORCH_PROFILER_DIR is used by the default torch profiler.
- Do not use loadTracer.sh if you are using the torch profiler; simply use python3 -m sglang.launch_server.
server.sh
```bash
#!/bin/bash
# export SGLANG_TORCH_PROFILER_DIR=/data/sglang/
export SGLANG_TORCH_PROFILER_DIR=/sgl-workspace/sglang/profile/
# Get the current timestamp
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
# Define the log file with a timestamp
LOGFILE="sglang_server_log_$TIMESTAMP.json"
# Run the Python command and save the output to the log file
loadTracer.sh python3 -m sglang.launch_server \
--model-path /sgl-workspace/sglang/dummy_grok1 \
--tokenizer-path Xenova/grok-1-tokenizer \
--load-format dummy \
--quantization fp8 \
--tp 8 \
--port 30000 \
--disable-radix-cache 2>&1 | tee "$LOGFILE"
```
6. Open another terminal in the same Docker container and run the RPD-enabled ./client.sh after the server-side terminal prints "The server is fired up and is ready to roll!".
#### Common Notes 3
- Use curl http://localhost:30000/start_profile and curl http://localhost:30000/stop_profile to control the start and end of profiling. Check sglang/python/sglang/srt/managers/scheduler.py for more details.
- Do not use the RPD profiler together with the PyTorch profiler, to avoid interference.
- The rocmProfileData/tools/rpd2tracing.py script generates a JSON file from the RPD file.
client.sh
```bash
#!/bin/bash
# Start profiling via API
curl http://localhost:30000/start_profile -H "Content-Type: application/json"
# Benchmark serving using sglang with random dataset and tokenizer
# Define the log file with a timestamp
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOGFILE="sglang_client_log_$TIMESTAMP.json"
# Run the benchmark with specified parameters and save logs
python3 -m sglang.bench_serving \
--backend sglang \
--tokenizer Xenova/grok-1-tokenizer \
--dataset-name random \
--random-input 1024\
--random-output 1024 \
--num-prompts 120 \
--request-rate 8 \
--output-file online.jsonl 2>&1 | tee "$LOGFILE"
# Stop profiling via API
curl http://localhost:30000/stop_profile -H "Content-Type: application/json"
# Convert tracing file to csv & json
sqlite3 trace.rpd ".mode csv" ".header on" ".output trace.csv" "select * from top;" ".output stdout"
python3 ./rocmProfileData/tools/rpd2tracing.py trace.rpd trace.json
```
7. Follow the [Perfetto docs](https://perfetto.dev/docs/visualization/large-traces) to visualize large JSON files. Adjust parameters so that the trace.json file size stays below 9GB.
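If the converted trace.json is still too large for Perfetto, the trace.rpd SQLite database can also be queried directly. Below is a minimal Python sketch (for illustration only, assuming the same "top" summary view that client.sh queries) that prints the first 20 rows of the summary:
```python
import sqlite3

# Open the RPD trace produced by loadTracer.sh and query its "top" summary view.
con = sqlite3.connect("trace.rpd")
con.row_factory = sqlite3.Row
for row in con.execute("SELECT * FROM top LIMIT 20"):
    print(dict(row))
con.close()
```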
### Profiling SGLang Infer System with PyTorch Profiler
Follow the steps below:
1. Apply the torch_profiler.patch. Note that you can modify "if self.tp_rank == 0" in the patch to record more ranks during profiling.
torch_profiler.patch
```bash
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 62d1ff9..6ecd78c 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -240,7 +240,6 @@ class Scheduler:
)
self.profiler = torch.profiler.profile(
activities=[
- torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
with_stack=True,
@@ -1033,9 +1032,11 @@ class Scheduler:
if self.profiler is None:
raise RuntimeError("Profiler is not enabled.")
self.profiler.stop()
- self.profiler.export_chrome_trace(
- self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
- )
+ if self.tp_rank == 0:
+ with open(f"stats_repro_{int(time.time())}.txt", "w") as f:
+ print(self.profiler.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=-1), file=f)
+ print("Profiling stats done.")
+
logger.info("Profiler is done")
```
2. Create the model path directory and copy it to the path referenced by "--model-path" if you want to use the server.sh file provided.
3. Modify the included server.sh by removing "loadTracer.sh" before the python command, then launch ./server.sh in one terminal inside the Docker container.
4. Similar to step 6 in the RPD profiling section, but remove the last 2 lines in client.sh, which convert the RPD file into CSV and JSON files. Run the modified client.sh for PyTorch profiling.
-------
- [Torch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)

27
3rdparty/amd/profiling/client.sh vendored Executable file
View File

@@ -0,0 +1,27 @@
#!/bin/bash
# Start profiling via API
curl http://localhost:30000/start_profile -H "Content-Type: application/json"
# Benchmark serving using sglang with random dataset and tokenizer
# Define the log file with a timestamp
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOGFILE="sglang_client_log_$TIMESTAMP.json"
# Run the benchmark with specified parameters and save logs
python3 -m sglang.bench_serving \
--backend sglang \
--tokenizer Xenova/grok-1-tokenizer \
--dataset-name random \
--random-input 1024\
--random-output 1024 \
--num-prompts 240 \
--request-rate 8 \
--output-file online.jsonl 2>&1 | tee "$LOGFILE"
# Stop profiling via API
curl http://localhost:30000/stop_profile -H "Content-Type: application/json"
# Convert tracing file to csv & json
sqlite3 trace.rpd ".mode csv" ".header on" ".output trace.csv" "select * from top;" ".output stdout"
python3 /sgl-workspace/rocmProfileData/tools/rpd2tracing.py trace.rpd trace.json

10
3rdparty/amd/profiling/install_rpd.sh vendored Normal file
View File

@@ -0,0 +1,10 @@
# download and install RPD
apt update && apt install -y sqlite3 libsqlite3-dev libfmt-dev
# install rpd module
git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData
cd rocmProfileData
git apply rpd.patch
make && make install
cd rocpd_python && python setup.py install && cd ..
cd rpd_tracer && make clean;make install && python setup.py install && cd ..

43
3rdparty/amd/profiling/loadTracer.sh vendored Executable file
View File

@@ -0,0 +1,43 @@
#!/bin/bash
################################################################################
# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
################################################################################
OUTPUT_FILE="trace.rpd"
if [ "$1" = "-o" ] ; then
OUTPUT_FILE=$2
shift
shift
fi
if [ -e ${OUTPUT_FILE} ] ; then
rm ${OUTPUT_FILE}
fi
python3 -m rocpd.schema --create ${OUTPUT_FILE}
if [ $? != 0 ] ; then
echo "Error: Could not create rpd file. Please run 'python setup.py install' from the rocpd_python dir"
exit
fi
export RPDT_FILENAME=${OUTPUT_FILE}
export RPDT_AUTOSTART=0
LD_PRELOAD=librocm-smi_64:librpd_tracer.so "$@"

12
3rdparty/amd/profiling/rpd.patch vendored Normal file
View File

@@ -0,0 +1,12 @@
diff --git a/rpd_tracer/Makefile b/rpd_tracer/Makefile
index e9d9feb..b2e9e1a 100644
--- a/rpd_tracer/Makefile
+++ b/rpd_tracer/Makefile
@@ -16,7 +16,7 @@ ifneq (,$(HIP_PATH))
$(info Building with roctracer)
RPD_LIBS += -L/opt/rocm/lib -lroctracer64 -lroctx64 -lamdhip64 -lrocm_smi64
RPD_INCLUDES += -I/opt/rocm/include -I/opt/rocm/include/roctracer -I/opt/rocm/include/hsa
- RPD_SRCS += RoctracerDataSource.cpp RocmSmiDataSource.cpp
+ RPD_SRCS += RoctracerDataSource.cpp
RPD_INCLUDES += -D__HIP_PLATFORM_AMD__
endif

View File

@@ -0,0 +1,49 @@
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 62d1ff9..9021c01 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -71,6 +71,8 @@ from sglang.srt.utils import (
suppress_other_loggers,
)
from sglang.utils import get_exception_traceback
+from rpdTracerControl import rpdTracerControl
+rpdTracerControl.skipCreate()
logger = logging.getLogger(__name__)
@@ -245,6 +247,7 @@ class Scheduler:
],
with_stack=True,
)
+ self.rpd = rpdTracerControl()
@torch.inference_mode()
def event_loop(self):
@@ -1027,15 +1030,24 @@ class Scheduler:
def start_profile(self) -> None:
if self.profiler is None:
raise RuntimeError("Profiler is not enabled.")
- self.profiler.start()
+ #self.profiler.start() #block pytorch profiler for rpd profiler enabling
+ if self.tp_rank == 0 or self.tp_rank == 1:
+ self.rpd.start()
+ self.rpd.rangePush("", "rpd profile range", "")
+ logger.info("rpd is enabled")
def stop_profile(self) -> None:
if self.profiler is None:
raise RuntimeError("Profiler is not enabled.")
- self.profiler.stop()
- self.profiler.export_chrome_trace(
- self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
- )
+ #self.profiler.stop()
+ #self.profiler.export_chrome_trace(
+ # self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+ #)
+ if self.tp_rank ==0 or self.tp_rank ==1:
+ self.rpd.rangePop()
+ self.rpd.stop()
+ self.rpd.flush()
+ logger.info("rpd is done")
logger.info("Profiler is done")

View File

@@ -0,0 +1,126 @@
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 62d1ff9..2edb427 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -71,6 +71,8 @@ from sglang.srt.utils import (
suppress_other_loggers,
)
from sglang.utils import get_exception_traceback
+from rpdTracerControl import rpdTracerControl
+rpdTracerControl.skipCreate()
logger = logging.getLogger(__name__)
@@ -245,6 +247,7 @@ class Scheduler:
],
with_stack=True,
)
+ self.rpd = rpdTracerControl()
@torch.inference_mode()
def event_loop(self):
@@ -1027,15 +1030,26 @@ class Scheduler:
def start_profile(self) -> None:
if self.profiler is None:
raise RuntimeError("Profiler is not enabled.")
- self.profiler.start()
+ #self.profiler.start()
+ logger.info("torch profiler is disabled")
+ if self.tp_rank == 0 or self.tp_rank == 1:
+ self.rpd.setPythonTrace(True)
+ self.rpd.start()
+ self.rpd.rangePush("", "scheduler", "")
+ logger.info("rpd is enabled inside scheduler profiling")
def stop_profile(self) -> None:
if self.profiler is None:
raise RuntimeError("Profiler is not enabled.")
- self.profiler.stop()
- self.profiler.export_chrome_trace(
- self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
- )
+ #self.profiler.stop()
+ #self.profiler.export_chrome_trace(
+ # self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+ #)
+ if self.tp_rank ==0 or self.tp_rank ==1:
+ self.rpd.rangePop()
+ self.rpd.stop()
+ self.rpd.flush()
+ logger.info("rpd is done inside scheduler")
logger.info("Profiler is done")
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index 2621ccd..181df85 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -58,6 +58,10 @@ from sglang.srt.sampling.sampling_params import SamplingParams
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import is_generation_model, is_multimodal_model
+from rpdTracerControl import rpdTracerControl
+rpdTracerControl.skipCreate()
+
+
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
logger = logging.getLogger(__name__)
@@ -514,10 +518,20 @@ class TokenizerManager:
self.send_to_scheduler.send_pyobj(req)
def start_profile(self):
+ rpd = rpdTracerControl()
+ rpd.setPythonTrace(True)
+ rpd.start()
+ rpd.rangePush("", "tokenizer_manager", "")
+ logger.info("tokenizer_manager rpd profiling started!")
req = ProfileReq.START_PROFILE
self.send_to_scheduler.send_pyobj(req)
def stop_profile(self):
+ rpd = rpdTracerControl()
+ rpd.rangePop()
+ rpd.stop()
+ rpd.flush()
+ logger.info("rpd profiling is done inside tokenizer_manager!")
req = ProfileReq.STOP_PROFILE
self.send_to_scheduler.send_pyobj(req)
diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
index 7111c93..2bd722c 100644
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -30,6 +30,8 @@ import threading
import time
from http import HTTPStatus
from typing import Dict, List, Optional, Union
+from rpdTracerControl import rpdTracerControl
+rpdTracerControl.skipCreate()
# Fix a bug of Python threading
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -152,6 +154,11 @@ async def flush_cache():
@app.post("/start_profile")
async def start_profile():
"""Start profiling."""
+ rpd = rpdTracerControl()
+ rpd.setPythonTrace(True)
+ rpd.start()
+ rpd.rangePush("", "server rpd profile range", "")
+ logger.info("rpd profiling started in server.py!")
tokenizer_manager.start_profile()
return Response(
content="Start profiling.\n",
@@ -164,6 +171,11 @@ async def start_profile():
async def stop_profile():
"""Stop profiling."""
tokenizer_manager.stop_profile()
+ rpd = rpdTracerControl()
+ rpd.rangePop()
+ rpd.stop()
+ rpd.flush()
+ logger.info("rpd profiling is done in server.py!")
return Response(
content="Stop profiling. This will take some time.\n",
status_code=200,

20
3rdparty/amd/profiling/server.sh vendored Executable file
View File

@@ -0,0 +1,20 @@
#!/bin/bash
# export SGLANG_TORCH_PROFILER_DIR=/data/sglang/
export SGLANG_TORCH_PROFILER_DIR=/sgl-workspace/sglang/profile/
# Get the current timestamp
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
# Define the log file with a timestamp
LOGFILE="sglang_server_log_$TIMESTAMP.json"
# Run the Python command and save the output to the log file
loadTracer.sh python3 -m sglang.launch_server \
--model-path /sgl-workspace/sglang/dummy_grok1 \
--tokenizer-path Xenova/grok-1-tokenizer \
--load-format dummy \
--quantization fp8 \
--tp 8 \
--port 30000 \
--disable-radix-cache 2>&1 | tee "$LOGFILE"

View File

@@ -0,0 +1,25 @@
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 62d1ff9..6ecd78c 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -240,7 +240,6 @@ class Scheduler:
)
self.profiler = torch.profiler.profile(
activities=[
- torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
with_stack=True,
@@ -1033,9 +1032,11 @@ class Scheduler:
if self.profiler is None:
raise RuntimeError("Profiler is not enabled.")
self.profiler.stop()
- self.profiler.export_chrome_trace(
- self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
- )
+ if self.tp_rank == 0:
+ with open(f"stats_repro_{int(time.time())}.txt", "w") as f:
+ print(self.profiler.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=-1), file=f)
+ print("Profiling stats done.")
+
logger.info("Profiler is done")

118
3rdparty/amd/tuning/TUNING.md vendored Normal file
View File

@@ -0,0 +1,118 @@
## Tuning SGLang Infer System with AMD GPUs
This AppNote describes SGLang performance tuning techniques, harness code, and running steps for systems with AMD Instinct GPUs.
Harness code, examples, and steps are provided in detail to make the results easy to reproduce and to help tune performance for specific workloads.
Four primary runtime areas are covered:
## 1. Triton Kernels
To maximize Triton kernel efficiency, several strategies can be employed:
### Key Environment Variables:
- **num_stages**: Adjusts the number of pipeline stages to optimize kernel efficiency based on the specific type of operations (e.g., General Matrix Multiplication - GEMM).
- **waves_per_eu**: Controls the usage of Vector General Purpose Registers (VGPR) to enhance occupancy, thereby improving latency or throughput.
- **BLOCK_M, BLOCK_N, BLOCK_K**: Tunable tile sizes that assist in balancing memory transfer and computational efficiency.
- **matrix_instr_nonkdim**: Optimizes the usage of Matrix-Fused Multiply-Add (MFMA) instructions for specific kernel types, such as Flash Attention.
- **OPTIMIZE_EPILOGUE**: An environment variable that can be set to `1` to enhance performance by eliminating the `convert_layout` operation in the kernel's epilogue.
```python
@triton.autotune(configs=[
triton.Config({'waves_per_eu': 1}, num_warps=4, num_stages=1),
triton.Config({'waves_per_eu': 1}, num_warps=8, num_stages=1),
triton.Config({'waves_per_eu': 1}, num_warps=16, num_stages=1),
triton.Config({'waves_per_eu': 2}, num_warps=4, num_stages=1),
triton.Config({'waves_per_eu': 2}, num_warps=8, num_stages=1),
triton.Config({'waves_per_eu': 2}, num_warps=16, num_stages=1),
triton.Config({'waves_per_eu': 4}, num_warps=4, num_stages=1),
triton.Config({'waves_per_eu': 4}, num_warps=8, num_stages=1),
triton.Config({'waves_per_eu': 4}, num_warps=16, num_stages=1),
], key=['BLOCK_N', 'NUM_TOKEN_BLKS'], use_cuda_graph=True)
@triton.jit
def _triton_kernel_function():
...
```
## 2. Torch Tunable Operations
**TunableOp** is a feature in PyTorch that allows for the definition and optimization of custom kernels with tunable parameters. This feature is particularly useful for enhancing the performance of kernels by experimenting with different configurations.
### Key Environment Variables:
1. **PYTORCH_TUNABLEOP_ENABLED**:
- Default: `0`
- Set to `1` to enable TunableOp.
2. **PYTORCH_TUNABLEOP_TUNING**:
- Default: `1`
   - Set to `0` to disable tuning. While tuning is enabled (and PYTORCH_TUNABLEOP_ENABLED is set), any op without a tuned entry runs a tuning step and the result is recorded.
3. **PYTORCH_TUNABLEOP_VERBOSE**:
- Default: `0`
- Set to `1` to enable verbose output for TunableOp.
### Usage Example:
To enable TunableOp and tuning, and optionally enable verbose mode, you can run the following command in your terminal:
```bash
#Tuning
PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=1 your_script.sh
#Inference with tuning op
PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 your_script.sh
#Print out the log
PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 PYTORCH_TUNABLEOP_VERBOSE=1 your_script.sh
```
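As a minimal Python sketch of the same idea (illustrative only; it assumes a ROCm build of PyTorch with TunableOp support and an available GPU), the environment variables can also be set by the launching process before the first GEMM is dispatched:
```python
import os

# Enable TunableOp and tuning before torch executes its first GEMM.
os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"
os.environ["PYTORCH_TUNABLEOP_TUNING"] = "1"

import torch

a = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
b = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
torch.matmul(a, b)  # the first GEMM of this shape triggers a tuning sweep; later calls reuse the selected solution
```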
## 3. Torch Compilation
The following are suggestions for optimizing matrix multiplication (GEMM) and convolution (conv) operations in PyTorch using Inductor, a part of the PyTorch compilation framework. The goal is to leverage Triton to achieve better performance.
To tune Triton kernels with GEMM and convolution ops (conv), use the `torch.compile` function with the max-autotune mode. This benchmarks a predefined list of Triton configurations and selects the fastest one for each shape.
### Key Configurations:
1. **Max Autotune**:
- Set `torch._inductor.config.max_autotune = True` or `TORCHINDUCTOR_MAX_AUTOTUNE=1`.
2. **Fine-Grained Control**:
- Enable GEMM tuning: `torch._inductor.config.max_autotune_gemm = True`.
   - Enable tuning for pointwise/reduction ops: `torch._inductor.config.max_autotune_pointwise = True`.
3. **Backend Selection**:
- Use `torch._inductor.max_autotune_gemm_backends` to limit backends to TRITON for better performance.
4. **Freezing for Inference**:
- Use `torch._inductor.config.freezing=True` to enable constant folding optimizations.
5. **Debugging**:
- Set `TORCH_COMPILE_DEBUG=1` to extract Triton kernels generated by Inductor.
### Example Code Block:
```bash
#Gemm Tuning
TORCHINDUCTOR_MAX_AUTOTUNE=1 TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1 your_script.sh
#Specify your backend to TRITON for Gemm Tuning
TORCHINDUCTOR_MAX_AUTOTUNE=1 TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS=TRITON your_script.sh
#Inference with large improvement on AMD GPU
TORCHINDUCTOR_FREEZING=1 your_script.sh
```
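For reference, a minimal Python sketch of the same configuration (illustrative only; it assumes a GPU is available and uses a toy `torch.nn.Linear` stand-in rather than a real SGLang model):
```python
import torch
import torch._inductor.config as inductor_config

# The knobs described above, set from Python instead of environment variables.
inductor_config.max_autotune = True
inductor_config.max_autotune_gemm_backends = "TRITON"  # restrict GEMM autotuning to the Triton backend
inductor_config.freezing = True                        # constant-folding optimizations for inference

model = torch.nn.Linear(4096, 4096, device="cuda", dtype=torch.float16)
compiled = torch.compile(model, mode="max-autotune")

x = torch.randn(8, 4096, device="cuda", dtype=torch.float16)
with torch.no_grad():
    out = compiled(x)
```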
## 4. Fused MOE kernel
To maximize MoE kernel efficiency, use the script below to find the best launch configuration.
### Key parameters:
- **--model**: the MoE model type to tune; it automatically determines d_model, model_intermediate_size, and num_layers.
- **--tp-size**: simulates the whole-model run configuration so that the TP-sharded dimension sizes are set correctly.
- **--batch**: the M dimension of the MoE kernel; for the prefill MoE kernel this is batch*input_len, and for the decode MoE kernel it is batch.
- **--dtype**: computation data type.
```bash
#Tuning
# For example, suppose we run: "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8".
# That run uses batch-size 32, input length 1024, and output length 8. From the MoE kernel's "--batch" point of view,
# the prefill batch is 32*1024 = 32768 and the decode batch is 32*1 = 32 (only one output token is generated per step).
# So we can tune the decode MoE with the command below
python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32"
# and use this command to tune prefill moe
python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32768"
```
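The tuning script writes the best configuration per batch size into a JSON file and prints the exact name via "writing config to file ...". A small illustrative sketch for inspecting that file:
```python
import json
import sys

# Usage: python inspect_moe_config.py <config_file_printed_by_the_tuning_script>
path = sys.argv[1]
with open(path) as f:
    configs = json.load(f)

# The file maps each tuned batch size (M) to its best launch configuration.
for batch, cfg in sorted(configs.items(), key=lambda kv: int(kv[0])):
    print(f"batch={batch}: {cfg}")
```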
## Reference
For more detailed information on tuning SGLang performance with AMD GPUs, please refer to the following link:
[ROCm Documentation: Triton Kernel Performance Optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#triton-kernel-performance-optimization)

View File

@@ -0,0 +1,380 @@
import argparse
import json
import os
import sys
import torch
import torch.nn.functional as F
import triton
import triton.language as tl
from tqdm import tqdm
from transformers import AutoConfig
from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
fused_moe,
get_config_file_name,
)
padding_size = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0
def main(model, tp_size, dtype: str, batches):
method = fused_moe
for bs in batches:
run_grid(int(bs), model=model, method=method, tp_size=tp_size, dtype=dtype)
def prune_configs(M, N, K, configs):
pruned_configs = []
elemBytes_a = 1  # [DV Note] Hard-coded element size of A in bytes (1 byte, i.e. fp8)
elemBytes_b = 1  # [DV Note] Hard-coded element size of B in bytes (1 byte, i.e. fp8)
mfma = 16 if M < 32 or N < 32 else 32
# TODO (zhanglx): figure out the boundary between large and small gemms
large_gemm = False
if M >= 2048 and N >= 2048:
large_gemm = True
for config in configs:
BLOCK_SIZE_M = config.get("BLOCK_SIZE_M")
BLOCK_SIZE_N = config.get("BLOCK_SIZE_N")
BLOCK_SIZE_K = config.get("BLOCK_SIZE_K")
num_warps = config.get("num_warps")
matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
# kpack = config.get("kpack")
if matrix_instr_nonkdim > mfma:
continue
if mfma == 4 and BLOCK_SIZE_K < 64:
continue
# some layouts could not work properly in case
# number elements per thread is less 1
if BLOCK_SIZE_M * BLOCK_SIZE_N < 64:
continue
SPLIT_K = 1 # config.get("SPLIT_K")
GROUP_M = config.get("GROUP_SIZE_M")
if matrix_instr_nonkdim > BLOCK_SIZE_M or matrix_instr_nonkdim > BLOCK_SIZE_N:
continue
if matrix_instr_nonkdim >= M and matrix_instr_nonkdim != BLOCK_SIZE_M:
continue
if matrix_instr_nonkdim >= N and matrix_instr_nonkdim != BLOCK_SIZE_N:
continue
# Skip BLOCK_SIZE that is too large compare to M/N
# unless BLOCK_SIZE is already small enough
if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16:
continue
if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16:
continue
# skip large split_k when not necessary
if SPLIT_K != 1 and not need_split_k(M, N, K):
continue
# skip split_k that leads to EVEN_K = false
leap = SPLIT_K * BLOCK_SIZE_K
modv = K % leap
if modv != 0:
continue
# skip large GROUP_M
if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1:
continue
# out of shared memory resource
# TODO (zhanglx): This does not consider the LDS usage in the epilogue
LDS = (
BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a
+ BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b
)
if LDS > 65536:
continue
# Skip small block sizes and num_warps for large gemm
# For fp16 and f8, we want to only use BLOCK_SIZE >= 64
if large_gemm:
if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64:
continue
if BLOCK_SIZE_K < 64:
continue
if num_warps < 4:
continue
pruned_configs.append(config)
return pruned_configs
def union_of_list_of_dicts(l1, l2):
result = []
temp_list = l1.copy()
temp_list.extend(l2)
for myDict in temp_list:
if myDict not in result:
result.append(myDict)
return result
def run_grid(bs, model, method, tp_size, dtype: str):
config = AutoConfig.from_pretrained(model)
top_k = config.num_experts_per_tok
d_model = config.hidden_size
model_intermediate_size = config.intermediate_size
num_layers = config.num_hidden_layers
hidden_states_dtype = config.torch_dtype
if config.num_experts_per_tok:
if config.architectures[0] == "Grok1ModelForCausalLM":
num_total_experts = config.num_experts
else:
num_total_experts = config.num_local_experts
else:
raise ValueError(f"Unsupported Mixtral model {model}")
# tp_size = 2
num_warmup_calls = 10
num_calls = 30
num_warmup_trials = 1
num_trials = 1
full_configs = []
block_m_range = [16, 32, 64, 128, 256]
block_n_range = [16, 32, 64, 128, 256]
block_k_range = [32, 64, 128, 256] # MUST >= 32
num_warps_range = [1, 2, 4, 8]
group_m_range = [1, 4, 8, 16, 32]
# For now we see better perf with num_stages=0 for all gemm configs we care
# But keep this explicit so that we do not forget we may need to set it to
# other values in the future
num_stage_range = [2]
waves_per_eu_range = [0, 1, 2, 4, 8]
# Remove 32 because of triton compiling error
matrix_instr_nonkdim_range = [16]
kpack_range = [1, 2]
for block_size_m in block_m_range:
for block_size_n in block_n_range:
for block_size_k in block_k_range:
for group_size_m in group_m_range:
for num_warps in num_warps_range:
for num_stages in num_stage_range:
for waves_per_eu in waves_per_eu_range:
for matrix_instr_nonkdim in matrix_instr_nonkdim_range:
for kpack in kpack_range:
full_configs.append(
{
"BLOCK_SIZE_M": block_size_m,
"BLOCK_SIZE_N": block_size_n,
"BLOCK_SIZE_K": block_size_k,
"GROUP_SIZE_M": group_size_m,
"num_warps": num_warps,
"num_stages": num_stages,
"waves_per_eu": waves_per_eu,
"matrix_instr_nonkdim": matrix_instr_nonkdim,
"kpack": kpack,
}
)
M1 = bs * 2
N1 = model_intermediate_size * 2 // tp_size
K1 = d_model
prune_configs_1 = prune_configs(M1, N1, K1, full_configs)
M2 = bs * 2
N2 = d_model
K2 = model_intermediate_size // tp_size
prune_configs_2 = prune_configs(M2, N2, K2, full_configs)
configs = union_of_list_of_dicts(prune_configs_1, prune_configs_2)
print(
f"{bs=} || {len(full_configs)=} | {len(prune_configs_1)=} | \
{len(prune_configs_2)=} | {len(configs)=}"
)
best_config = None
best_time_us = 1e20
print(f"{tp_size=} {bs=}")
for config in tqdm(configs):
# warmup
try:
print(config)
for _ in range(num_warmup_trials):
run_timing(
num_calls=num_warmup_calls,
bs=bs,
d_model=d_model,
num_total_experts=num_total_experts,
top_k=top_k,
tp_size=tp_size,
model_intermediate_size=model_intermediate_size,
method=method,
config=config,
dtype=dtype,
hidden_states_dtype=hidden_states_dtype,
)
except triton.runtime.autotuner.OutOfResources:
continue
# trial
for _ in range(num_trials):
kernel_dur_ms = run_timing(
num_calls=num_calls,
bs=bs,
d_model=d_model,
num_total_experts=num_total_experts,
top_k=top_k,
tp_size=tp_size,
model_intermediate_size=model_intermediate_size,
method=method,
config=config,
dtype=dtype,
hidden_states_dtype=hidden_states_dtype,
)
kernel_dur_us = 1000 * kernel_dur_ms
model_dur_ms = kernel_dur_ms * num_layers
if kernel_dur_us < best_time_us:
best_config = config
best_time_us = kernel_dur_us
tqdm.write(
f"{kernel_dur_us=:.1f} {model_dur_ms=:.1f}"
f" {bs=} {tp_size=} {top_k=} {num_total_experts=} "
f"{d_model=} {model_intermediate_size=} {num_layers=}"
)
print("best_time_us", best_time_us)
print("best_config", best_config)
# holds Dict[str, Dict[str, int]]
filename = get_config_file_name(
num_total_experts,
model_intermediate_size // tp_size,
"float8" if dtype == "float8" else None,
)
print(f"writing config to file {filename}")
existing_content = {}
if os.path.exists(filename):
with open(filename, "r") as f:
existing_content = json.load(f)
existing_content[str(bs)] = best_config
with open(filename, "w") as f:
json.dump(existing_content, f, indent=4)
f.write("\n")
def run_timing(
num_calls: int,
bs: int,
d_model: int,
num_total_experts: int,
top_k: int,
tp_size: int,
model_intermediate_size: int,
method,
config,
dtype: str,
hidden_states_dtype,
) -> float:
shard_intermediate_size = model_intermediate_size // tp_size
hidden_states = torch.rand(
(bs, d_model),
device="cuda:0",
dtype=hidden_states_dtype,
)
w1 = torch.rand(
(num_total_experts, 2 * shard_intermediate_size, d_model + padding_size),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
w2 = torch.rand(
(num_total_experts, d_model, shard_intermediate_size + padding_size),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
w1_scale = None
w2_scale = None
a1_scale = None
a2_scale = None
if dtype == "float8":
w1 = w1.to(torch.float8_e4m3fnuz)
w2 = w2.to(torch.float8_e4m3fnuz)
w1_scale = torch.ones(
num_total_experts, device=hidden_states.device, dtype=torch.float32
)
w2_scale = torch.ones(
num_total_experts, device=hidden_states.device, dtype=torch.float32
)
a1_scale = torch.ones(1, device=hidden_states.device, dtype=torch.float32)
a2_scale = torch.ones(1, device=hidden_states.device, dtype=torch.float32)
gating_output = F.softmax(
torch.rand(
(num_calls, bs, num_total_experts),
device=hidden_states.device,
dtype=torch.float32,
),
dim=-1,
)
##################################
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
for i in range(num_calls):
hidden_states = method(
hidden_states=hidden_states,
w1=w1,
w2=w2,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
gating_output=gating_output[0],
topk=top_k,
renormalize=True,
inplace=True,
override_config=config,
use_fp8=dtype == "float8",
)
end_event.record()
end_event.synchronize()
dur_ms = start_event.elapsed_time(end_event) / num_calls
return dur_ms
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="benchmark_mixtral_moe",
description="Benchmark and tune the fused_moe kernel",
)
parser.add_argument(
"--dtype",
type=str,
default="auto",
choices=["float8", "float16", "bfloat16"],
help="Data type used for fused_moe kernel computations",
)
parser.add_argument("--model", type=str, default="hpcai-tech/grok-1")
parser.add_argument("--tp-size", type=int, default=2, help="Tensor paralleli size")
parser.add_argument("-b", "--batches", type=str)
args = parser.parse_args()
batches = args.batches.split(",")
sys.exit(main(args.model, args.tp_size, args.dtype, batches))

201
LICENSE Normal file
View File

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

46
Makefile Normal file
View File

@@ -0,0 +1,46 @@
.PHONY: check-deps install-deps format update help
# Show help for each target
help:
@echo "Available targets:"
@grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
check-deps: ## Check and install required Python formatting dependencies
@command -v isort >/dev/null 2>&1 || (echo "Installing isort..." && pip install isort)
@command -v black >/dev/null 2>&1 || (echo "Installing black..." && pip install black)
install-deps: ## Install Python formatting tools (isort and black)
pip install isort black
format: check-deps ## Format modified Python files using isort and black
@echo "Formatting modified Python files..."
git diff --name-only --diff-filter=M | grep '\.py$$' | xargs -I {} sh -c 'isort {} && black {}'
FILES_TO_UPDATE = docker/Dockerfile.rocm \
python/pyproject.toml \
python/sglang/version.py \
docs/developer_guide/setup_github_runner.md \
docs/get_started/install.md \
docs/platforms/amd_gpu.md \
docs/platforms/ascend_npu.md \
benchmark/deepseek_v3/README.md
update: ## Update version numbers across project files. Usage: make update <new_version>
@if [ -z "$(filter-out $@,$(MAKECMDGOALS))" ]; then \
echo "Version required. Usage: make update <new_version>"; \
exit 1; \
fi
@OLD_VERSION=$$(grep "version" python/sglang/version.py | cut -d '"' -f2); \
NEW_VERSION=$(filter-out $@,$(MAKECMDGOALS)); \
echo "Updating version from $$OLD_VERSION to $$NEW_VERSION"; \
for file in $(FILES_TO_UPDATE); do \
if [ "$(shell uname)" = "Darwin" ]; then \
sed -i '' -e "s/$$OLD_VERSION/$$NEW_VERSION/g" $$file; \
else \
sed -i -e "s/$$OLD_VERSION/$$NEW_VERSION/g" $$file; \
fi \
done; \
echo "Version update complete"
%:
@:

78
README.md Normal file
View File

@@ -0,0 +1,78 @@
<div align="center" id="sglangtop">
<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
[![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
![PyPI - Downloads](https://static.pepy.tech/badge/sglang?period=month)
[![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
[![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
[![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/sgl-project/sglang)
</div>
--------------------------------------------------------------------------------
| [**Blog**](https://lmsys.org/blog/2025-05-05-large-scale-ep/)
| [**Documentation**](https://docs.sglang.ai/)
| [**Join Slack**](https://slack.sglang.ai/)
| [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
| [**Roadmap**](https://github.com/sgl-project/sglang/issues/7736)
| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
## News
- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
- [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
- [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
- [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
- [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
- [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
- [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
- [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
<details>
<summary>More</summary>
- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
- [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
- [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
- [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
- [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
- [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
</details>
## About
SGLang is a fast serving framework for large language models and vision language models.
It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
The core features include:
- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.
## Getting Started
- [Install SGLang](https://docs.sglang.ai/get_started/install.html)
- [Quick Start](https://docs.sglang.ai/basic_usage/send_request.html)
- [Backend Tutorial](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
- [Frontend Tutorial](https://docs.sglang.ai/references/frontend/frontend_tutorial.html)
- [Contribution Guide](https://docs.sglang.ai/developer_guide/contribution_guide.html)
## Benchmark and Performance
Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/), [Large-scale expert parallelism](https://lmsys.org/blog/2025-05-05-large-scale-ep/).
## Roadmap
[Development Roadmap (2025 H2)](https://github.com/sgl-project/sglang/issues/7736)
## Adoption and Sponsorship
SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 1,000,000 GPUs worldwide.
<img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
## Contact Us
For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
## Acknowledgment
We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).

assets/logo.png Normal file (binary image added; 393 KiB, content not shown)
assets/logo.svg Normal file (1 line added; 65 KiB, diff suppressed because the line is too long)
assets/logo_square.svg Normal file (1 line added; 51 KiB, diff suppressed because the line is too long)

View File

@@ -0,0 +1,250 @@
import argparse
import torch
import triton
from sglang.srt.layers.attention.triton_ops.decode_attention import (
decode_attention_fwd_grouped,
)
from sglang.srt.layers.attention.triton_ops.extend_attention import extend_attention_fwd
# gpt oss
head_num = 64
head_dim = 64
head_kv_num = 8
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["S"], # sequence length on x-axis
x_vals=[128, 256, 512, 1024, 2048, 4096],
x_log=True,
line_arg="B", # batch size as different lines
line_vals=[1, 8, 32, 128],
line_names=["B=1", "B=8", "B=32", "B=128"],
styles=[
("blue", "-"),
("green", "-"),
("red", "-"),
("cyan", "-"),
],
ylabel="TFLOPS",
plot_name="attention-sink-triton-decode",
args={},
)
)
def benchmark_decode(B, S, H_Q, H_KV, D):
D_V = D
dtype = torch.bfloat16
seq_len = S
total_tokens = B * seq_len
device = torch.device("cuda")
sm_scale = 1.0 / (D**0.5)
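# Split-KV decode: the grouped kernel processes each request's KV cache in
# num_kv_splits chunks (bounded by max_kv_splits) and reduces the partial
# results stored in attn_logits1 / attn_lse1.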
max_kv_splits = 8
num_kv_splits = torch.full((B,), 4, dtype=torch.int32, device="cuda")
# q represents the new token being generated, one per batch
q = torch.randn(B, H_Q, D, dtype=dtype, device="cuda")
# k_buffer and v_buffer represent all previous tokens
k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
o = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda")
b_seq_len = torch.full((B,), seq_len, device="cuda")
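# CSR-style layout: kv_indptr[i] : kv_indptr[i + 1] selects request i's token
# rows in k_buffer / v_buffer through kv_indices.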
kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len, dim=0)
kv_indices = torch.arange(total_tokens, device="cuda")
attn_logits1 = torch.empty(
(B, H_Q, max_kv_splits, D_V),
dtype=torch.float32,
device="cuda",
)
attn_lse1 = torch.empty(
(B, H_Q, max_kv_splits, D_V),
dtype=torch.float32,
device="cuda",
)
sink = torch.randn(H_Q, device=device, dtype=torch.float32)
# warmup
for _ in range(5):
decode_attention_fwd_grouped(
q,
k_buffer,
v_buffer,
o,
kv_indptr,
kv_indices,
attn_logits1,
attn_lse1,
num_kv_splits,
max_kv_splits,
sm_scale,
logit_cap=0.0,
sinks=sink,
)
# benchmark
run_step = 500
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
for _ in range(run_step):
decode_attention_fwd_grouped(
q,
k_buffer,
v_buffer,
o,
kv_indptr,
kv_indices,
attn_logits1,
attn_lse1,
num_kv_splits,
max_kv_splits,
sm_scale,
logit_cap=0.0,
sinks=sink,
)
end_event.record()
end_event.synchronize()
torch.cuda.synchronize()
ms = start_event.elapsed_time(end_event) / run_step
tflops = lambda ms: (2 * B * S * H_Q * D) * 1e-9 / ms # must be causal
return tflops(ms)
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["S"], # sequence length on x-axis
x_vals=[128, 256, 512, 1024, 2048, 4096],
x_log=True,
line_arg="B", # batch size as different lines
line_vals=[1, 8, 32, 128],
line_names=["B=1", "B=8", "B=32", "B=128"],
styles=[
("blue", "-"),
("green", "-"),
("red", "-"),
("cyan", "-"),
],
ylabel="TFLOPS",
plot_name="attention-sink-triton-extend",
args={},
)
)
def benchmark_extend(B, S, H_Q, H_KV, D):
# S here represents N_CTX from the test
dtype = torch.bfloat16
device = "cuda"
# Split S into prefix and extend lengths
prefill_len = S // 2 # Similar to test's N_CTX // 2
extend_len = S // 4 # Make extend length smaller than prefix
# Calculate total tokens and extend tokens
total_extend_tokens = B * extend_len
total_prefix_tokens = B * prefill_len
# Create query, key, value tensors for extension
q_extend = torch.randn(total_extend_tokens, H_Q, D, dtype=dtype, device=device)
k_extend = torch.randn(total_extend_tokens, H_KV, D, dtype=dtype, device=device)
v_extend = torch.randn(total_extend_tokens, H_KV, D, dtype=dtype, device=device)
o_extend = torch.empty_like(q_extend)
# Create key-value buffers for prefix
k_buffer = torch.randn(total_prefix_tokens, H_KV, D, dtype=dtype, device=device)
v_buffer = torch.randn(total_prefix_tokens, H_KV, D, dtype=dtype, device=device)
# Create index pointers
qo_indptr = torch.arange(0, (B + 1) * extend_len, extend_len, device=device).to(
torch.int32
)
kv_indptr = torch.arange(0, (B + 1) * prefill_len, prefill_len, device=device).to(
torch.int32
)
kv_indices = torch.arange(0, total_prefix_tokens, device=device).to(torch.int32)
sm_scale = 1.0 / (D**0.5)
# sliding_window = 128 # From GPT-OSS config, skip for now
sliding_window = -1
sink = torch.randn(H_Q, device=device, dtype=torch.float32)
# warmup
for _ in range(5):
extend_attention_fwd(
q_extend,
k_extend,
v_extend,
o_extend,
k_buffer,
v_buffer,
qo_indptr,
kv_indptr,
kv_indices,
custom_mask=None,
is_causal=True,
mask_indptr=None,
max_len_extend=extend_len,
sm_scale=sm_scale,
sliding_window_size=sliding_window,
sinks=sink,
)
# benchmark
run_step = 500
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
for _ in range(run_step):
extend_attention_fwd(
q_extend,
k_extend,
v_extend,
o_extend,
k_buffer,
v_buffer,
qo_indptr,
kv_indptr,
kv_indices,
custom_mask=None,
is_causal=True,
mask_indptr=None,
max_len_extend=extend_len,
sm_scale=sm_scale,
sliding_window_size=sliding_window,
sinks=sink,
)
end_event.record()
end_event.synchronize()
torch.cuda.synchronize()
ms = start_event.elapsed_time(end_event) / run_step
# FLOPS calculation: each attention operation requires 2 multiplications per element
total_flops = 2 * total_extend_tokens * H_Q * (prefill_len + extend_len / 2) * D
tflops = lambda ms: total_flops * 1e-12 / (ms * 1e-3) # convert to TFLOPS
return tflops(ms)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--bench", type=str, default="all", help="all, extend, decode")
args = parser.parse_args()
kwargs = {
"H_Q": head_num,
"H_KV": head_kv_num,
"D": head_dim,
}
if args.bench in ["all", "decode"]:
benchmark_decode.run(print_data=True, show_plots=False, **kwargs)
if args.bench in ["all", "extend"]:
benchmark_extend.run(print_data=True, show_plots=False, **kwargs)
print("Benchmark finished!")

View File

@@ -0,0 +1,130 @@
# Benchmark with lots of common prefixes. Used to benchmark prefix caching performance.
#
# Launch a server:
# python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --log-level-http warning
import random
import string
import time
from tqdm import tqdm
from transformers import AutoTokenizer
import sglang as sgl
from sglang import set_default_backend
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
def generate_random_string(token_length: int) -> str:
random_string = "".join(
random.choices(string.ascii_letters + string.digits, k=token_length * 100)
)
tokenized_output = tokenizer.encode(random_string, add_special_tokens=False)[
:token_length
]
if len(tokenized_output) < token_length:
tokenized_output = tokenized_output + [tokenizer.pad_token_id] * (
token_length - len(tokenized_output)
)
decoded_string = tokenizer.decode(tokenized_output, skip_special_tokens=False)
return decoded_string
def generate_unique_prefix(base_text, index):
return str(index) + base_text[len(str(index)) :]
@sgl.function
def text_qa(s, question, gen_len):
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n", temperature=0, max_tokens=gen_len)
def prepare_prompts(num_prefix, num_samples_per_prefix, prefix_length, suffix_length):
base_prefix = generate_random_string(prefix_length)
tot_input_len = 0
all_prompts = []
for i in tqdm(range(num_prefix), desc="prepare prompts"):
unique_prefix = generate_unique_prefix(base_prefix, i)
prompt_list = []
for j in range(num_samples_per_prefix):
suffix = generate_random_string(suffix_length)
prompt = unique_prefix + suffix
prompt_list.append(prompt)
tot_input_len += len(tokenizer.encode(prompt))
all_prompts.append(prompt_list)
return all_prompts, tot_input_len
def test_batch_by_batch(all_prompts, gen_len):
backend.flush_cache()
tot_time = 0
for i in range(len(all_prompts)):
tic = time.perf_counter()
text_qa.run_batch(
list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))),
)
tot_time += time.perf_counter() - tic
return tot_time
def test_batch_by_batch_with_hint(all_prompts, gen_len):
backend.flush_cache()
tot_time = 0
for i in range(len(all_prompts)):
tic = time.perf_counter()
# Send a hint to cache the prefix
text_qa.run_batch(list(zip(all_prompts[i][:1], [gen_len])))
# Send the batch
text_qa.run_batch(list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))))
tot_time += time.perf_counter() - tic
return tot_time
def test_send_all(all_prompts, gen_len):
backend.flush_cache()
all_prompts = [x for prompt_list in all_prompts for x in prompt_list]
tic = time.perf_counter()
text_qa.run_batch(
list(zip(all_prompts, [gen_len] * len(all_prompts))),
)
tot_time = time.perf_counter() - tic
return tot_time
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
backend = RuntimeEndpoint("http://127.0.0.1:30000")
set_default_backend(backend)
random.seed(0)
num_prefix = 10
num_samples_per_prefix = 32
prefix_length = 1024
suffix_length = 128
gen_len = 1
all_prompts, tot_input_len = prepare_prompts(
num_prefix, num_samples_per_prefix, prefix_length, suffix_length
)
print(f"Total input token length: {tot_input_len}\n")
cost = test_batch_by_batch(all_prompts, gen_len)
print(f"Latency of test_batch_by_batch : {cost:.4f} s\n")
cost = test_batch_by_batch_with_hint(all_prompts, gen_len)
print(f"Latency of test_batch_by_batch_with_hint: {cost:.4f} s\n")
cost = test_send_all(all_prompts, gen_len)
print(f"Latency of test_send_all : {cost:.4f} s\n")

View File

@@ -0,0 +1,193 @@
import concurrent.futures
import os
import random
import time
from concurrent.futures import ProcessPoolExecutor
from statistics import mean
import requests
from tqdm import tqdm
from transformers import AutoTokenizer
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
###############################################################################
# CONFIG
###############################################################################
ENDPOINT_URL = "http://127.0.0.1:30000"
TOKENIZER_DIR = "/models/meta-llama/Llama-3.2-3B"
# Benchmark configurations
NUM_REQUESTS = 10 # Total number of requests (each with BATCH_SIZE prompts)
NUM_TOKENS = 32000 # Tokens per prompt
BATCH_SIZE = 8 # Number of prompts per request
GEN_TOKENS = 0 # Tokens to generate per prompt
###############################################################################
# REQUEST GENERATION (in parallel)
###############################################################################
def generate_random_prompt(index, tokenizer_dir, num_tokens):
"""Generate a single random prompt with specified token count."""
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
vocab_size = tokenizer.vocab_size
def generate_random_text(num_toks):
random_token_ids = [random.randint(0, vocab_size - 1) for _ in range(num_toks)]
return tokenizer.decode(random_token_ids, clean_up_tokenization_spaces=True)
random_text = generate_random_text(num_tokens)
return f"Prompt {index}: {random_text}"
def prepare_all_prompts(num_requests, batch_size, num_tokens, tokenizer_dir):
"""Generate prompts for all requests in parallel."""
total_prompts = num_requests * batch_size
all_prompts = [None] * total_prompts
max_workers = min(os.cpu_count() or 1, total_prompts)
with ProcessPoolExecutor(max_workers=max_workers) as executor:
futures = [
executor.submit(generate_random_prompt, i, tokenizer_dir, num_tokens)
for i in range(total_prompts)
]
for future in tqdm(
concurrent.futures.as_completed(futures),
total=total_prompts,
desc="Generating prompts",
):
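# Futures were submitted in prompt order, so .index() maps each completed
# future back to its original prompt slot (an O(n) lookup, fine at this scale).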
index = futures.index(future)
all_prompts[index] = future.result()
batched_prompts = [
all_prompts[i * batch_size : (i + 1) * batch_size] for i in range(num_requests)
]
print(
f"Generated {total_prompts} prompts with {num_tokens} tokens each, grouped into {num_requests} requests of {batch_size} prompts.\n"
)
return batched_prompts
###############################################################################
# HTTP CALLS
###############################################################################
def send_batch_request(endpoint, prompts, gen_tokens, request_id):
"""Send a batch of prompts to the /generate endpoint synchronously."""
sampling_params = {
"max_new_tokens": gen_tokens,
"temperature": 0.7,
"stop": "\n",
}
data = {"text": prompts, "sampling_params": sampling_params}
start_time = time.perf_counter()
try:
response = requests.post(
endpoint.base_url + "/generate", json=data, timeout=3600
)
if response.status_code != 200:
error = response.json()
raise RuntimeError(f"Request {request_id} failed: {error}")
result = response.json()
elapsed_time = (time.perf_counter() - start_time) * 1000 # Convert to ms
avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
except Exception as e:
print(f"[Request] Error for request {request_id}: {e}")
return request_id, 0, 0, False, len(prompts)
def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
"""Run the benchmark sequentially."""
results = []
num_requests = len(batched_prompts)
# Record start time for total latency
benchmark_start_time = time.perf_counter()
for i, batch_prompts in enumerate(batched_prompts):
request_id = i + 1
assert (
len(batch_prompts) == batch_size
), f"Request {request_id} should have {batch_size} prompts, got {len(batch_prompts)}"
print(
f"[Request] Sending request {request_id}/{num_requests} with {len(batch_prompts)} prompts at {int(time.time()*1000)}"
)
result = send_batch_request(endpoint, batch_prompts, gen_tokens, request_id)
results.append(result)
# Calculate total latency
total_latency = (time.perf_counter() - benchmark_start_time) * 1000 # Convert to ms
return results, total_latency
###############################################################################
# RESULTS
###############################################################################
def process_results(results, total_latency, num_requests):
"""Process and display benchmark results."""
total_time = 0
successful_requests = 0
failed_requests = 0
request_latencies = []
per_prompt_latencies = []
total_prompts = 0
for request_id, elapsed_time, avg_per_prompt, success, batch_size in results:
if success:
successful_requests += 1
total_prompts += batch_size
request_latencies.append(elapsed_time)
per_prompt_latencies.append(avg_per_prompt)
total_time += elapsed_time / 1000 # Convert to seconds
else:
failed_requests += 1
avg_request_latency = mean(request_latencies) if request_latencies else 0
avg_per_prompt_latency = mean(per_prompt_latencies) if per_prompt_latencies else 0
throughput = total_prompts / total_time if total_time > 0 else 0
print("\nBenchmark Summary:")
print(f" Total requests sent: {len(results)}")
print(f" Total prompts sent: {total_prompts}")
print(f" Successful requests: {successful_requests}")
print(f" Failed requests: {failed_requests}")
print(f" Total latency (all requests): {total_latency:.2f} ms")
print(f" Avg per request latency: {avg_request_latency:.2f} ms")
print(f" Avg per prompt latency: {avg_per_prompt_latency:.2f} ms")
print(f" Throughput: {throughput:.2f} prompts/second\n")
###############################################################################
# MAIN
###############################################################################
def main():
# Initialize endpoint
endpoint = RuntimeEndpoint(ENDPOINT_URL)
# Generate prompts
batched_prompts = prepare_all_prompts(
NUM_REQUESTS, BATCH_SIZE, NUM_TOKENS, TOKENIZER_DIR
)
# Flush cache before benchmark
# endpoint.flush_cache()
# Run benchmark
print(
f"Starting benchmark: NUM_TOKENS={NUM_TOKENS}, BATCH_SIZE={BATCH_SIZE}, NUM_REQUESTS={NUM_REQUESTS}\n"
)
results, total_latency = run_benchmark(
endpoint, batched_prompts, BATCH_SIZE, GEN_TOKENS
)
# Process and display results
process_results(results, total_latency, NUM_REQUESTS)
if __name__ == "__main__":
random.seed(0)
main()

View File

@@ -0,0 +1,126 @@
import random
import time
from statistics import mean
from transformers import AutoTokenizer
# CONFIG
TOKENIZER_DIR = (
"/shared/public/sharing/fait360brew/training/models/meta-llama/Llama-3.2-3B"
)
NUM_TOKENS = 20000 # Each prompt should contain this many tokens
BATCH_SIZES = [1, 2, 4, 8] # Test different batch sizes
NUM_RUNS = 5 # Number of runs for each batch size to get reliable measurements
def generate_random_prompts(num_prompts, num_tokens, tokenizer):
"""Generate random prompts with specified token count."""
vocab_size = tokenizer.vocab_size
all_prompts = []
print(f"Generating {num_prompts} random prompts with {num_tokens} tokens each...")
for i in range(num_prompts):
# Generate random token IDs - this directly gives us the exact token count
random_token_ids = [
random.randint(0, vocab_size - 1) for _ in range(num_tokens)
]
random_text = tokenizer.decode(
random_token_ids, clean_up_tokenization_spaces=True
)
prompt = f"Prompt {i}: {random_text}"
tokens = tokenizer.encode(prompt)
print(f" Prompt {i}: {len(tokens)} tokens")
all_prompts.append(prompt)
return all_prompts
def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
"""Compare sequential vs batch tokenization for a given batch size."""
# Sequential tokenization using encode()
sequential_times = []
for run in range(NUM_RUNS):
batch_prompts = prompts[:batch_size] # Use same prompts for fair comparison
start_time = time.perf_counter()
for prompt in batch_prompts:
tokens = tokenizer.encode(prompt)
sequential_time = (time.perf_counter() - start_time) * 1000
sequential_times.append(sequential_time)
# Batch tokenization using tokenizer()
batch_times = []
for run in range(NUM_RUNS):
batch_prompts = prompts[:batch_size] # Use same prompts for fair comparison
start_time = time.perf_counter()
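# Passing the whole list encodes all prompts in one batched tokenizer call.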
tokens = tokenizer(batch_prompts)
batch_time = (time.perf_counter() - start_time) * 1000
batch_times.append(batch_time)
return {
"batch_size": batch_size,
"avg_sequential_ms": mean(sequential_times),
"avg_batch_ms": mean(batch_times),
"speedup_factor": (
mean(sequential_times) / mean(batch_times) if mean(batch_times) > 0 else 0
),
"sequential_runs": sequential_times,
"batch_runs": batch_times,
}
def main():
print("Tokenizer Benchmark: Sequential vs Batch Processing")
print("-" * 60)
print(f"Tokenizer: {TOKENIZER_DIR}")
print(f"Tokens per prompt: {NUM_TOKENS}")
print(f"Number of runs per batch size: {NUM_RUNS}")
print("-" * 60)
# Load tokenizer once for all operations
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
# The largest batch size determines how many prompts we need
max_batch_size = max(BATCH_SIZES)
all_prompts = generate_random_prompts(max_batch_size, NUM_TOKENS, tokenizer)
results = []
print("\nRunning benchmark...")
for batch_size in BATCH_SIZES:
print(f"\nBenchmarking batch size: {batch_size}")
result = benchmark_sequential_vs_batch(all_prompts, batch_size, tokenizer)
results.append(result)
print(f" Sequential tokenization (encode):")
for i, run_time in enumerate(result["sequential_runs"]):
print(f" Run {i+1}: {run_time:.2f} ms")
print(f" Average: {result['avg_sequential_ms']:.2f} ms")
print(f" Batch tokenization (tokenizer):")
for i, run_time in enumerate(result["batch_runs"]):
print(f" Run {i+1}: {run_time:.2f} ms")
print(f" Average: {result['avg_batch_ms']:.2f} ms")
print(f" Speedup factor: {result['speedup_factor']:.2f}x")
print("\n" + "=" * 60)
print("SUMMARY OF RESULTS")
print("=" * 60)
print(
f"{'Batch Size':<10} {'Sequential (ms)':<18} {'Batch (ms)':<18} {'Speedup':<10}"
)
print("-" * 60)
for result in results:
print(
f"{result['batch_size']:<10} {result['avg_sequential_ms']:.2f} ms{' ' * 8} {result['avg_batch_ms']:.2f} ms{' ' * 8} {result['speedup_factor']:.2f}x"
)
if __name__ == "__main__":
random.seed(0)
main()

View File

@@ -0,0 +1,89 @@
## How to reproduce the benchmark results for SGLang v0.3.0 compared to vLLM v0.6.0
In short, with multi-step scheduling enabled, in the online scenarios we benchmarked, the median TTFT of vLLM is **3 times** that of SGLang, and the median ITL is **10 times** that of SGLang (lower is better for both). In other words, vLLM's multi-step optimization does not improve throughput while keeping median TTFT and ITL low. Also, in the maximum-throughput benchmark, if vLLM keeps its default configuration instead of separately setting GPU memory utilization to 0.95, its maximum throughput is **lower** than that of SGLang.
## Online benchmark results
### Llama 3.1 8B Instruct 1 x A100 80G
| RPS | Num prompts | Engine | Median E2E Latency | Median TTFT | Median TPOT | Median ITL |
|------|-------------|--------|--------------------|-------------|-------------|------------|
| 4 | 1200 | SGLang | 1564.17 | **31.98** | 13.17 | **11.93** |
| 4 | 1200 | vLLM | 1691.97 | **100.48** | 14.14 | **129.32** |
| 8 | 2400 | SGLang | 2175.02 | **35.68** | 17.85 | **14.41** |
| 8 | 2400 | vLLM | 2137.16 | **120.39** | 17.09 | **158.63** |
### Llama 3.1 70B Instruct 4 x H100 80G
| RPS | Num Prompts | Engine | Median E2E Latency | Median TTFT | Median TPOT | Median ITL |
|------|-------------|--------|--------------------|-------------|-------------|------------|
| 4 | 1200 | SGLang | 3005.24 | **53.94** | 25.03 | **21.67** |
| 4 | 1200 | vLLM | 2915.60 | **179.15** | 23.58 | **231.23** |
| 8 | 2400 | SGLang | 4064.98 | **58.11** | 33.07 | **24.45** |
| 8 | 2400 | vLLM | 3752.38 | **207.12** | 29.15 | **275.32** |
## Offline benchmark results
### Llama 3.1 8B Instruct 1 x A100 80G
| RPS | Num Prompts | Engine | Request throughput | Output token throughput |
|------|-------------|--------|--------------------|-------------------------|
| inf | 5000 | SGLang | 22.03 | **4281.51** |
| inf | 5000 | vLLM | 21.27 | **4132.37** |
### Llama 3.1 70B Instruct 4 x H100 80G
| RPS | Num Prompts | Engine | Request throughput | Output token throughput |
|------|-------------|--------|--------------------|-------------------------|
| inf | 5000 | SGLang | 19.84 | **3856.01** |
| inf | 5000 | vLLM | 19.04 | **3700.64** |
## Installation
```bash
# install sglang v0.3.0
pip install --upgrade pip
pip install "sglang[all]"==0.3.0
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
# install vllm v0.6.0
pip install vllm==0.6.0
```
## Notes
We referred to the reproduction method in https://github.com/vllm-project/vllm/issues/8176 and added the `--num-scheduler-steps 10` parameter when starting the vLLM server. vLLM's `gpu_memory_utilization` defaults to 0.9 at both TP 1 and TP 4, while SGLang's `mem_frac` defaults to 0.88 at TP 1 and 0.85 at TP 4, so we manually set SGLang's `mem_frac` to 0.88 at TP 4 as well.
## Online benchmarks
```bash
# Llama 3.1 8B Instruct on 1 x A100
python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
# Llama 3.1 70B Instruct on 4 x H100
python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4
python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
# bench serving
python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 1200 --request-rate 4
python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 2400 --request-rate 8
python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 1200 --request-rate 4
python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 2400 --request-rate 8
```
## Offline benchmarks
```bash
# Llama 3.1 8B Instruct on 1 x A100
python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
# Llama 3.1 70B Instruct on 4 x H100
python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4 --mem-frac 0.88
python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
# bench serving
python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 5000
python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 5000
```

View File

@@ -0,0 +1,24 @@
# Create dummy weights:
# 1. Create a folder `~/llama-3.1-405b-fp8-dummy` and create `config.json` and tokenizer under this folder.
# 2. Get `config.json` from ./config.md
# 3. Download the tokenizer
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
# Launch sglang
# python -m sglang.launch_server --model-path ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quantization fp8 --disable-radix --mem-frac 0.87
# offline
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > sglang_log12
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > sglang_log13
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > sglang_log14
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > sglang_log15
python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 2000 > sglang_log21
# online
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > sglang_log31
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35

View File

@@ -0,0 +1,17 @@
# Launch trtllm
# https://github.com/sgl-project/tensorrt-demo
# offline
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log11
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log12
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log13
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log14
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log15
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 2000 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log21
# online
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log31
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log32
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log33
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log34
python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log35

View File

@@ -0,0 +1,24 @@
# Create dummy weights:
# 1. Create a folder `~/llama-3.1-405b-fp8-dummy` and create `config.json` and tokenizer under this folder.
# 2. Get `config.json` from ./config.md
# 3. Download the tokenizer
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
# wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
# Launch vllm
# python3 -m vllm.entrypoints.openai.api_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --disable-log-requests --tensor-parallel-size 8 --max-model-len 10000
# offline
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > vllm_log11
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > vllm_log12
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > vllm_log13
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > vllm_log14
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > vllm_log15
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 2000 > vllm_log21
# online
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > vllm_log31
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > vllm_log32
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > vllm_log33
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > vllm_log34
python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > vllm_log35

View File

@@ -0,0 +1,164 @@
# How to reproduce the benchmark results of SGLang
## Prerequisite
### Install the latest SGLang
```bash
git clone https://github.com/sgl-project/sglang.git
cd sglang
git checkout v0.2.7
pip install --upgrade pip
pip install -e "python[all]"
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
```
### Set up ulimit and HF_TOKEN
```bash
ulimit -n 65535
# Change the token to a real and usable one, with access permissions for the Llama 3 models.
export HF_TOKEN=hf_token
```
### Launch the server
```bash
# Meta-Llama-3.1-8B-Instruct
python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
# Meta-Llama-3.1-70B-Instruct
python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 8
# Meta-Llama-3-70B-Instruct-FP8
python -m sglang.launch_server --model-path neuralmagic/Meta-Llama-3-70B-Instruct-FP8 --disable-radix-cache --tp 8
```
## Benchmark
### Hardware Requirements
- 8B models: Single NVIDIA A100 80GB GPU
- 70B models: 8 x NVIDIA A100 80GB GPUs with Tensor Parallelism (TP) 8
- 70B FP8 models: 8 x NVIDIA H100 GPUs with Tensor Parallelism (TP) 8
Please ensure you have the appropriate hardware before running the benchmarks.
#### Offline benchmark
```bash
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 3000 --output-file offline.jsonl
cat offline.jsonl | cut -d':' -f12 | cut -d',' -f1
```
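If the field positions in the `cut` pipeline above ever shift, the same metric can be pulled out by parsing the JSONL records directly. A minimal sketch (the `output_throughput` key name is an assumption about the `bench_serving` output format):
```python
import json

with open("offline.jsonl") as f:
    for line in f:
        record = json.loads(line)
        # Print the output token throughput recorded for each run (key name assumed).
        print(record.get("output_throughput"))
```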
#### Online benchmark
```bash
python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online.jsonl
python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online.jsonl
cat online.jsonl | cut -d':' -f9 | cut -d',' -f1
```
## Other
We tried vLLM 0.5.3.post1, but it often crashes under high load and, in our partial benchmarking, performs similarly to or worse than vLLM 0.5.2, so we use the older vLLM 0.5.2 instead.
Preparation for TensorRT LLM can refer to https://github.com/sgl-project/tensorrt-demo. Specifically, we used a batch size of 512, a max input length of 8192, and a max number of tokens of 8192. The instance count for preprocessing and postprocessing in Triton Server is 16.
```bash
# vLLM
pip install vllm==0.5.2
pip install jsonschema==4.21.1
# Meta-Llama-3-8B-Instruct
python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-8B-Instruct --disable-log-requests
# meta-llama/Meta-Llama-3-70B-Instruct
python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B-Instruct --disable-log-requests --tensor 8
# neuralmagic/Meta-Llama-3-70B-Instruct-FP8
python -m vllm.entrypoints.openai.api_server --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 --disable-log-requests --tensor 8
```
```bash
wget https://raw.githubusercontent.com/sgl-project/sglang/main/python/sglang/bench_serving.py
```
```bash
# vLLM Offline
python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name sharegpt --num-prompts 3000 --output-file offline_vllm.jsonl
cat offline_vllm.jsonl | cut -d':' -f12 | cut -d',' -f1
```
```bash
# vLLM Online
python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_vllm.jsonl
python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_vllm.jsonl
cat online_vllm.jsonl | cut -d':' -f9 | cut -d',' -f1
```
```bash
# TensorRT LLM Offline 8B
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_trt_8b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_trt_8b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_trt_8b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_trt_8b.jsonl
python3 bench_serving.py --backend trt --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_trt_8b.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name sharegpt --num-prompts 3000 --output-file offline_trt_8b.jsonl
cat offline_trt_8b.jsonl | cut -d':' -f12 | cut -d',' -f1
```
```bash
# TensorRT LLM Online 8B
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_trt_8b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_trt_8b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_trt_8b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_trt_8b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_trt_8b.jsonl
cat online_trt_8b.jsonl | cut -d':' -f9 | cut -d',' -f1
```
```bash
# TensorRT LLM Offline 70B
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_trt_70b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_trt_70b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_trt_70b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_trt_70b.jsonl
python3 bench_serving.py --backend trt --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_trt_70b.jsonl --model meta-llama/Meta-Llama-3-70B-Instruct
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name sharegpt --num-prompts 3000 --output-file offline_trt_70b.jsonl
cat offline_trt_70b.jsonl | cut -d':' -f12 | cut -d',' -f1
```
```bash
# TensorRT LLM Online 70B
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_trt_70b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_trt_70b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_trt_70b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_trt_70b.jsonl
python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_trt_70b.jsonl
cat online_trt_70b.jsonl | cut -d':' -f9 | cut -d',' -f1
```

View File

@@ -0,0 +1,100 @@
### Used for TensorRT LLM
```
{
"architecture": "LlamaForCausalLM",
"dtype": "float16",
"logits_dtype": "float32",
"vocab_size": 128256,
"max_position_embeddings": 8192,
"hidden_size": 16384,
"num_hidden_layers": 126,
"num_attention_heads": 128,
"num_key_value_heads": 16,
"head_size": 128,
"qk_layernorm": false,
"hidden_act": "silu",
"intermediate_size": 53248,
"norm_epsilon": 1e-05,
"position_embedding_type": "rope_gpt_neox",
"use_parallel_embedding": false,
"embedding_sharding_dim": 0,
"share_embedding_table": false,
"mapping": {
"world_size": 8,
"tp_size": 8,
"pp_size": 1,
"gpus_per_node": 8
},
"quantization": {
"quant_algo": "FP8",
"kv_cache_quant_algo": null,
"group_size": 128,
"smoothquant_val": null,
"has_zero_point": false,
"pre_quant_scale": false,
"exclude_modules": [
"lm_head"
]
},
"kv_dtype": "float16",
"rotary_scaling": null,
"residual_mlp": false,
"moe_normalization_mode": null,
"rotary_base": 500000.0,
"moe_num_experts": 0,
"moe_top_k": 0,
"moe_tp_mode": 2,
"attn_bias": false,
"disable_weight_only_quant_plugin": false,
"mlp_bias": false
}
```
### Used for vLLM and SGLang
```
{
"_name_or_path": "dummy_fp8",
"architectures": [
"LlamaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 128000,
"eos_token_id": 128009,
"hidden_act": "silu",
"hidden_size": 16384,
"initializer_range": 0.02,
"intermediate_size": 53248,
"mlp_bias": false,
"model_type": "llama",
"num_attention_heads": 128,
"num_hidden_layers": 126,
"num_key_value_heads": 8,
"pretraining_tp": 1,
"quantization_config": {
"activation_scheme": "static",
"ignored_layers": [
"lm_head"
],
"quant_method": "fp8"
},
"rope_scaling": {
"factor": 8.0,
"low_freq_factor": 1.0,
"high_freq_factor": 4.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
},
"max_position_embeddings": 131072,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"rope_theta": 500000.0,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.41.1",
"use_cache": true,
"vocab_size": 128256
}
```

19
benchmark/boolq/README.md Normal file
View File

@@ -0,0 +1,19 @@
## Download data
```
git clone https://hf-mirror.com/datasets/google/boolq
```
## Convert parquet to json
```
bash parquet_to_json.sh
```
## Run benchmark
### Benchmark sglang
```
python -m sglang.launch_server --model-path ramblingpolymath/Qwen3-32B-W8A8 --port 30000
```
```
python3 bench_sglang.py
```
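The script also accepts the common SGLang benchmark flags (e.g. `--num-questions`, `--parallel`); for example, with illustrative values:
```
python3 bench_sglang.py --num-questions 200 --parallel 16
```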

View File

@@ -0,0 +1,124 @@
import argparse
import json
import time
import numpy as np
from sglang.api import set_default_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import read_jsonl
def get_example(lines, i, answer):
prompt = "Question: " + lines[i]["question"] + lines[i]["passage"] + "\nAnswer:"
if answer:
prompt += str(lines[i]["answer"])
return prompt
def few_shot_examples(lines, k):
prompts = ""
for i in range(k):
prompts += get_example(lines, i, True) + "\n\n"
return prompts
def main(args):
# Select backend
set_default_backend(select_sglang_backend(args))
# Read data
train_data_path = args.train_data_path
test_data_path = args.test_data_path
lines_train = list(read_jsonl(train_data_path))
lines_test = list(read_jsonl(test_data_path))
# Construct prompts
num_questions = args.num_questions
num_shots = args.num_shots
few_shots = few_shot_examples(lines_train, num_shots)
questions = []
answer = []
for i in range(len(lines_test[:num_questions])):
questions.append(get_example(lines_test, i, False))
answer.append(str(lines_test[i]["answer"]))
arguments = [{"question": q} for q in questions]
#####################################
######### SGL Program Begin #########
#####################################
import sglang as sgl
@sgl.function
def few_shot_boolq(s, question):
s += few_shots + question
s += sgl.gen("answer", max_tokens=5, stop=["\n"])
#####################################
########## SGL Program End ##########
#####################################
# Run requests
tic = time.perf_counter()
states = few_shot_boolq.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):
preds.append(states[i]["answer"])
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(answer))
# Compute speed
num_output_tokens = sum(
s.get_meta_info("answer")["completion_tokens"] for s in states
)
output_throughput = num_output_tokens / latency
# Print results
print(f"Accuracy: {acc:.3f}")
print(f"Latency: {latency:.3f} s")
print(f"Output throughput: {output_throughput:.3f} token/s")
# Results
with open(args.result_file, "a") as fout:
value = {
"task": "boolq",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-shots", type=int, default=5)
parser.add_argument(
"--train-data-path", type=str, default="./boolq/data/train-00000-of-00001.json"
)
parser.add_argument(
"--test-data-path",
type=str,
default="./boolq/data/validation-00000-of-00001.json",
)
parser.add_argument("--num-questions", type=int, default=200)
args = add_common_sglang_args_and_parse(parser)
main(args)

View File

@@ -0,0 +1,28 @@
import sys
import pyarrow.parquet as pq
def convert_parquet_to_json(input_file, output_file):
# read parquet file
table = pq.read_table(input_file)
# turn parquet data to dataframe
df = table.to_pandas()
# turn dataframe to json form
json_data = df.to_json(orient="records", lines=True)
# write json to file
with open(output_file, "w") as f:
f.write(json_data)
if __name__ == "__main__":
if len(sys.argv) != 3:
print("Usage:python convert_parquet_to_json.py <input_file> <output_file>")
input_file = sys.argv[1]
output_file = sys.argv[2]
convert_parquet_to_json(input_file, output_file)

View File

@@ -0,0 +1,26 @@
#!/bin/bash
# Define input and output directories
input_dir="./boolq/data"
output_dir="./boolq/data"
# Define the files to be converted
files=(
"train-00000-of-00001.parquet"
"validation-00000-of-00001.parquet"
)
# For the files above, use the python script to convert the format
for file in "${files[@]}"; do
input_file="${input_dir}/${file}"
output_file="${output_dir}/${file%.parquet}.json"
echo "Converting ${input_file} to ${output_file} ..."
python3 convert_parquet_to_json.py "${input_file}" "${output_file}"
if [ $? -eq 0 ]; then
echo "Conversion successful: ${output_file}"
else
echo "Conversion failed: ${input_file}"
fi
done

15
benchmark/ceval/README.md Normal file
View File

@@ -0,0 +1,15 @@
## Download data
```
git lfs clone https://huggingface.co/datasets/ceval/ceval-exam
```
## Run benchmark
### Benchmark sglang
```
python -m sglang.launch_server --model-path ramblingpolymath/Qwen3-32B-W8A8 --port 30000
```
```
python3 bench_sglang.py
```
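`bench_sglang.py` also exposes `--data-path` and `--num-questions`; for example, to point at the cloned dataset and limit the run (values are illustrative):
```
python3 bench_sglang.py --data-path ceval-exam --num-questions 100
```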

View File

@@ -0,0 +1,138 @@
import argparse
import json
import os
import random
import re
import time
import numpy as np
from datasets import load_dataset
from sglang.api import set_default_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
choices = ["A", "B", "C", "D"]
def get_one_example(line, include_answer):
res = line["question"]
res += f"\nA. {line['A']}"
res += f"\nB. {line['B']}"
res += f"\nC. {line['C']}"
res += f"\nD. {line['D']}"
if include_answer:
res += f"\nAnswer: {line['answer']} \n\n"
return res
def get_few_shot_examples(lines):
res = ""
for line in lines:
res += get_one_example(line, True) + "\n\n"
return res
def get_answer_value(response):
pattern = r"(Answer:|answer:|答案是|答案是:|正确答案是:|答案:|Assistant:)\s*([A-D])(?![\w])"
match = re.search(pattern, response)
if match:
return match.group(2)
return random.choice(choices)
def main(args):
# Read data && Construct prompts
arguments = []
labels = []
examples = "examples:\n"
data_path = args.data_path
for subject in os.listdir(data_path):
subject_path = os.path.join(data_path, subject)
if os.path.isdir(subject_path) and subject != ".git":
dataset = load_dataset(data_path, name=subject)
dev_lines_temp = dataset["dev"]
val_lines_temp = dataset["val"]
            few_shot_examples = get_few_shot_examples(dev_lines_temp)
examples += f"{few_shot_examples}"
for val_line in val_lines_temp:
arguments.append(
{
"examples": few_shot_examples,
"question": get_one_example(val_line, False),
}
)
labels.append(val_line["answer"])
#####################################
######### SGL Program Begin #########
#####################################
import sglang as sgl
@sgl.function
def few_shot_ceval(s, examples, question):
s += examples + question + sgl.gen("Answer")
#####################################
########## SGL Program End ##########
#####################################
num_questions = args.num_questions if args.num_questions else len(arguments)
# Select backend
set_default_backend(select_sglang_backend(args))
# Run requests
tic = time.perf_counter()
states = few_shot_ceval.run_batch(
arguments[:num_questions],
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.perf_counter() - tic
preds = [get_answer_value(states[i]["Answer"]) for i in range(num_questions)]
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels[:num_questions]))
# Compute speed
num_output_tokens = sum(
s.get_meta_info("Answer")["completion_tokens"] for s in states
)
output_throughput = num_output_tokens / latency
# Print results
print(f"Accuracy: {acc:.3f}")
print(f"Latency: {latency:.3f} s")
print(f"Output throughput: {output_throughput:.3f} token/s")
# Write results
with open(args.result_file, "a") as fout:
value = {
"task": "ceval",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="ceval-exam")
parser.add_argument("--num-questions", type=int, default=None)
args = add_common_sglang_args_and_parse(parser)
main(args)

View File

@@ -0,0 +1,373 @@
# DeepSeek V3.1/V3/R1 Support
The SGLang and DeepSeek teams collaborated to get DeepSeek V3 FP8 running on NVIDIA and AMD GPUs **from day one**. SGLang also supports [MLA optimization](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#deepseek-multi-head-latent-attention-mla-throughput-optimizations) and [DP attention](https://lmsys.org/blog/2024-12-04-sglang-v0-4/#data-parallelism-attention-for-deepseek-models), making SGLang one of the best open-source LLM engines for running DeepSeek models. SGLang is the inference engine recommended by the official [DeepSeek team](https://github.com/deepseek-ai/DeepSeek-V3/tree/main?tab=readme-ov-file#62-inference-with-sglang-recommended).
Special thanks to Meituan's Search & Recommend Platform Team and Baseten's Model Performance Team for implementing the model, and DataCrunch for providing GPU resources.
For optimizations made on the DeepSeek series models regarding SGLang, please refer to [DeepSeek Model Optimizations in SGLang](https://docs.sglang.ai/basic_usage/deepseek.html).
## Installation & Launch
If you encounter errors when starting the server, ensure the weights have finished downloading. It's recommended to download them beforehand or restart multiple times until all weights are downloaded.
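For example, you can pre-download the weights with the Hugging Face CLI before launching (this assumes `huggingface_hub` is installed; add `--local-dir` if you want a custom path):
```bash
pip install -U "huggingface_hub[cli]"
huggingface-cli download deepseek-ai/DeepSeek-V3
```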
### Using Docker (Recommended)
```bash
# Pull latest image
# https://hub.docker.com/r/lmsysorg/sglang/tags
docker pull lmsysorg/sglang:latest
# Launch
docker run --gpus all --shm-size 32g -p 30000:30000 -v ~/.cache/huggingface:/root/.cache/huggingface --ipc=host --network=host --privileged lmsysorg/sglang:latest \
python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code --port 30000
```
If you are using RDMA, please note that:
1. `--network host` and `--privileged` are required by RDMA. If you don't need RDMA, you can remove them.
2. You may need to set `NCCL_IB_GID_INDEX` if you are using RoCE, for example: `export NCCL_IB_GID_INDEX=3`.
Add [performance optimization options](#performance-optimization-options) as needed.
### Using pip
```bash
# Installation
pip install "sglang[all]>=0.5.2"
# Launch
python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code
```
Add [performance optimization options](#performance-optimization-options) as needed.
<a id="option_args"></a>
### Performance Optimization Options
[MLA optimizations](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#deepseek-multi-head-latent-attention-mla-throughput-optimizations) are enabled by default. The following optional optimizations can be enabled as needed; a combined launch example is shown after this list.
- [Data Parallelism Attention](https://lmsys.org/blog/2024-12-04-sglang-v0-4/#data-parallelism-attention-for-deepseek-models): For high QPS scenarios, add the `--enable-dp-attention` argument to boost throughput.
- [Torch.compile Optimization](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#torchcompile-latency-optimizations): Add the `--enable-torch-compile` argument to enable it. This will take some time while the server starts. The maximum batch size for torch.compile optimization can be controlled with `--torch-compile-max-bs`; it's recommended to set it between `1` and `8` (e.g., `--torch-compile-max-bs 8`).
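For example, a high-QPS deployment might combine both options in the launch command (illustrative; tune `--torch-compile-max-bs` for your workload):
```bash
python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code \
  --enable-dp-attention --enable-torch-compile --torch-compile-max-bs 8
```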
### Usage: Chat with DeepSeek
#### DeepSeek V3/R1
```python3
import openai
client = openai.Client(
base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
# Chat completion
response = client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=0,
max_tokens=64,
)
print(response)
```
#### DeepSeek V3.1
On top of the basic usage similar to the DeepSeek V3/R1 example, DeepSeek V3.1 supports a request-level thinking/non-thinking toggle. Simply switch the `"thinking"` field in `extra_body={"chat_template_kwargs": {"thinking": True}}` to enable/disable the thinking mode.
##### Non Thinking
```python3
import openai
client = openai.Client(
base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
# Chat completion
response = client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{"role": "user", "content": "Answer the following with the second letter of the correct answer only: What is the capital of France?"},
],
temperature=0,
max_tokens=1024,
extra_body = {"chat_template_kwargs": {"thinking": False}}
)
print(response.choices[0].message.content)
```
Answer:
```
h
```
* The correct response should be 'A', as the correct answer to the question is 'Paris'.
##### Thinking
```python3
import openai
client = openai.Client(
base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
# Chat completion
response = client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{"role": "user", "content": "Answer the following with the second letter of the correct answer only: What is the capital of France?"},
],
temperature=0,
max_tokens=1024,
extra_body = {"chat_template_kwargs": {"thinking": True}}
)
print(response)
```
Answer:
```
First, the question is: "What is the capital of France?" I know that the capital of France is Paris.
The user says: "Answer the following with the second letter of the correct answer only." So, I need to provide only the second letter of the correct answer.
The correct answer is "Paris". Now, I need to find the second letter of "Paris".
Let's spell it out: P-A-R-I-S.
- First letter: P
- Second letter: A
- Third letter: R
- Fourth letter: I
- Fifth letter: S
So, the second letter is "A".
I should only output the second letter, which is "A". No additional text or explanation, just the letter.
The user emphasized "the second letter of the correct answer only", so my response should be just "A".
Finally, I need to make sure that this is the correct answer. Yes, Paris is indeed the capital of France.</think>A
```
* The response contains `</think>` thinking trace and model was able to derive the correct answer from it.
### Example: Serving with two H20\*8 nodes
For example, there are two H20 nodes, each with 8 GPUs. The first node's IP is `10.0.0.1`, and the second node's IP is `10.0.0.2`. Please **use the first node's IP** for both commands.
If the command fails, try setting the `GLOO_SOCKET_IFNAME` parameter. For more information, see [Common Environment Variables](https://pytorch.org/docs/stable/distributed.html#common-environment-variables).
If the nodes support NVIDIA InfiniBand and you encounter hanging issues during startup, consider setting `export NCCL_IB_GID_INDEX=3`. For more information, see [this issue](https://github.com/sgl-project/sglang/issues/3516#issuecomment-2668493307).
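For example, if the inter-node traffic goes over an interface named `eth0` (replace with your actual NIC), export the variables on every node before launching:
```bash
export GLOO_SOCKET_IFNAME=eth0
export NCCL_IB_GID_INDEX=3   # only needed for RoCE/InfiniBand setups as noted above
```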
```bash
# node 1
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --dist-init-addr 10.0.0.1:5000 --nnodes 2 --node-rank 0 --trust-remote-code
# node 2
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --dist-init-addr 10.0.0.1:5000 --nnodes 2 --node-rank 1 --trust-remote-code
```
If you have two H100 nodes, the usage is similar to the aforementioned H20.
> **Note that the launch command here does not enable Data Parallelism Attention or `torch.compile` Optimization**. For optimal performance, please refer to the command options in [Performance Optimization Options](#option_args).
### Example: Serving with two H200\*8 nodes and docker
There are two H200 nodes, each with 8 GPUs. The first node's IP is `192.168.114.10`, and the second node's IP is `192.168.114.11`. Configure the endpoint to expose it to another Docker container using `--host 0.0.0.0` and `--port 40000`, and set up communications with `--dist-init-addr 192.168.114.10:20000`.
A single H200 node with 8 devices can run DeepSeek V3; the dual H200 setup here is just to demonstrate multi-node usage.
```bash
# node 1
docker run --gpus all \
--shm-size 32g \
--network=host \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--name sglang_multinode1 \
-it \
--rm \
--env "HF_TOKEN=$HF_TOKEN" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --dist-init-addr 192.168.114.10:20000 --nnodes 2 --node-rank 0 --trust-remote-code --host 0.0.0.0 --port 40000
```
```bash
# node 2
docker run --gpus all \
--shm-size 32g \
--network=host \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--name sglang_multinode2 \
-it \
--rm \
--env "HF_TOKEN=$HF_TOKEN" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --dist-init-addr 192.168.114.10:20000 --nnodes 2 --node-rank 1 --trust-remote-code --host 0.0.0.0 --port 40000
```
To ensure functionality, we include a test from a client Docker container.
```bash
docker run --gpus all \
--shm-size 32g \
--network=host \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--name sglang_multinode_client \
-it \
--rm \
--env "HF_TOKEN=$HF_TOKEN" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1 --random-output 512 --random-range-ratio 1 --num-prompts 1 --host 0.0.0.0 --port 40000 --output-file "deepseekv3_multinode.jsonl"
```
> **Note that the launch command here does not enable Data Parallelism Attention or `torch.compile` Optimization**. For optimal performance, please refer to the command options in [Performance Optimization Options](#option_args).
### Example: Serving with four A100\*8 nodes
To serve DeepSeek-V3 with A100 GPUs, first convert the [FP8 model checkpoints](https://huggingface.co/deepseek-ai/DeepSeek-V3) to BF16 with [this script](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py).
Since the BF16 model is over 1.3 TB, we need to prepare four A100 nodes, each with 8 x 80 GB GPUs. Assuming the first node's IP is `10.0.0.1` and the converted model path is `/path/to/DeepSeek-V3-BF16`, we can launch the server with the following commands.
```bash
# node 1
python3 -m sglang.launch_server --model-path /path/to/DeepSeek-V3-BF16 --tp 32 --dist-init-addr 10.0.0.1:5000 --nnodes 4 --node-rank 0 --trust-remote-code --host 0.0.0.0 --port 30000
# node 2
python3 -m sglang.launch_server --model-path /path/to/DeepSeek-V3-BF16 --tp 32 --dist-init-addr 10.0.0.1:5000 --nnodes 4 --node-rank 1 --trust-remote-code
# node 3
python3 -m sglang.launch_server --model-path /path/to/DeepSeek-V3-BF16 --tp 32 --dist-init-addr 10.0.0.1:5000 --nnodes 4 --node-rank 2 --trust-remote-code
# node 4
python3 -m sglang.launch_server --model-path /path/to/DeepSeek-V3-BF16 --tp 32 --dist-init-addr 10.0.0.1:5000 --nnodes 4 --node-rank 3 --trust-remote-code
```
> **Note that the launch command here does not enable Data Parallelism Attention or `torch.compile` Optimization**. For optimal performance, please refer to the command options in [Performance Optimization Options](#option_args).
Then we can benchmark the accuracy and latency by accessing the first node's exposed port with the following example commands.
```bash
# bench accuracy
python3 benchmark/gsm8k/bench_sglang.py --num-questions 1319 --host http://10.0.0.1 --port 30000
# bench latency
python3 -m sglang.bench_one_batch_server --model None --base-url http://10.0.0.1:30000 --batch-size 1 --input-len 128 --output-len 128
```
### Example: Serving with 8 A100/A800 with AWQ Quantization
**Recommended Usage**
Add `--quantization moe_wna16` flag to enable moe wna16 kernel for better performance.
One example is as follows:
```bash
python3 -m sglang.launch_server --model cognitivecomputations/DeepSeek-R1-AWQ --tp 8 --trust-remote-code --quantization moe_wna16
```
Alternatively, you can use `--quantization awq_marlin` as follows:
```bash
python3 -m sglang.launch_server --model cognitivecomputations/DeepSeek-R1-AWQ --tp 8 --trust-remote-code --quantization awq_marlin --dtype float16
```
Note that `awq_marlin` only supports `float16` now, which may lead to some precision loss.
### Example: Serving with 16 A100/A800 with int8 Quantization
There are block-wise and per-channel quantization methods, and the quantization parameters have already been uploaded to Hugging Face:
- [meituan/DeepSeek-R1-Block-INT8](https://huggingface.co/meituan/DeepSeek-R1-Block-INT8)
- [meituan/DeepSeek-R1-Channel-INT8](https://huggingface.co/meituan/DeepSeek-R1-Channel-INT8)
Assuming the master node's IP is `MASTER_IP`, the checkpoint path is `/path/to/DeepSeek-R1-INT8`, and the port is 5000, we can launch the server with the following commands:
```bash
#master
python3 -m sglang.launch_server \
--model meituan/DeepSeek-R1-Block-INT8 --tp 16 --dist-init-addr \
MASTER_IP:5000 --nnodes 2 --node-rank 0 --trust-remote-code --enable-torch-compile --torch-compile-max-bs 8
#cluster
python3 -m sglang.launch_server \
--model meituan/DeepSeek-R1-Block-INT8 --tp 16 --dist-init-addr \
MASTER_IP:5000 --nnodes 2 --node-rank 1 --trust-remote-code --enable-torch-compile --torch-compile-max-bs 8
```
> **Note that the launch command here enables `torch.compile` Optimization**. For optimal performance, please refer to the command options in [Performance Optimization Options](#option_args).
Then on the **master node**, supposing the ShareGPT data is located at `/path/to/ShareGPT_V3_unfiltered_cleaned_split.json`, you can run the following commands to benchmark the launched server:
```bash
# bench accuracy
python3 benchmark/gsm8k/bench_sglang.py --num-questions 1319
# bench serving
python3 -m sglang.bench_serving --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --dataset-name random --random-input 128 --random-output 128 --num-prompts 1000 --request-rate 128 --random-range-ratio 1.0
```
> **Note: using `--parallel 200` can accelerate accuracy benchmarking**.
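For instance, the accuracy benchmark above can be rerun with higher client concurrency:
```bash
python3 benchmark/gsm8k/bench_sglang.py --num-questions 1319 --parallel 200
```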
### Example: Serving with 32 L40S with int8 Quantization
Run with the per-channel quantization model:
- [meituan/DeepSeek-R1-Channel-INT8](https://huggingface.co/meituan/DeepSeek-R1-Channel-INT8)
Assuming the master node's IP is `MASTER_IP`, the checkpoint path is `/path/to/DeepSeek-R1-Channel-INT8`, and the port is 5000, we can launch the server with the following commands:
```bash
#master
python3 -m sglang.launch_server --model meituan/DeepSeek-R1-Channel-INT8 --tp 32 --quantization w8a8_int8 \
--dist-init-addr MASTER_IP:5000 --nnodes 4 --node-rank 0 --trust-remote-code \
--enable-torch-compile --torch-compile-max-bs 32
#cluster
python3 -m sglang.launch_server --model meituan/DeepSeek-R1-Channel-INT8 --tp 32 --quantization w8a8_int8 \
--dist-init-addr MASTER_IP:5000 --nnodes 4 --node-rank 1 --trust-remote-code \
--enable-torch-compile --torch-compile-max-bs 32
python3 -m sglang.launch_server --model meituan/DeepSeek-R1-Channel-INT8 --tp 32 --quantization w8a8_int8 \
--dist-init-addr MASTER_IP:5000 --nnodes 4 --node-rank 2 --trust-remote-code \
--enable-torch-compile --torch-compile-max-bs 32
python3 -m sglang.launch_server --model meituan/DeepSeek-R1-Channel-INT8 --tp 32 --quantization w8a8_int8 \
--dist-init-addr MASTER_IP:5000 --nnodes 4 --node-rank 3 --trust-remote-code \
--enable-torch-compile --torch-compile-max-bs 32
```
The benchmarking method is the same as described in the previous [16 x A100](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-16-a100a800-with-int8-quantization) example.
### Example: Serving on any cloud or Kubernetes with SkyPilot
SkyPilot helps find the cheapest available GPUs across any cloud or existing Kubernetes clusters and launches distributed serving with a single command. See details [here](https://github.com/skypilot-org/skypilot/tree/master/llm/deepseek-r1).
To serve on multiple nodes:
```bash
git clone https://github.com/skypilot-org/skypilot.git
# Serve on 2 H100/H200x8 nodes
sky launch -c r1 llm/deepseek-r1/deepseek-r1-671B.yaml --retry-until-up
# Serve on 4 A100x8 nodes
sky launch -c r1 llm/deepseek-r1/deepseek-r1-671B-A100.yaml --retry-until-up
```
#### Troubleshooting
If you encounter the following error with fp16/bf16 checkpoint:
```bash
ValueError: Weight output_partition_size = 576 is not divisible by weight quantization block_n = 128.
```
edit your `config.json` and remove the `quantization_config` block. For example:
```json
"quantization_config": {
"activation_scheme": "dynamic",
"fmt": "e4m3",
"quant_method": "fp8",
"weight_block_size": [128, 128]
},
```
Removing this block typically resolves the error. For more details, see the discussion in [sgl-project/sglang#3491](https://github.com/sgl-project/sglang/issues/3491#issuecomment-2650779851).
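If you prefer to script this edit, here is a minimal sketch using `jq` (assumes `jq` is installed and that `/path/to/DeepSeek-V3-BF16` is your converted checkpoint directory; keep a backup of `config.json`):
```bash
cd /path/to/DeepSeek-V3-BF16
cp config.json config.json.bak
# Drop the FP8 quantization block, which conflicts with the BF16 weights
jq 'del(.quantization_config)' config.json.bak > config.json
```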
## DeepSeek V3 Optimization Plan
https://github.com/sgl-project/sglang/issues/2591

51
benchmark/dspy/README.md Normal file
View File

@@ -0,0 +1,51 @@
## Install
```
pip3 install dspy-ai
```
Turn off cache at https://github.com/stanfordnlp/dspy/blob/34d8420383ec752037aa271825c1d3bf391e1277/dsp/modules/cache_utils.py#L10.
```
cache_turn_on = False
```
or set the environment variable
```
export DSP_CACHEBOOL=false
```
## Benchmark SGLang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_dspy_intro.py --backend sglang
```
## Benchmark TGI
```
docker run --name tgi --rm -ti --gpus all --network host \
-v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
ghcr.io/huggingface/text-generation-inference:1.3.0 \
--model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
--max-input-length 2048 --max-total-tokens 4096 \
--port 24000
```
```
python3 bench_dspy_intro.py --backend tgi
```
## Benchmark vLLM
```
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_dspy_intro.py --backend vllm
```

View File

@@ -0,0 +1,192 @@
"""
Adapted from
https://github.com/stanfordnlp/dspy/blob/34d8420383ec752037aa271825c1d3bf391e1277/intro.ipynb#L9
"""
import argparse
import dspy
from dspy.datasets import HotPotQA
class BasicQA(dspy.Signature):
"""Answer questions with short factoid answers."""
question = dspy.InputField()
answer = dspy.OutputField(desc="often between 1 and 5 words")
class GenerateAnswer(dspy.Signature):
"""Answer questions with short factoid answers."""
context = dspy.InputField(desc="may contain relevant facts")
question = dspy.InputField()
answer = dspy.OutputField(desc="often between 1 and 5 words")
class RAG(dspy.Module):
def __init__(self, num_passages=3):
super().__init__()
self.retrieve = dspy.Retrieve(k=num_passages)
self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
def forward(self, question):
context = self.retrieve(question).passages
prediction = self.generate_answer(context=context, question=question)
return dspy.Prediction(context=context, answer=prediction.answer)
def main(args):
# lm = dspy.OpenAI(model='gpt-3.5-turbo')
if args.backend == "tgi":
lm = dspy.HFClientTGI(
model="meta-llama/Llama-2-7b-chat-hf",
port=args.port,
url="http://localhost",
)
elif args.backend == "sglang":
lm = dspy.HFClientSGLang(
model="meta-llama/Llama-2-7b-chat-hf",
port=args.port,
url="http://localhost",
)
elif args.backend == "vllm":
lm = dspy.HFClientVLLM(
model="meta-llama/Llama-2-7b-chat-hf",
port=args.port,
url="http://localhost",
)
else:
raise ValueError(f"Invalid backend: {args.backend}")
colbertv2_wiki17_abstracts = dspy.ColBERTv2(
url="http://20.102.90.50:2017/wiki17_abstracts"
)
dspy.settings.configure(lm=lm, rm=colbertv2_wiki17_abstracts)
# Load the dataset.
dataset = HotPotQA(
train_seed=1, train_size=20, eval_seed=2023, dev_size=args.dev_size, test_size=0
)
# Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata.
trainset = [x.with_inputs("question") for x in dataset.train]
devset = [x.with_inputs("question") for x in dataset.dev]
print(len(trainset), len(devset))
train_example = trainset[0]
print(f"Question: {train_example.question}")
print(f"Answer: {train_example.answer}")
dev_example = devset[18]
print(f"Question: {dev_example.question}")
print(f"Answer: {dev_example.answer}")
print(f"Relevant Wikipedia Titles: {dev_example.gold_titles}")
print(
f"For this dataset, training examples have input keys {train_example.inputs().keys()} and label keys {train_example.labels().keys()}"
)
print(
f"For this dataset, dev examples have input keys {dev_example.inputs().keys()} and label keys {dev_example.labels().keys()}"
)
# Define the predictor.
generate_answer = dspy.Predict(BasicQA)
# Call the predictor on a particular input.
pred = generate_answer(question=dev_example.question)
# Print the input and the prediction.
print(f"Question: {dev_example.question}")
print(f"Predicted Answer: {pred.answer}")
lm.inspect_history(n=1)
# Define the predictor. Notice we're just changing the class. The signature BasicQA is unchanged.
generate_answer_with_chain_of_thought = dspy.ChainOfThought(BasicQA)
# Call the predictor on the same input.
pred = generate_answer_with_chain_of_thought(question=dev_example.question)
# Print the input, the chain of thought, and the prediction.
print(f"Question: {dev_example.question}")
print(f"Thought: {pred.rationale.split('.', 1)[1].strip()}")
print(f"Predicted Answer: {pred.answer}")
retrieve = dspy.Retrieve(k=3)
topK_passages = retrieve(dev_example.question).passages
print(
f"Top {retrieve.k} passages for question: {dev_example.question} \n",
"-" * 30,
"\n",
)
for idx, passage in enumerate(topK_passages):
print(f"{idx+1}]", passage, "\n")
retrieve("When was the first FIFA World Cup held?").passages[0]
from dspy.teleprompt import BootstrapFewShot
# Validation logic: check that the predicted answer is correct.
# Also check that the retrieved context does actually contain that answer.
def validate_context_and_answer(example, pred, trace=None):
answer_EM = dspy.evaluate.answer_exact_match(example, pred)
answer_PM = dspy.evaluate.answer_passage_match(example, pred)
return answer_EM and answer_PM
# Set up a basic teleprompter, which will compile our RAG program.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)
# Compile!
compiled_rag = teleprompter.compile(RAG(), trainset=trainset)
# Ask any question you like to this simple RAG program.
my_question = "What castle did David Gregory inherit?"
# Get the prediction. This contains `pred.context` and `pred.answer`.
pred = compiled_rag(my_question)
# Print the contexts and the answer.
print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")
from dspy.evaluate.evaluate import Evaluate
# Set up the `evaluate_on_hotpotqa` function. We'll use this many times below.
evaluate_on_hotpotqa = Evaluate(
devset=devset,
num_threads=args.num_threads,
display_progress=True,
display_table=5,
)
# Evaluate the `compiled_rag` program with the `answer_exact_match` metric.
metric = dspy.evaluate.answer_exact_match
evaluate_on_hotpotqa(compiled_rag, metric=metric)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--port", type=int)
parser.add_argument("--num-threads", type=int, default=32)
parser.add_argument("--dev-size", type=int, default=150)
parser.add_argument(
"--backend", type=str, choices=["sglang", "tgi", "vllm"], default="sglang"
)
args = parser.parse_args()
if args.port is None:
default_port = {
"vllm": 21000,
"lightllm": 22000,
"tgi": 24000,
"sglang": 30000,
}
args.port = default_port.get(args.backend, None)
main(args)

View File

@@ -0,0 +1,38 @@
## Download the dataset
```
wget -O agent_calls.jsonl "https://drive.google.com/uc?export=download&id=19qLpD45e9JGTKF2cUjJJegwzSUEZEKht"
```
## Run benchmark
Ensure that this benchmark is run in a serial manner (using --parallel 1) to preserve any potential dependencies between requests.
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_sglang.py --num-events 1000 --parallel 1
```
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_other.py --num-events 1000 --backend vllm --parallel 1
```
### Benchmark guidance
```
python3 bench_other.py --num-events 1000 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
### Benchmark lmql
```
python3 bench_other.py --num-events 1000 --backend lmql --parallel 1
```

View File

@@ -0,0 +1,300 @@
import sglang as sgl
# here are the top five agent functions contributing ~70% LLM calls
# reference: https://github.com/joonspk-research/generative_agents/
@sgl.function
def poignancy_event(s, persona_name, persona_iss, event):
s += "Here is a brief description of " + persona_name + ".\n"
s += persona_iss + "\n"
s += "On the scale of 1 to 10, where 1 is purely mundane (e.g., brushing teeth, making bed) and 10 is extremely poignant (e.g., a break up, college acceptance), rate the likely poignancy of the following event for"
s += persona_name + ".\n\n"
s += "Event: " + event
s += "Rate (return a number between 1 to 10):"
s += sgl.gen(name="Rate", max_tokens=2)
def poignancy_event_prompt(persona_name, persona_iss, event):
# return prompt and max_tokens
s = ""
s += "Here is a brief description of " + persona_name + ".\n"
s += persona_iss + "\n"
s += "On the scale of 1 to 10, where 1 is purely mundane (e.g., brushing teeth, making bed) and 10 is extremely poignant (e.g., a break up, college acceptance), rate the likely poignancy of the following event for"
s += persona_name + ".\n\n"
s += "Event: " + event
s += "Rate (return a number between 1 to 10):"
return {"prompt": s, "max_tokens": 2, "stop": None}
@sgl.function
def generate_event_triple(s, persona_name, action):
s += """Task: Turn the input into (subject, predicate, object).
Input: Sam Johnson is eating breakfast.
Output: (Dolores Murphy, eat, breakfast)
---
Input: Joon Park is brewing coffee.
Output: (Joon Park, brew, coffee)
---
Input: Jane Cook is sleeping.
Output: (Jane Cook, is, sleep)
---
Input: Michael Bernstein is writing email on a computer.
Output: (Michael Bernstein, write, email)
---
Input: Percy Liang is teaching students in a classroom.
Output: (Percy Liang, teach, students)
---
Input: Merrie Morris is running on a treadmill.
Output: (Merrie Morris, run, treadmill)
---"""
s += persona_name + "is" + action + ".\n"
s += "(" + persona_name + ","
s += sgl.gen(name="Triple", max_tokens=20, stop=")")
def generate_event_triple_prompt(persona_name, action):
s = ""
s += """Task: Turn the input into (subject, predicate, object).
Input: Sam Johnson is eating breakfast.
Output: (Dolores Murphy, eat, breakfast)
---
Input: Joon Park is brewing coffee.
Output: (Joon Park, brew, coffee)
---
Input: Jane Cook is sleeping.
Output: (Jane Cook, is, sleep)
---
Input: Michael Bernstein is writing email on a computer.
Output: (Michael Bernstein, write, email)
---
Input: Percy Liang is teaching students in a classroom.
Output: (Percy Liang, teach, students)
---
Input: Merrie Morris is running on a treadmill.
Output: (Merrie Morris, run, treadmill)
---"""
s += persona_name + "is" + action + ".\n"
s += "(" + persona_name + ","
return {"prompt": s, "max_tokens": 20, "stop": ")"}
@sgl.function
def generate_pronunciatio(s, action):
s += "Convert an action description to an emoji (important: use two or less emojis).\n"
s += "Action description: " + action + ".\n"
s += "Emoji:" + sgl.gen(name="Emoji", max_tokens=6)
def generate_pronunciatio_prompt(action):
s = ""
s += "Convert an action description to an emoji (important: use two or less emojis).\n"
s += "Action description: " + action + ".\n"
s += "Emoji:"
return {"prompt": s, "max_tokens": 6, "stop": None}
@sgl.function
def action_location_sector(
s,
persona_name,
living_sector,
living_sector_areas,
current_sector,
current_sector_areas,
daily_plan,
sector_options,
current_action,
next_action,
):
s += """Task -- choose an appropriate area from the area options for a task at hand.
Sam Kim lives in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
Sam Kim is currently in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
Area options: {Sam Kim's house, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim.
For taking a walk, Sam Kim should go to the following area: {Johnson Park}
---
Jane Anderson lives in {Oak Hill College Student Dormatory} that has Jane Anderson's room.
Jane Anderson is currently in {Oak Hill College} that has a classroom, library
Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim.
For eating dinner, Jane Anderson should go to the following area: {Hobbs Cafe}
---"""
s += (
persona_name
+ " lives in "
+ living_sector
+ " that has "
+ living_sector_areas
+ ".\n"
)
s += (
persona_name
+ " is currently in "
+ current_sector
+ " that has "
+ current_sector_areas
+ ".\n"
)
s += daily_plan + ".\n"
s += "Area options: " + sector_options + ".\n"
s += """* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim.\n"""
s += (
persona_name
+ " is "
+ current_action
+ ". For "
+ next_action
+ ", "
+ persona_name
+ " should go to the following area: {"
)
s += sgl.gen(name="Location", max_tokens=10, stop="}")
def action_location_sector_prompt(
persona_name,
living_sector,
living_sector_areas,
current_sector,
current_sector_areas,
daily_plan,
sector_options,
current_action,
next_action,
):
s = ""
s += """Task -- choose an appropriate area from the area options for a task at hand.
Sam Kim lives in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
Sam Kim is currently in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
Area options: {Sam Kim's house, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim.
For taking a walk, Sam Kim should go to the following area: {Johnson Park}
---
Jane Anderson lives in {Oak Hill College Student Dormatory} that has Jane Anderson's room.
Jane Anderson is currently in {Oak Hill College} that has a classroom, library
Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim.
For eating dinner, Jane Anderson should go to the following area: {Hobbs Cafe}
---"""
s += (
persona_name
+ " lives in "
+ living_sector
+ " that has "
+ living_sector_areas
+ ".\n"
)
s += (
persona_name
+ " is currently in "
+ current_sector
+ " that has "
+ current_sector_areas
+ ".\n"
)
s += daily_plan + ".\n"
s += "Area options: " + sector_options + ".\n"
s += """* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
* Must be one of the "Area options," verbatim.\n"""
s += (
persona_name
+ " is "
+ current_action
+ ". For "
+ next_action
+ ", "
+ persona_name
+ " should go to the following area: {"
)
return {"prompt": s, "max_tokens": 10, "stop": "}"}
@sgl.function
def action_location_object(
s, persona_name, target_sector, target_sector_areas, current_action, next_action
):
s += """
Jane Anderson is in kitchen in Jane Anderson's house.
Jane Anderson is going to Jane Anderson's house that has the following areas: {kitchen, bedroom, bathroom}
Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
For cooking, Jane Anderson should go to the following area in Jane Anderson's house:
Answer: {kitchen}
---
Tom Watson is in common room in Tom Watson's apartment.
Tom Watson is going to Hobbs Cafe that has the following areas: {cafe}
Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
For getting coffee, Tom Watson should go to the following area in Hobbs Cafe:
Answer: {cafe}
---"""
s += (
persona_name
+ " is going to "
+ target_sector
+ " that has the following areas: {"
+ target_sector_areas
+ "}\n"
)
s += """* Stay in the current area if the activity can be done there.
* NEVER go into other people's rooms unless necessary."""
s += (
persona_name
+ " is "
+ current_action
+ ". For "
+ next_action
+ ", "
+ persona_name
+ "should go to the following area in "
+ target_sector
)
s += " (MUST pick one of {" + target_sector_areas + "}):\n"
s += "Answer: {" + sgl.gen(name="Area", max_tokens=5, stop="}")
def action_location_object_prompt(
persona_name, target_sector, target_sector_areas, current_action, next_action
):
s = ""
s += """
Jane Anderson is in kitchen in Jane Anderson's house.
Jane Anderson is going to Jane Anderson's house that has the following areas: {kitchen, bedroom, bathroom}
Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
For cooking, Jane Anderson should go to the following area in Jane Anderson's house:
Answer: {kitchen}
---
Tom Watson is in common room in Tom Watson's apartment.
Tom Watson is going to Hobbs Cafe that has the following areas: {cafe}
Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
For getting coffee, Tom Watson should go to the following area in Hobbs Cafe:
Answer: {cafe}
---"""
s += (
persona_name
+ " is going to "
+ target_sector
+ " that has the following areas: {"
+ target_sector_areas
+ "}\n"
)
s += """* Stay in the current area if the activity can be done there.
* NEVER go into other people's rooms unless necessary."""
s += (
persona_name
+ " is "
+ current_action
+ ". For "
+ next_action
+ ", "
+ persona_name
+ "should go to the following area in "
+ target_sector
)
s += " (MUST pick one of {" + target_sector_areas + "}):\n"
s += "Answer: {"
return {"prompt": s, "max_tokens": 5, "stop": "}"}

View File

@@ -0,0 +1,80 @@
import argparse
import json
import time
from agent_functions import (
action_location_object_prompt,
action_location_sector_prompt,
generate_event_triple_prompt,
generate_pronunciatio_prompt,
poignancy_event_prompt,
)
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl
def main(args):
lines = read_jsonl(args.data_path)[: args.num_events]
mapping = {
"poignancy_event": poignancy_event_prompt,
"generate_event_triple": generate_event_triple_prompt,
"generate_pronunciatio": generate_pronunciatio_prompt,
"action_location_sector": action_location_sector_prompt,
"action_location_object": action_location_object_prompt,
}
arguments = [mapping[k](**v) for l in lines for k, v in l.items()]
states = []
# Select backend
call_generate = get_call_generate(args)
def get_one_answer(arg):
answer = call_generate(**arg, temperature=0)
states.append(answer)
async def get_one_answer_async(arg):
answer = await call_generate(**arg, temperature=0)
states.append(answer)
tic = time.perf_counter()
# we always sequentially execute agent calls to maintain its dependency
if args.backend != "lmql":
for arg in tqdm(arguments):
get_one_answer(arg)
else:
import asyncio
loop = asyncio.get_event_loop()
for arg in tqdm(arguments):
loop.run_until_complete(get_one_answer_async(arg))
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "Generative Agents",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
# to pack weighted functions as a single agent
"num_requests": len(arguments) / len(mapping),
"other": {
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="agent_calls.jsonl")
parser.add_argument("--num-events", type=int, default=10)
args = add_common_other_args_and_parse(parser)
main(args)

View File

@@ -0,0 +1,74 @@
import argparse
import json
import time
from agent_functions import (
action_location_object,
action_location_sector,
generate_event_triple,
generate_pronunciatio,
poignancy_event,
)
import sglang as sgl
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
def main(args):
lines = read_jsonl(args.data_path)[: args.num_events]
mapping = {
"poignancy_event": poignancy_event,
"generate_event_triple": generate_event_triple,
"generate_pronunciatio": generate_pronunciatio,
"action_location_sector": action_location_sector,
"action_location_object": action_location_object,
}
arguments = [{mapping[k]: v for k, v in l.items()} for l in lines]
# Select backend
backend = select_sglang_backend(args)
sgl.set_default_backend(backend)
states = []
# Run requests
tic = time.perf_counter()
for a in arguments:
# only a single key in the dict
for func, arg in a.items():
result = func.run(**arg)
result.sync()
states.append(result)
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "Generative Agents",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
# to pack weighted functions as a single agent
"num_requests": len(arguments) / len(mapping),
"other": {
"num_events": args.num_events,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="agent_calls.jsonl")
parser.add_argument("--num-events", type=int, default=10)
args = add_common_sglang_args_and_parse(parser)
main(args)

163
benchmark/gpt_oss/README.md Normal file
View File

@@ -0,0 +1,163 @@
# How to reproduce the result of GPT-OSS with SGLang
### Install the latest SGLang
```bash
git clone https://github.com/sgl-project/sglang.git
cd sglang
git checkout v0.5.1.post3
pip install --upgrade pip
pip install -e "python[all]"
```
### Reproduce the benchmark throughput result (Batch Size 1)
Launch Command
```bash
# MXFP4 120B on H100
python3 -m sglang.launch_server --model openai/gpt-oss-120b --tp 8 --attention-backend triton
# BF16 120B on H100
python3 -m sglang.launch_server --model lmsys/gpt-oss-120b-bf16 --tp 8 --attention-backend triton
# MXFP4 120B on B200
python3 -m sglang.launch_server --model openai/gpt-oss-120b --tp 4
# BF16 120B on B200
python3 -m sglang.launch_server --model lmsys/gpt-oss-120b-bf16 --tp 4
```
Benchmark Command
```bash
# MXFP4 120B on H100
python3 -m sglang.bench_one_batch_server --model openai/gpt-oss-120b --base-url http://localhost:30000 --batch-size 1 --input-len 1024 --output-len 512 --show-report
```
### Reproduce the benchmark throughput result (Batch Size 32)
Launch Command
```bash
# MXFP4 120B on H100
python3 -m sglang.launch_server --model openai/gpt-oss-120b --tp 8
# BF16 120B on H100
python3 -m sglang.launch_server --model lmsys/gpt-oss-120b-bf16 --tp 8
# MXFP4 120B on B200
python3 -m sglang.launch_server --model openai/gpt-oss-120b --tp 4
# BF16 120B on B200
python3 -m sglang.launch_server --model lmsys/gpt-oss-120b-bf16 --tp 4
```
Benchmark Command
```bash
python3 -m sglang.bench_one_batch_server --model openai/gpt-oss-120b --base-url http://localhost:30000 --batch-size 32 --input-len 1024 8192 --output-len 512 --show-report
```
### Reproduce the evaluation result
Install gpt-oss
```bash
git clone https://github.com/openai/gpt-oss.git
cd gpt-oss
pip install -e .
```
Evaluation Command
```bash
DATASET=gpqa
BASE_URL=YOUR_BASE_URL
OPENAI_API_KEY=dummy python -m gpt_oss.evals \
--base-url ${BASE_URL}/v1 \
--model dummy \
--reasoning-effort low,medium,high \
--eval $DATASET \
--n-threads 1000
```
### Reproduce the benchmark result of acceptance length
> Note: On B200, if top k is 1, set `--attention-backend trtllm_mha`
```bash
git clone https://github.com/sgl-project/SpecForge.git
cd SpecForge/benchmarks
config_list=(
"1,0,0,0"
"1,3,1,4"
"1,5,4,8"
)
python3 bench_model_speedup.py \
--model-path openai/gpt-oss-120b \
--speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 \
--port 20001 \
--trust-remote-code \
--mem-fraction-static 0.8 \
--tp-size 4 \
--attention-backend fa3 \
--config-list "${config_list[@]}" \
--benchmark-list mtbench:80 gsm8k:200 humaneval:200 math500:200 \
--output lmsys_gpt-oss-120b_Eagle3_result.jsonl
python3 bench_model_speedup.py \
--model-path openai/gpt-oss-120b \
--speculative-draft-model-path nvidia/gpt-oss-120b-Eagle3 \
--port 20001 \
--trust-remote-code \
--mem-fraction-static 0.8 \
--tp-size 4 \
--attention-backend fa3 \
--config-list "${config_list[@]}" \
--benchmark-list mtbench:80 gsm8k:200 humaneval:200 math500:200 \
--output nv_gpt-oss-120b_Eagle3_result.jsonl
```
### Reproduce the result of speculative decoding speedup
Launch Command
```bash
# On Hopper:
# - Tree decoding (topk > 1) and chain decoding (topk = 1) are supported on both FA3 and Triton backends.
python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algorithm EAGLE3 --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --tp 4
python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algorithm EAGLE3 --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 5 --speculative-eagle-topk 4 --speculative-num-draft-tokens 8 --tp 4
# On Blackwell:
# - Chain decoding (topk = 1) is supported on TRTLLM-MHA backend. Tree decoding (topk > 1) is in progress, stay tuned!
# - Both tree decoding (topk > 1) and chain decoding (topk = 1) are supported on the Triton backend.
python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algo EAGLE3 --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --tp 4
python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algo EAGLE3 --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 5 --speculative-eagle-topk 4 --speculative-num-draft-tokens 8 --attention-backend triton --tp 4
```
Benchmark Command
```bash
config_list=(
"1,0,0,0"
"1,3,1,4"
"1,5,4,8"
)
python3 bench_model_speedup.py \
--model-path openai/gpt-oss-120b \
--speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 \
--port 20001 \
--trust-remote-code \
--mem-fraction-static 0.8 \
--tp-size 4 \
--attention-backend fa3 \
--config-list "${config_list[@]}" \
--benchmark-list gsm8k:200 humaneval:200 math500:200 \
--output lmsys_gpt-oss-120b_Eagle3_result.jsonl
```
We can gain the best speedup with the following settings:
- **1.39x** speedup with the `--speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4` setting.
- **1.52x** speedup with the `--speculative-num-steps 5 --speculative-eagle-topk 4 --speculative-num-draft-tokens 8` setting.

47
benchmark/gsm8k/README.md Normal file
View File

@@ -0,0 +1,47 @@
## Run benchmark
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_sglang.py --num-questions 200
```
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_other.py --num-questions 200 --backend vllm
```
### Benchmark lightllm
```
# A10G
python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
```
```
python3 bench_other.py --num-questions 200 --backend lightllm
```
### Benchmark guidance
```
python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
### Benchmark lmql
```
CUDA_VISIBLE_DEVICES=0,1 lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000
```
```
python3 bench_other.py --num-questions 100 --backend lmql --parallel 2
```

View File

@@ -0,0 +1,151 @@
import argparse
import ast
import asyncio
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
INVALID = -9999999
def get_one_example(lines, i, include_answer):
ret = "Question: " + lines[i]["question"] + "\nAnswer:"
if include_answer:
ret += " " + lines[i]["answer"]
return ret
def get_few_shot_examples(lines, k):
ret = ""
for i in range(k):
ret += get_one_example(lines, i, True) + "\n\n"
return ret
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
return ast.literal_eval(numbers[-1])
except SyntaxError:
return INVALID
def main(args):
# Select backend
call_generate = get_call_generate(args)
# Read data
url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
filename = download_and_cache_file(url)
lines = list(read_jsonl(filename))
# Construct prompts
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)
questions = []
labels = []
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
states = [None] * len(labels)
# Run requests
if args.backend != "lmql":
# Use thread pool
def get_one_answer(i):
answer = call_generate(
prompt=few_shot_examples + questions[i],
temperature=0,
max_tokens=256,
stop=["Question", "Assistant:", "<|separator|>"],
)
states[i] = answer
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
list(
tqdm(
executor.map(get_one_answer, list(range(len(questions)))),
total=len(questions),
)
)
else:
# Use asyncio
async def batched_call(batch_size):
for i in range(0, len(questions), batch_size):
tasks = []
for q in questions[i : i + batch_size]:
tasks.append(
call_generate(
few_shot_examples + q,
temperature=0,
max_tokens=256,
stop="Question",
)
)
rets = await asyncio.gather(*tasks)
for j in range(len(rets)):
states[i + j] = rets[j]
tic = time.perf_counter()
asyncio.run(batched_call(batch_size=args.parallel))
latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):
preds.append(get_answer_value(states[i]))
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
invalid = np.mean(np.array(preds) == INVALID)
# Print results
print(f"Accuracy: {acc:.3f}")
print(f"Invalid: {invalid:.3f}")
print(f"Latency: {latency:.3f} s")
# Dump results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "gsm8k",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-shots", type=int, default=5)
parser.add_argument("--data-path", type=str, default="test.jsonl")
parser.add_argument("--num-questions", type=int, default=200)
args = add_common_other_args_and_parse(parser)
main(args)

View File

@@ -0,0 +1,148 @@
import argparse
import ast
import json
import os
import re
import time
import numpy as np
from sglang.lang.api import set_default_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
dump_bench_raw_result,
select_sglang_backend,
)
from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
INVALID = -9999999
def get_one_example(lines, i, include_answer):
ret = "Question: " + lines[i]["question"] + "\nAnswer:"
if include_answer:
ret += " " + lines[i]["answer"]
return ret
def get_few_shot_examples(lines, k):
ret = ""
for i in range(k):
ret += get_one_example(lines, i, True) + "\n\n"
return ret
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
return ast.literal_eval(numbers[-1])
except SyntaxError:
return INVALID
def main(args):
# Select backend
set_default_backend(select_sglang_backend(args))
# Read data
data_path = args.data_path
url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
if not os.path.isfile(data_path):
data_path = download_and_cache_file(url)
lines = list(read_jsonl(data_path))
# Construct prompts
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)
questions = []
labels = []
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
arguments = [{"question": q} for q in questions]
#####################################
######### SGL Program Begin #########
#####################################
import sglang as sgl
@sgl.function
def few_shot_gsm8k(s, question):
s += few_shot_examples + question
s += sgl.gen(
"answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"]
)
#####################################
########## SGL Program End ##########
#####################################
# Run requests
tic = time.perf_counter()
states = few_shot_gsm8k.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.perf_counter() - tic
preds = []
for i in range(len(states)):
preds.append(get_answer_value(states[i]["answer"]))
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
invalid = np.mean(np.array(preds) == INVALID)
# Compute speed
num_output_tokens = sum(
s.get_meta_info("answer")["completion_tokens"] for s in states
)
output_throughput = num_output_tokens / latency
# Print results
print(f"Accuracy: {acc:.3f}")
print(f"Invalid: {invalid:.3f}")
print(f"Latency: {latency:.3f} s")
print(f"Output throughput: {output_throughput:.3f} token/s")
# Dump results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
dump_bench_raw_result(
path=args.raw_result_file,
states=states,
preds=preds,
labels=labels,
)
with open(args.result_file, "a") as fout:
value = {
"task": "gsm8k",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-shots", type=int, default=5)
parser.add_argument("--data-path", type=str, default="test.jsonl")
parser.add_argument("--num-questions", type=int, default=200)
args = add_common_sglang_args_and_parse(parser)
main(args)

View File

@@ -0,0 +1,47 @@
## Run benchmark
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_sglang.py --num-questions 200
```
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_other.py --num-questions 200 --backend vllm
```
### Benchmark lightllm
```
# A10G
python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
```
```
python3 bench_other.py --num-questions 200 --backend lightllm
```
### Benchmark guidance
```
CUDA_VISIBLE_DEVICES=0,1 python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
### Benchmark lmql
```
lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000
```
```
python3 bench_other.py --num-questions 200 --backend lmql --port 23000 --parallel 1
```

View File

@@ -0,0 +1,118 @@
import argparse
import asyncio
import json
import time
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_select
from sglang.utils import download_and_cache_file, read_jsonl
def get_one_example(lines, i, include_answer):
ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
if include_answer:
ret += lines[i]["endings"][lines[i]["label"]]
return ret
def get_few_shot_examples(lines, k):
ret = ""
for i in range(k):
ret += get_one_example(lines, i, True) + "\n\n"
return ret
def main(args):
# Select backend
call_select = get_call_select(args)
# Read data
url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
filename = download_and_cache_file(url)
lines = list(read_jsonl(filename))
# Construct prompts
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)
questions = []
choices = []
labels = []
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
choices.append(lines[i]["endings"])
labels.append(lines[i]["label"])
preds = [None] * len(labels)
# Run requests
if args.backend != "lmql":
# Use thread pool
def get_one_answer(i):
preds[i] = call_select(
context=few_shot_examples + questions[i], choices=choices[i]
)
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
list(
tqdm(
executor.map(get_one_answer, list(range(len(questions)))),
total=len(questions),
)
)
else:
# Use asyncio
async def batched_call(batch_size):
for i in range(0, len(questions), batch_size):
tasks = []
for q, c in zip(
questions[i : i + batch_size], choices[i : i + batch_size]
):
tasks.append(call_select(context=few_shot_examples + q, choices=c))
rets = await asyncio.gather(*tasks)
for j in range(len(rets)):
preds[i + j] = rets[j]
tic = time.perf_counter()
asyncio.run(batched_call(batch_size=args.parallel))
latency = time.perf_counter() - tic
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
print(f"Latency: {latency:.3f}")
print(f"Accuracy: {acc:.3f}")
# Write results
with open(args.result_file, "a") as fout:
value = {
"task": "hellaswag",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-shots", type=int, default=20)
parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
parser.add_argument("--num-questions", type=int, default=200)
args = add_common_other_args_and_parse(parser)
main(args)

View File

@@ -0,0 +1,109 @@
import argparse
import json
import os
import time
import numpy as np
from sglang.lang.api import set_default_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import download_and_cache_file, read_jsonl
def get_one_example(lines, i, include_answer):
ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
if include_answer:
ret += lines[i]["endings"][lines[i]["label"]]
return ret
def get_few_shot_examples(lines, k):
ret = ""
for i in range(k):
ret += get_one_example(lines, i, True) + "\n\n"
return ret
def main(args):
# Select backend
set_default_backend(select_sglang_backend(args))
# Read data
data_path = args.data_path
url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
if not os.path.isfile(data_path):
data_path = download_and_cache_file(url)
lines = list(read_jsonl(data_path))
# Construct prompts
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)
questions = []
choices = []
labels = []
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
choices.append(lines[i]["endings"])
labels.append(lines[i]["label"])
arguments = [{"question": q, "choices": c} for q, c in zip(questions, choices)]
#####################################
######### SGL Program Begin #########
#####################################
import sglang as sgl
@sgl.function
def few_shot_hellaswag(s, question, choices):
s += few_shot_examples + question
s += sgl.select("answer", choices=choices)
#####################################
########## SGL Program End ##########
#####################################
# Run requests
tic = time.perf_counter()
rets = few_shot_hellaswag.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
latency = time.perf_counter() - tic
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
print(f"Latency: {latency:.3f}")
print(f"Accuracy: {acc:.3f}")
# Write results
with open(args.result_file, "a") as fout:
value = {
"task": "hellaswag",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-shots", type=int, default=20)
parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
parser.add_argument("--num-questions", type=int, default=200)
args = add_common_sglang_args_and_parse(parser)
main(args)

59
benchmark/hf3fs/bench.sh Normal file
View File

@@ -0,0 +1,59 @@
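# Micro-benchmark the raw Hf3fsClient batch_read / batch_write path.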
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages:/usr/local/lib/python3.12/dist-packages/torch/lib
python3 benchmark/hf3fs/bench_client.py
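# Micro-benchmark the HiCacheHF3FS storage layer; it reads its config from SGLANG_HICACHE_HF3FS_CONFIG_PATH.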
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages:/usr/local/lib/python3.12/dist-packages/torch/lib
SGLANG_HICACHE_HF3FS_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hf3fs/hf3fs.json \
python3 benchmark/hf3fs/bench_storage.py
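# Write an hf3fs config, then micro-benchmark the cache controller's page backup / prefetch transfer paths.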
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages:/usr/local/lib/python3.12/dist-packages/torch/lib
export SGLANG_HICACHE_HF3FS_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hf3fs/hf3fs.json
echo '{"file_path_prefix": "/data/hf3fs-test-0", "file_size": 1099511627776, "numjobs": 16, "entries": 8}' > \
${SGLANG_HICACHE_HF3FS_CONFIG_PATH}
python3 benchmark/hf3fs/bench_zerocopy.py
####################################################################################################
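# Single-node Qwen3-32B server with the hierarchical cache backed by hf3fs, plus the multi-turn client benchmark.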
rm -rf nohup.out && \
nohup python3 -m sglang.launch_server \
--model-path /code/models/Qwen3-32B/ \
--host 0.0.0.0 --port 33301 \
--page-size 64 \
--enable-hierarchical-cache \
--hicache-ratio 2 --hicache-size 0 \
--hicache-write-policy write_through \
--hicache-storage-backend hf3fs &
rm -rf bench_multiturn.out && \
nohup python3 benchmark/hicache/bench_multiturn.py \
--model-path /code/models/Qwen3-32B \
--dataset-path /code/models/ShareGPT_V3_unfiltered_cleaned_split.json \
--port 33301 \
--request-length 2048 --num-clients 512 --num-rounds 3 --max-parallel 8 \
> bench_multiturn.out &
####################################################################################################
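# Two-node (tp=16) DeepSeek-R1 server with the hierarchical cache backed by hf3fs, plus the multi-turn client benchmark.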
rm -rf nohup.out && \
nohup python3 -m sglang.launch_server \
--model-path /code/models/DeepSeek-R1/ \
--tp 16 --nnodes 2 --node-rank 0 \
--dist-init-addr 10.74.249.153:5000 \
--host 0.0.0.0 --port 33301 \
--page-size 64 \
--enable-hierarchical-cache \
--hicache-ratio 2 --hicache-size 60 \
--hicache-write-policy write_through \
--hicache-storage-backend hf3fs &
rm -rf bench_multiturn.out && \
nohup python3 benchmark/hicache/bench_multiturn.py \
--model-path /code/models/Qwen3-32B \
--dataset-path /code/models/ShareGPT_V3_unfiltered_cleaned_split.json \
--port 33301 \
--request-length 2048 --num-clients 1024 --num-rounds 3 --max-parallel 8 \
> bench_multiturn.out &
####################################################################################################
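# Clean up: kill any leftover server and benchmark processes.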
ps aux | grep "sglang.launch_server" | grep -v grep | awk '{print $2}' | xargs kill -9
ps aux | grep "bench_multiturn.py" | grep -v grep | awk '{print $2}' | xargs kill -9

View File

@@ -0,0 +1,162 @@
import concurrent.futures
import logging
import random
import time
from typing import List
import torch
from tqdm import tqdm
from sglang.srt.mem_cache.storage.hf3fs.client_hf3fs import Hf3fsClient
def print_stats(x: List[float]):
x = sorted(x)
lenx = len(x)
print(
f"mean = {sum(x)/len(x):.2f}, "
f"min = {min(x):.2f}, "
f"p25 = {x[int(lenx*0.25)]:.2f}, "
f"p50 = {x[int(lenx*0.5)]:.2f}, "
f"p75 = {x[int(lenx*0.75)]:.2f}, "
f"max = {max(x):.2f}"
)
def test():
# /path/to/hf3fs
file_path = "/data/bench.bin"
file_size = 1 << 40
bytes_per_page = 16 << 20
entries = 32
file_ops = Hf3fsClient(file_path, file_size, bytes_per_page, entries)
print("test batch_read / batch_write")
num_pages = 128
dtype = torch.bfloat16
numel = bytes_per_page // dtype.itemsize
offsets = list(range(file_size // bytes_per_page))
random.shuffle(offsets)
offsets = offsets[:num_pages]
offsets = [i * bytes_per_page for i in offsets]
tensor_writes = [
torch.randn(numel, dtype=dtype)
for _ in tqdm(range(num_pages), desc="prepare tensor")
]
for i in tqdm(range(0, num_pages, file_ops.entries), desc="batch_write"):
results = file_ops.batch_write(
offsets[i : i + file_ops.entries], tensor_writes[i : i + file_ops.entries]
)
assert all([result == numel * dtype.itemsize for result in results])
tensor_reads = [
torch.empty(numel, dtype=dtype)
for _ in tqdm(range(num_pages), desc="prepare tensor")
]
for i in tqdm(range(0, num_pages, file_ops.entries), desc="batch_read"):
results = file_ops.batch_read(
offsets[i : i + file_ops.entries], tensor_reads[i : i + file_ops.entries]
)
assert all([result == numel * dtype.itemsize for result in results])
assert all([torch.allclose(r, w) for r, w in zip(tensor_reads, tensor_writes)])
file_ops.close()
print("test done")
def bench():
file_path = "/data/bench.bin"
file_size = 1 << 40
bytes_per_page = 16 << 20
entries = 8
numjobs = 16
dtype = torch.bfloat16
numel = bytes_per_page // dtype.itemsize
file_ops = [
Hf3fsClient(file_path, file_size, bytes_per_page, entries)
for _ in range(numjobs)
]
num_page = entries
offsets = list(range(file_size // bytes_per_page))
tensors_write = [torch.randn(numel, dtype=dtype)] * num_page
tensors_read = [torch.empty(numel, dtype=dtype)] * num_page
random.shuffle(offsets)
warmup = 50
iteration = 100
executor = concurrent.futures.ThreadPoolExecutor(max_workers=numjobs)
w_bw = []
w_size = num_page * numjobs * bytes_per_page / (1 << 30)
for i in tqdm(range(warmup + iteration), desc="Benchmarking write (GB/s)"):
_offsets = [
[
offset * bytes_per_page
for offset in offsets[
(i * numjobs + j) * num_page : (i * numjobs + j + 1) * num_page
]
]
for j in range(numjobs)
]
tik = time.perf_counter()
futures = [
executor.submit(file_ops[j].batch_write, offset, tensors_write)
for j, offset in enumerate(_offsets)
]
results = [future.result() for future in futures]
tok = time.perf_counter()
if i < warmup:
continue
w_bw.append(w_size / (tok - tik))
results = [
_result == bytes_per_page for result in results for _result in result
]
assert all(results)
print_stats(w_bw)
r_bw = []
r_size = w_size
for i in tqdm(range(warmup + iteration), desc="Benchmarking read (GB/s)"):
_offsets = [
[
offset * bytes_per_page
for offset in offsets[
(i * numjobs + j) * num_page : (i * numjobs + j + 1) * num_page
]
]
for j in range(numjobs)
]
tik = time.perf_counter()
futures = [
executor.submit(file_ops[j].batch_read, offset, tensors_read)
for j, offset in enumerate(_offsets)
]
results = [future.result() for future in futures]
tok = time.perf_counter()
if i < warmup:
continue
r_bw.append(r_size / (tok - tik))
results = [
_result == bytes_per_page for result in results for _result in result
]
assert all(results)
print_stats(r_bw)
executor.shutdown(wait=True)
for _file_ops in file_ops:
_file_ops.close()
print("bench done")
def main():
logging.basicConfig(level=logging.INFO)
test()
bench()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,258 @@
import json
import logging
import os
import random
import time
from typing import List
import torch
from tqdm import tqdm
from sglang.srt.mem_cache.storage.hf3fs.mini_3fs_metadata_server import (
Hf3fsLocalMetadataClient,
)
from sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs import HiCacheHF3FS
def print_stats(x: List[float]):
x = sorted(x)
lenx = len(x)
print(
f"mean = {sum(x)/len(x):.2f}, "
f"min = {min(x):.2f}, "
f"p25 = {x[int(lenx*0.25)]:.2f}, "
f"p50 = {x[int(lenx*0.5)]:.2f}, "
f"p75 = {x[int(lenx*0.75)]:.2f}, "
f"max = {max(x):.2f}"
)
def test():
# Qwen3-32B
layer_num = 64
head_num, head_dim = 8, 128
kv_lora_rank, qk_rope_head_dim = 0, 0
store_dtype = torch.bfloat16
tokens_per_page = 64
file_path_prefix = "/data/test"
file_size = 128 << 20
numjobs = 16
bytes_per_page = 16 << 20
entries = 2
dtype = store_dtype
config_path = os.getenv(HiCacheHF3FS.default_env_var)
assert config_path
try:
with open(config_path, "w") as f:
json.dump(
{
"file_path_prefix": file_path_prefix,
"file_size": file_size,
"numjobs": numjobs,
"entries": entries,
},
f,
)
except Exception as e:
raise RuntimeError(f"Failed to dump config to {config_path}: {str(e)}")
hicache_hf3fs = HiCacheHF3FS.from_env_config(bytes_per_page, dtype)
numel = 2 * tokens_per_page * layer_num * head_num * head_dim
assert numel * dtype.itemsize == bytes_per_page
num_pages = 10
tensors = {}
for i in range(num_pages):
k = f"key_{i}"
v = torch.randn((numel,)).to(dtype=dtype)
ok = hicache_hf3fs.set(k, v)
if i < (file_size // bytes_per_page):
assert ok, f"Failed to insert {k}"
else:
assert not ok
tensors[k] = v
assert hicache_hf3fs.get("key_8") is None
assert hicache_hf3fs.get("key_9") is None
start = 0
for i in range(start, start + hicache_hf3fs.num_pages):
k = f"key_{i}"
assert hicache_hf3fs.exists(k)
out = hicache_hf3fs.get(k)
assert out is not None
v = tensors[k]
assert torch.allclose(v, out, atol=1e-3), f"Tensor mismatch for {k}"
assert not hicache_hf3fs.exists("not_exists")
hicache_hf3fs.delete("key_7")
v2 = torch.randn((numel,)).to(dtype=dtype)
assert hicache_hf3fs.set("key_new", v2)
assert torch.allclose(hicache_hf3fs.get("key_new"), v2, atol=1e-3)
hicache_hf3fs.clear()
assert (
len(hicache_hf3fs.metadata_client.rank_metadata.free_pages)
== hicache_hf3fs.metadata_client.rank_metadata.num_pages
)
# batch
num_pages = 10
tensors = {}
keys = []
values = []
for i in range(num_pages):
k = f"key_{i}"
keys.append(k)
v = torch.randn((numel,)).to(dtype=dtype)
values.append(v)
ok = hicache_hf3fs.batch_set(keys, values)
assert not ok
assert hicache_hf3fs.get("key_8") is None
assert hicache_hf3fs.get("key_9") is None
results = hicache_hf3fs.batch_get(keys[: hicache_hf3fs.num_pages])
for result, key, value in zip(
results, keys[: hicache_hf3fs.num_pages], values[: hicache_hf3fs.num_pages]
):
assert torch.allclose(value, result, atol=1e-3), f"Tensor mismatch for {key}"
hicache_hf3fs.close()
os.remove(hicache_hf3fs.file_path)
print("All test cases passed.")
def bench():
# Qwen3-32B
layer_num = 64
head_num, head_dim = 8, 128
kv_lora_rank, qk_rope_head_dim = 0, 0
store_dtype = torch.bfloat16
tokens_per_page = 64
file_path = "/data/test.bin"
file_size = 1 << 40
numjobs = 16
bytes_per_page = 16 << 20
entries = 8
dtype = store_dtype
hicache_hf3fs = HiCacheHF3FS(
rank=0,
file_path=file_path,
file_size=file_size,
numjobs=numjobs,
bytes_per_page=bytes_per_page,
entries=entries,
dtype=dtype,
metadata_client=Hf3fsLocalMetadataClient(),
)
numel = 2 * tokens_per_page * layer_num * head_num * head_dim
assert numel * dtype.itemsize == bytes_per_page
num_page = 128
values = [torch.randn((numel,)).to(dtype=dtype) for _ in tqdm(range(num_page))]
warmup = 50
iteration = 100
w_bw = []
w_size = num_page * bytes_per_page / (1 << 30)
for i in tqdm(range(warmup + iteration), desc="Benchmarking write (GB/s)"):
keys = [f"{j}" for j in range(i * num_page, (i + 1) * num_page)]
tik = time.perf_counter()
ok = hicache_hf3fs.batch_set(keys, values)
tok = time.perf_counter()
if i < warmup:
continue
w_bw.append(w_size / (tok - tik))
assert ok
print_stats(w_bw)
r_bw = []
r_size = num_page * bytes_per_page / (1 << 30)
for i in tqdm(range(warmup + iteration), desc="Benchmarking read (GB/s)"):
keys = random.sample(
list(hicache_hf3fs.metadata_client.rank_metadata.key_to_index.keys()),
num_page,
)
tik = time.perf_counter()
results = hicache_hf3fs.batch_get(keys)
tok = time.perf_counter()
if i < warmup:
continue
r_bw.append(r_size / (tok - tik))
assert all([r is not None for r in results])
print_stats(r_bw)
hicache_hf3fs.close()
def allclose():
# Qwen3-32B
layer_num = 64
head_num, head_dim = 8, 128
kv_lora_rank, qk_rope_head_dim = 0, 0
store_dtype = torch.bfloat16
tokens_per_page = 64
file_path = "/data/test.bin"
file_size = 1 << 40
numjobs = 16
bytes_per_page = 16 << 20
entries = 8
dtype = store_dtype
hicache_hf3fs = HiCacheHF3FS(
rank=0,
file_path=file_path,
file_size=file_size,
numjobs=numjobs,
bytes_per_page=bytes_per_page,
entries=entries,
dtype=dtype,
metadata_client=Hf3fsLocalMetadataClient(),
)
numel = 2 * tokens_per_page * layer_num * head_num * head_dim
assert numel * dtype.itemsize == bytes_per_page
num_page = 128
values = [torch.randn((numel,)).to(dtype=dtype) for _ in tqdm(range(num_page))]
iteration = 100
for i in tqdm(range(iteration), desc="Benchmarking write (GB/s)"):
keys = [f"{j}" for j in range(i * num_page, (i + 1) * num_page)]
ok = hicache_hf3fs.batch_set(keys, values)
assert ok
read_keys, read_results = [], []
for i in tqdm(range(iteration), desc="Benchmarking read (GB/s)"):
keys = random.sample(
list(hicache_hf3fs.metadata_client.rank_metadata.key_to_index.keys()),
num_page,
)
results = hicache_hf3fs.batch_get(keys)
read_keys.extend(keys)
read_results.extend(results)
assert all([r is not None for r in results])
for key, result in tqdm(zip(read_keys, read_results)):
assert torch.allclose(values[int(key) % num_page], result, atol=1e-3)
hicache_hf3fs.close()
def main():
logging.basicConfig(level=logging.INFO)
test()
bench()
allclose()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,140 @@
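# Micro-benchmark for the HiCacheController storage paths: backs up KV-cache pages
# from the host pool to the hf3fs storage backend and transfers prefetched pages back,
# comparing the page_first (zerocopy) and layer_first (generic) host memory layouts.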
import threading
import time
import torch
from tqdm import tqdm
from sglang.srt.distributed import (
get_world_group,
init_distributed_environment,
initialize_model_parallel,
)
from sglang.srt.managers.cache_controller import (
HiCacheController,
PrefetchOperation,
StorageOperation,
)
from sglang.srt.mem_cache.allocator import TokenToKVPoolAllocator
from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool
from sglang.srt.mem_cache.memory_pool_host import MHATokenToKVPoolHost
init_distributed_environment(
world_size=1,
rank=0,
distributed_init_method="tcp://127.0.0.1:23456",
local_rank=0,
backend="gloo",
)
initialize_model_parallel(
tensor_model_parallel_size=1,
pipeline_model_parallel_size=1,
)
group = get_world_group().cpu_group
max_total_num_tokens = 524288
page_size = 64
kv_cache_dtype = torch.bfloat16
layer_num = 64
head_num, head_dim = 8, 128
device = "cuda"
hicache_ratio = 2
hicache_size = 0
hicache_mem_layout = "page_first"
# hicache_mem_layout = "layer_first"
hicache_write_policy = "write_through"
hicache_io_backend = "kernel"
hicache_storage_backend = "hf3fs"
prefetch_threshold = 256
op_size = 1024
op_num = 16
token_to_kv_pool = MHATokenToKVPool(
max_total_num_tokens,
page_size=page_size,
dtype=kv_cache_dtype,
head_num=head_num,
head_dim=head_dim,
layer_num=layer_num,
device=device,
enable_memory_saver=True,
)
token_to_kv_pool_allocator = TokenToKVPoolAllocator(
max_total_num_tokens,
dtype=kv_cache_dtype,
device=device,
kvcache=token_to_kv_pool,
need_sort=False,
)
kv_cache = token_to_kv_pool_allocator.get_kvcache()
token_to_kv_pool_host = MHATokenToKVPoolHost(
kv_cache,
hicache_ratio,
hicache_size,
page_size,
hicache_mem_layout,
)
load_cache_event = threading.Event()
cache_controller = HiCacheController(
token_to_kv_pool_allocator,
token_to_kv_pool_host,
page_size,
group,
load_cache_event=load_cache_event,
write_policy=hicache_write_policy,
io_backend=hicache_io_backend,
storage_backend=hicache_storage_backend,
prefetch_threshold=prefetch_threshold,
)
operations = [
StorageOperation(
torch.tensor(list(range(i, i + op_size))),
list(range(i, i + op_size)),
hash_value=[f"{j}" for j in range(i, i + op_size, page_size)],
)
for i in tqdm(range(0, op_num * op_size, op_size))
]
tik = time.monotonic()
if hicache_mem_layout == "page_first":
for operation in operations:
cache_controller.zerocopy_page_backup(operation, batch_size=128)
elif hicache_mem_layout == "layer_first":
for operation in operations:
cache_controller.generic_page_backup(operation, batch_size=128)
tok = time.monotonic()
print(f"{tok-tik:.6f} s")
operations = [
PrefetchOperation(
f"{i}",
torch.tensor(list(range(i, i + op_size))),
list(range(i, i + op_size)),
f"{i}",
)
for i in tqdm(range(0, op_num * op_size, op_size))
]
for operation in operations:
operation.hash_value = [
f"{j}"
for j in range(
int(operation.last_hash), int(operation.last_hash) + op_size, page_size
)
]
tik = time.monotonic()
if hicache_mem_layout == "page_first":
for operation in operations:
cache_controller.zerocopy_page_transfer(operation, batch_size=128)
elif hicache_mem_layout == "layer_first":
for operation in operations:
cache_controller.generic_page_transfer(operation, batch_size=128)
tok = time.monotonic()
print(f"{tok-tik:.6f} s")

View File

@@ -0,0 +1,91 @@
## Run synthetic multi-turn benchmark
```
# SGLang server with radix cache disabled
python -m sglang.launch_server --model-path Qwen/Qwen2.5-14B-Instruct --port 30000 --disable-radix-cache
# SGLang server with radix cache on and first-come-first-serve policy
python -m sglang.launch_server --model-path Qwen/Qwen2.5-14B-Instruct --port 30000 --schedule-policy fcfs
# The default SGLang server with radix cache on and long-prefix-match policy
python -m sglang.launch_server --model-path Qwen/Qwen2.5-14B-Instruct --port 30000
# SGLang server with hierarchical radix cache enabled
python -m sglang.launch_server --model-path Qwen/Qwen2.5-14B-Instruct --port 30000 --enable-hierarchical-cache
```
```
python bench_multiturn.py --model-path Qwen/Qwen2.5-14B-Instruct
```
Note: The performance gain of hierarchical caching depends on the ratio of reusable tokens to GPU memory capacity. The more tokens there are to reuse, the larger the model, and the more constrained the GPU memory, the greater the benefit you can expect from hierarchical caching.
# Benchmark with more datasets
## Download Dataset
```bash
./download.sh {sharegpt|ultragpt|loogle|nextqa|all}
```
This script automatically downloads the requested dataset to the current working directory.
## Multiturn Benchmark
### Supported Datasets
- sharegpt
- ultrachat
- loogle
### Example Usage:
```bash
python3 bench_serving.py --model mistralai/Mistral-7B-Instruct-v0.3 --backend sglang \
--dataset-path longdep_qa.json --dataset-name loogle --request-rate 10 --num-prompts 10 \
--port 8001 --enable-multiturn --disable-shuffle
```
This uses the `mistralai/Mistral-7B-Instruct-v0.3` model with `sglang` as the backend and
`longdep_qa.json` as the dataset. We send 10 conversations at 10 req/s to port 8001, with
multi-turn chat enabled and without shuffling the order of conversations (i.e. following the
original order in the dataset file).
### Note:
The requests of multiple conversations are sent in a round-robin fashion.
For example, if we have 3 conversations A, B, C with `[2, 3, 4]` rounds respectively,
multi-turn chat sends the requests to the backend in the following order: `[A1, B1, C1, A2, B2, C2, B3, C3, C4]`.
This has implications for cache reuse patterns: the cache reuse distance is largest
under this request pattern, which means a prefix-aware local scheduler in the backend can
yield the most benefit compared to a FIFO scheduler. The sketch below illustrates this interleaving.
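The following is a minimal, hypothetical sketch (not part of `bench_serving.py`) that reproduces the round-robin ordering described above for per-conversation round counts of `[2, 3, 4]`:

```python
from collections import deque


def round_robin_order(rounds):
    """Return the request send order for conversations with the given round counts."""
    # Each queue entry is (conversation index, next round to send, total rounds).
    queue = deque((conv, 1, total) for conv, total in enumerate(rounds))
    order = []
    while queue:
        conv, nxt, total = queue.popleft()
        order.append(f"{chr(ord('A') + conv)}{nxt}")
        if nxt < total:  # re-enqueue conversations that still have rounds left
            queue.append((conv, nxt + 1, total))
    return order


print(round_robin_order([2, 3, 4]))
# ['A1', 'B1', 'C1', 'A2', 'B2', 'C2', 'B3', 'C3', 'C4']
```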
## Shared Prefix Benchmark
### Supported Datasets
- loogle
### Example Usage:
```bash
python3 bench_serving.py --model mistralai/Mistral-7B-Instruct-v0.3 --backend sglang \
--dataset-path longdep_qa.json --dataset-name loogle --request-rate 10 --num-prompts 10 \
--port 8001 --enable-shared-prefix --disable-shuffle
```
### Note:
Shared Prefix benchmark sends the questions for the same prefix together. For example,
if we have 3 shared prefixes A, B, C, which have `[2, 3, 4]` questions respectively,
the shared prefix benchmark sends the requests to the
backend in the following order: `[A+Q1, A+Q2, B+Q1, B+Q2, B+Q3, C+Q1, C+Q2, C+Q3, C+Q4]` (see the sketch below).
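A similarly hypothetical sketch (again, not part of `bench_serving.py`) produces the grouped ordering for question counts of `[2, 3, 4]`:

```python
def shared_prefix_order(question_counts):
    """Return the send order when all questions of a shared prefix are grouped together."""
    order = []
    for conv, total in enumerate(question_counts):
        prefix = chr(ord("A") + conv)
        order.extend(f"{prefix}+Q{q}" for q in range(1, total + 1))
    return order


print(shared_prefix_order([2, 3, 4]))
# ['A+Q1', 'A+Q2', 'B+Q1', 'B+Q2', 'B+Q3', 'C+Q1', 'C+Q2', 'C+Q3', 'C+Q4']
```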
## Multi Modality Benchmark (WIP)
### Supported Datasets:
- nextqa
### Example Usage:
```bash
# Server:
python3 -m sglang.launch_server --model-path lmms-lab/LLaVA-NeXT-Video-7B --tp 2 --dp 1 --port 8001 \
--host 0.0.0.0 --mem-fraction-static 0.9 --tokenizer-path llava-hf/llava-1.5-7b-hf \
--json-model-override-args "{\"architectures\": [\"LlavaVidForCausalLM\"], \"model_type\":\"llava\", \"mm_spatial_pool_stride\":2}"

# Client:
python3 bench_serving.py --model lmms-lab/LLaVA-NeXT-Video-7B --backend sglang --dataset-path \
NExTVideo --dataset-name nextqa --request-rate 10 --num-prompts 1 --disable-shuffle --port 8001 \
--enable-multiturn --max-frames 16 --tokenizer llava-hf/llava-1.5-7b-hf --fixed-output-len 2048
```
Note: for the server args, `tokenizer-path` and the architecture override in `--json-model-override-args` are required.
## Supported Backends
- sglang (oai)
- vllm (oai)
- lmdeploy (oai)

View File

@@ -0,0 +1,101 @@
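# Single-round context workload benchmark: each request concatenates a shared context
# with a question from the dataset. The __main__ driver sweeps several request rates
# and flushes the server cache between runs.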
import json
import queue
import time
import requests
from bench_multiturn import (
ReadyQueue,
WorkloadGenerator,
gen_payload,
log_to_jsonl_file,
parse_args,
)
from tqdm.asyncio import tqdm
from sglang.bench_serving import get_tokenizer
class ContextWorkloadGenerator(WorkloadGenerator):
def __init__(self, args):
# Construct the base URL for requests
self.baseurl = f"http://{args.host}:{args.port}/"
self.url = self.baseurl + "generate"
self.tokenizer = get_tokenizer(args.model_path)
self.distribution = args.distribution
self.request_rate = args.request_rate
self.start_time = None
self.finished_time = None
self.sent_requests = 0
self.completed_requests = 0
self.dataset = json.load(open(args.dataset_path))
num_requests = min(args.num_clients, len(self.dataset["queries"]))
init_requests = []
for i in range(num_requests):
context_id = self.dataset["queries"][i]["context"]
init_requests.append(
(
i,
gen_payload(
self.dataset["contexts"][context_id]
+ self.dataset["queries"][i]["question"],
len(
self.tokenizer(
self.dataset["queries"][i]["reference_answer"]
)["input_ids"]
),
),
)
)
self.ready_queue = ReadyQueue(init_requests=init_requests)
self.response_queue = queue.Queue()
self.pbar = tqdm(total=num_requests)
self.performance_metrics = {
"ttft": [],
"latency": [],
"itl": [],
"prompt_len": [],
"cached_tokens": [],
"generated_len": [],
}
self.max_parallel = args.max_parallel
self.logfile = args.log_file
def response_handler(self):
while True:
try:
client_id, response = self.response_queue.get(
timeout=10
) # Block until response is available
if not response.success:
raise ValueError(f"Request failed with error: {response.error}")
self.performance_metrics["ttft"].append(response.ttft)
self.performance_metrics["itl"].extend(response.itl)
self.performance_metrics["latency"].append(response.latency)
self.performance_metrics["prompt_len"].append(response.prompt_len)
self.performance_metrics["cached_tokens"].append(response.cached_tokens)
self.performance_metrics["generated_len"].append(response.generated_len)
self.completed_requests += 1
except queue.Empty:
if self.pbar.n == self.pbar.total:
break
if __name__ == "__main__":
args = parse_args()
args.num_rounds = 1
args.max_parallel = 24
flush_cache_url = f"http://{args.host}:{args.port}/flush_cache"
for request_rate in [24, 16, 12, 8, 4, 2, 1]:
args.request_rate = request_rate
requests.post(flush_cache_url)
time.sleep(1)
performance_data = ContextWorkloadGenerator(args).run()
log_to_jsonl_file(performance_data, args.log_file, args.tag)

Some files were not shown because too many files have changed in this diff.